r600g: compute support for evergreen

Tom Stellard:
  - Updated for gallium interface changes
  - Fixed a few bugs:
    + Set the loop counter
    + Calculate the correct number of pipes
  - Added hooks into the LLVM compiler
This commit is contained in:
Adam Rak 2011-11-30 22:20:41 +01:00 committed by Tom Stellard
parent 46a13b3b11
commit 6a829a1b72
21 changed files with 2680 additions and 13 deletions

View file

@ -1993,13 +1993,18 @@ if test "x$with_gallium_drivers" != x; then
PKG_CHECK_MODULES([RADEON], [libdrm_radeon >= $LIBDRM_RADEON_REQUIRED])
gallium_require_drm_loader
GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS r600"
if test "x$enable_r600_llvm" = xyes; then
if test "x$enable_r600_llvm" = xyes -o "x$enable_opencl" = xyes; then
if test "x$LLVM_VERSION" != "x3.1"; then
AC_MSG_ERROR([LLVM 3.1 is required for the r600 llvm compiler.])
fi
NEED_RADEON_GALLIUM=yes;
fi
if test "x$enable_r600_llvm" = xyes; then
USE_R600_LLVM_COMPILER=yes;
fi
if test "x$enable_opencl" = xyes -a "x$with_llvm_shared_libs" = xno; then
LLVM_LIBS="${LLVM_LIBS} `llvm-config --libs bitreader asmparser`"
fi
gallium_check_st "radeon/drm" "dri-r600" "xorg-r600" "" "xvmc-r600" "vdpau-r600" "va-r600"
;;
xradeonsi)

View file

@ -18,7 +18,7 @@ AM_CFLAGS = \
libr600_a_SOURCES = \
$(C_SOURCES)
if USE_R600_LLVM_COMPILER
if NEED_RADEON_GALLIUM
# This is a hack until we can move the backend into the LLVM project.
# We need to use mklib, because it splits up libradeon.a into object files
@ -26,18 +26,28 @@ if USE_R600_LLVM_COMPILER
libr600_a_AR = $(top_srcdir)/bin/mklib -o r600 -static
libr600_a_SOURCES += \
$(LLVM_C_SOURCES)
$(LLVM_C_SOURCES) \
$(LLVM_CXX_SOURCES)
libr600_a_LIBADD = \
$(top_builddir)/src/gallium/drivers/radeon/libradeon.a
AM_CFLAGS += \
$(LLVM_CFLAGS) \
-I$(top_srcdir)/src/gallium/drivers/radeon/ \
-DR600_USE_LLVM
-I$(top_srcdir)/src/gallium/drivers/radeon/
AM_CXXFLAGS= \
$(LLVM_CXXFLAGS)
else
libr600_a_AR = $(AR) $(ARFLAGS)
endif
if USE_R600_LLVM_COMPILER
AM_CFLAGS += \
-DR600_USE_LLVM
endif
if HAVE_GALLIUM_COMPUTE
AM_CFLAGS += \
-DHAVE_OPENCL
endif

View file

@ -14,6 +14,10 @@ C_SOURCES = \
evergreen_state.c \
eg_asm.c \
r600_translate.c \
r600_state_common.c
r600_state_common.c \
evergreen_compute.c \
evergreen_compute_internal.c \
compute_memory_pool.c
LLVM_C_SOURCES = r600_llvm.c
LLVM_CXX_SOURCES = llvm_wrapper.cpp

View file

@ -0,0 +1,397 @@
/*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* Authors:
* Adam Rak <adam.rak@streamnovation.com>
*/
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>

#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/u_double_list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "r600.h"
#include "r600_resource.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "compute_memory_pool.h"
#include "evergreen_compute_internal.h"
/**
 * Creates a new memory pool of initial_size_in_dw dwords, backed by a
 * VRAM buffer object plus a host shadow copy used for grow/defrag.
 *
 * Returns NULL on host allocation failure (the original code did not
 * check the CALLOC results and would crash on OOM).
 */
struct compute_memory_pool* compute_memory_pool_new(
	int64_t initial_size_in_dw,
	struct r600_screen * rscreen)
{
	struct compute_memory_pool* pool = (struct compute_memory_pool*)
				CALLOC(sizeof(struct compute_memory_pool), 1);
	if (!pool)
		return NULL;

	pool->next_id = 1;
	pool->size_in_dw = initial_size_in_dw;
	pool->screen = rscreen;
	pool->bo = (struct r600_resource*)r600_compute_buffer_alloc_vram(
						pool->screen, pool->size_in_dw*4);

	/* Host copy of the pool contents (one dword = 4 bytes). */
	pool->shadow = (uint32_t*)CALLOC(4, pool->size_in_dw);
	if (!pool->shadow) {
		pool->screen->screen.resource_destroy(
			(struct pipe_screen *)pool->screen,
			(struct pipe_resource *)pool->bo);
		free(pool);
		return NULL;
	}

	return pool;
}
/**
* Frees all stuff in the pool and the pool struct itself too
*/
void compute_memory_pool_delete(struct compute_memory_pool* pool)
{
free(pool->shadow);
pool->screen->screen.resource_destroy((struct pipe_screen *)
pool->screen, (struct pipe_resource *)pool->bo);
free(pool);
}
/**
 * Scans the (offset-ordered) item list for a gap large enough to hold
 * size_in_dw dwords.  Returns the gap's start offset in dwords, or -1
 * if the pool currently has no room.
 */
int64_t compute_memory_prealloc_chunk(
	struct compute_memory_pool* pool,
	int64_t size_in_dw)
{
	struct compute_memory_item *cur;
	int last_end = 0;

	assert(size_in_dw <= pool->size_in_dw);

	for (cur = pool->item_list; cur; cur = cur->next) {
		/* Pending items (start_in_dw == -1) have no position yet. */
		if (cur->start_in_dw <= -1)
			continue;

		if (cur->start_in_dw - last_end > size_in_dw)
			return last_end;

		/* Step past this item, rounding up to the next
		 * 1024-dword boundary. */
		last_end = cur->start_in_dw + cur->size_in_dw;
		last_end += 1024 - last_end % 1024;
	}

	/* Tail of the pool. */
	return (pool->size_in_dw - last_end < size_in_dw) ? -1 : last_end;
}
/**
 * Finds the already-placed item after which a chunk starting at
 * start_in_dw must be linked so that the list stays ordered by offset.
 */
struct compute_memory_item* compute_memory_postalloc_chunk(
	struct compute_memory_pool* pool,
	int64_t start_in_dw)
{
	struct compute_memory_item* cur = pool->item_list;

	while (cur) {
		if (!cur->next) {
			/* End of chain: the new chunk goes last. */
			assert(cur->start_in_dw < start_in_dw);
			return cur;
		}
		if (cur->start_in_dw < start_in_dw &&
		    cur->next->start_in_dw > start_in_dw)
			return cur;
		cur = cur->next;
	}

	assert(0 && "unreachable");
	return NULL;
}
/**
 * Grows the pool to at least new_size_in_dw dwords (rounded up to a
 * 1024-dword multiple), preserving its contents via the host shadow.
 */
void compute_memory_grow_pool(struct compute_memory_pool* pool,
	struct pipe_context * pipe, int new_size_in_dw)
{
	uint32_t *new_shadow;

	assert(new_size_in_dw >= pool->size_in_dw);

	new_size_in_dw += 1024 - (new_size_in_dw % 1024);

	/* Save the device contents before the BO is destroyed. */
	compute_memory_shadow(pool, pipe, 1);

	/* Do not assign realloc's result to pool->shadow directly: on
	 * failure that would leak the old block and lose the pointer. */
	new_shadow = (uint32_t*)realloc(pool->shadow, new_size_in_dw*4);
	if (!new_shadow) {
		fprintf(stderr, "compute_memory_grow_pool: out of memory\n");
		return; /* pool keeps its old size and contents */
	}
	pool->shadow = new_shadow;
	pool->size_in_dw = new_size_in_dw;

	/* Replace the BO with a bigger one and re-upload the contents. */
	pool->screen->screen.resource_destroy(
			(struct pipe_screen *)pool->screen,
			(struct pipe_resource *)pool->bo);
	pool->bo = r600_compute_buffer_alloc_vram(pool->screen,
						pool->size_in_dw*4);
	compute_memory_shadow(pool, pipe, 0);
}
/**
* Copy pool from device to host, or host to device.
*/
void compute_memory_shadow(struct compute_memory_pool* pool,
struct pipe_context * pipe, int device_to_host)
{
struct compute_memory_item chunk;
chunk.id = 0;
chunk.start_in_dw = 0;
chunk.size_in_dw = pool->size_in_dw;
chunk.prev = chunk.next = NULL;
compute_memory_transfer(pool, pipe, device_to_host, &chunk,
pool->shadow, 0, pool->size_in_dw*4);
}
/**
 * Gives every pending item (start_in_dw == -1) a real offset inside the
 * pool, growing the pool when there is not enough room.  Must run before
 * a dispatch that uses the pool.
 */
void compute_memory_finalize_pending(struct compute_memory_pool* pool,
	struct pipe_context * pipe)
{
	struct compute_memory_item *pending_list = NULL, *end_p = NULL;
	struct compute_memory_item *item, *next;

	int64_t allocated = 0;
	int64_t unallocated = 0;

	/* NOTE(review): "%i" does not match int64_t start_in_dw; should be
	 * PRIi64 — debug output only, but confirm COMPUTE_DBG is printf-like. */
	for (item = pool->item_list; item; item = item->next) {
		COMPUTE_DBG("list: %i %p\n", item->start_in_dw, item->next);
	}

	/* Pass 1: move every pending item from item_list onto pending_list,
	 * summing how much space is already placed (allocated) and how much
	 * the pending items will need, each padded by 1024 dwords for the
	 * alignment applied in compute_memory_prealloc_chunk(). */
	for (item = pool->item_list; item; item = next) {
		next = item->next;

		if (item->start_in_dw == -1) {
			/* Append to pending_list... */
			if (end_p) {
				end_p->next = item;
			}
			else {
				pending_list = item;
			}

			/* ...and unlink from item_list. */
			if (item->prev) {
				item->prev->next = next;
			}
			else {
				pool->item_list = next;
			}

			if (next) {
				next->prev = item->prev;
			}

			item->prev = end_p;
			item->next = NULL;
			end_p = item;

			unallocated += item->size_in_dw+1024;
		}
		else {
			allocated += item->size_in_dw;
		}
	}

	/* Make sure the pool can hold everything at once. */
	if (pool->size_in_dw < allocated+unallocated) {
		compute_memory_grow_pool(pool, pipe, allocated+unallocated);
	}

	/* Pass 2: place each pending item, growing the pool until a large
	 * enough gap exists, then relink it into item_list in offset order. */
	for (item = pending_list; item; item = next) {
		next = item->next;

		int64_t start_in_dw;

		while ((start_in_dw=compute_memory_prealloc_chunk(pool,
						item->size_in_dw)) == -1) {
			/* Estimate how much more space is required, with
			 * 2048 dwords of slack, rounded up to 1024. */
			int64_t need = item->size_in_dw+2048 -
						(pool->size_in_dw - allocated);

			need += 1024 - (need % 1024);

			if (need > 0) {
				compute_memory_grow_pool(pool,
						pipe,
						pool->size_in_dw + need);
			}
			else {
				/* Fallback: grow by ~10% of the pool. */
				need = pool->size_in_dw / 10;
				need += 1024 - (need % 1024);
				compute_memory_grow_pool(pool,
						pipe,
						pool->size_in_dw + need);
			}
		}

		item->start_in_dw = start_in_dw;
		item->next = NULL;
		item->prev = NULL;

		if (pool->item_list) {
			struct compute_memory_item *pos;

			/* Insert after the item that precedes start_in_dw. */
			pos = compute_memory_postalloc_chunk(pool, start_in_dw);
			item->prev = pos;
			item->next = pos->next;
			pos->next = item;

			if (item->next) {
				item->next->prev = item;
			}
		}
		else {
			pool->item_list = item;
		}

		allocated += item->size_in_dw;
	}
}
/**
 * Unlinks the item with the given id from the pool's item list and frees
 * it.  Asserts if the id is not present.
 */
void compute_memory_free(struct compute_memory_pool* pool, int64_t id)
{
	struct compute_memory_item *item, *next;

	for (item = pool->item_list; item; item = next) {
		next = item->next;

		if (item->id == id) {
			/* Unlink from the doubly linked list. */
			if (item->prev) {
				item->prev->next = item->next;
			}
			else {
				pool->item_list = item->next;
			}

			if (item->next) {
				item->next->prev = item->prev;
			}

			free(item);
			return;
		}
	}

	/* "%ld" is wrong for int64_t on 32-bit targets; PRId64 is portable. */
	fprintf(stderr, "Internal error, invalid id %" PRId64 " "
		"for compute_memory_free\n", id);

	assert(0 && "error");
}
/**
 * Creates a pending allocation of size_in_dw dwords: the returned item
 * has start_in_dw == -1 and only gets a real offset when
 * compute_memory_finalize_pending() runs.  Returns NULL on OOM.
 */
struct compute_memory_item* compute_memory_alloc(
	struct compute_memory_pool* pool,
	int64_t size_in_dw)
{
	struct compute_memory_item *new_item;
	struct compute_memory_item *last_item;

	/* PRIi64 instead of the original "%i", which mismatches int64_t. */
	COMPUTE_DBG("Alloc: %" PRIi64 "\n", size_in_dw);

	new_item = (struct compute_memory_item *)
				CALLOC(sizeof(struct compute_memory_item), 1);
	if (!new_item)
		return NULL;

	new_item->size_in_dw = size_in_dw;
	new_item->start_in_dw = -1; /* mark pending */
	new_item->id = pool->next_id++;
	new_item->pool = pool;

	/* Append at the tail of the item list. */
	if (pool->item_list) {
		for (last_item = pool->item_list; last_item->next;
					last_item = last_item->next);
		last_item->next = new_item;
		new_item->prev = last_item;
	}
	else {
		pool->item_list = new_item;
	}

	return new_item;
}
/**
 * Transfers data between the host and the pool buffer object.
 * offset_in_chunk and size are in BYTES.
 *
 * \param device_to_host 1 = read from the device into data,
 *                       0 = write data to the device
 */
void compute_memory_transfer(
	struct compute_memory_pool* pool,
	struct pipe_context * pipe,
	int device_to_host,
	struct compute_memory_item* chunk,
	void* data,
	int offset_in_chunk,
	int size)
{
	int64_t aligned_size = pool->size_in_dw;
	struct pipe_resource* gart = (struct pipe_resource*)pool->bo;
	int64_t internal_offset = chunk->start_in_dw*4 + offset_in_chunk;

	struct pipe_transfer *xfer;
	uint32_t *map;
	char *ptr;

	xfer = pipe->get_transfer(pipe, gart, 0,
		device_to_host ? PIPE_TRANSFER_READ : PIPE_TRANSFER_WRITE,
		&(struct pipe_box) { .width = aligned_size,
		.height = 1, .depth = 1 });
	assert(xfer);

	map = pipe->transfer_map(pipe, xfer);
	assert(map);

	/* Bug fix: internal_offset is in bytes, but the original added it
	 * to the uint32_t *map, scaling the offset by 4.  Byte arithmetic
	 * needs a char pointer.  (Only chunk offset 0 was used so far.) */
	ptr = (char *)map + internal_offset;

	if (device_to_host) {
		memcpy(data, ptr, size);
	} else {
		memcpy(ptr, data, size);
	}

	pipe->transfer_unmap(pipe, xfer);
	pipe->transfer_destroy(pipe, xfer);
}
/**
 * Transfers data between a pool chunk and another r600 resource; meant
 * for VRAM<->GART copies.  Offsets and size are in bytes.
 *
 * NOTE(review): unimplemented — callers currently get a silent no-op.
 */
void compute_memory_transfer_direct(
	struct compute_memory_pool* pool,
	int chunk_to_data,
	struct compute_memory_item* chunk,
	struct r600_resource* data,
	int offset_in_chunk,
	int offset_in_data,
	int size)
{
	///TODO: DMA
}

View file

@ -0,0 +1,98 @@
/*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* Authors:
* Adam Rak <adam.rak@streamnovation.com>
*/
#ifndef COMPUTE_MEMORY_POOL
#define COMPUTE_MEMORY_POOL

/* int64_t / uint32_t are used below; the header previously relied on its
 * includers for <stdint.h>. */
#include <stdint.h>
#include <stdlib.h>

struct compute_memory_pool;

/** One chunk inside the pool.  A pending chunk has start_in_dw == -1
 * until compute_memory_finalize_pending() places it. */
struct compute_memory_item
{
	int64_t id; ///ID of the memory chunk

	int untouched; ///True if the memory contains only junk, no need to save it for defrag

	int64_t start_in_dw; ///Start pointer in dwords relative in the pool bo
	int64_t size_in_dw; ///Size of the chunk in dwords

	struct compute_memory_pool* pool;

	struct compute_memory_item* prev;
	struct compute_memory_item* next;
};

struct compute_memory_pool
{
	int64_t next_id; ///For generating unique IDs for memory chunks
	int64_t size_in_dw; ///Size of the pool in dwords

	struct r600_resource *bo; ///The pool buffer object resource
	struct compute_memory_item* item_list; ///Allocated memory chunks in the buffer, they must be ordered by "start_in_dw"
	struct r600_screen *screen;

	uint32_t *shadow; ///host copy of the pool, used for defragmentation
};

struct compute_memory_pool* compute_memory_pool_new(int64_t initial_size_in_dw, struct r600_screen *rscreen); ///Creates a new pool
void compute_memory_pool_delete(struct compute_memory_pool* pool); ///Frees all stuff in the pool and the pool struct itself too

int64_t compute_memory_prealloc_chunk(struct compute_memory_pool* pool, int64_t size_in_dw); ///searches for an empty space in the pool, return with the pointer to the allocatable space in the pool, returns -1 on failure
struct compute_memory_item* compute_memory_postalloc_chunk(struct compute_memory_pool* pool, int64_t start_in_dw); ///search for the chunk where we can link our new chunk after it

/**
 * Reallocates the pool, conserving its data.
 */
void compute_memory_grow_pool(struct compute_memory_pool* pool, struct pipe_context * pipe,
	int new_size_in_dw);

/**
 * Copies the pool from device to host, or host to device.
 */
void compute_memory_shadow(struct compute_memory_pool* pool,
	struct pipe_context * pipe, int device_to_host);

/**
 * Allocates pending allocations in the pool.
 */
void compute_memory_finalize_pending(struct compute_memory_pool* pool,
	struct pipe_context * pipe);
void compute_memory_defrag(struct compute_memory_pool* pool); ///Defragment the memory pool, always heavy memory usage
void compute_memory_free(struct compute_memory_pool* pool, int64_t id);
struct compute_memory_item* compute_memory_alloc(struct compute_memory_pool* pool, int64_t size_in_dw); ///Creates pending allocations

/**
 * Transfers data host<->device; offset and size are in bytes.
 */
void compute_memory_transfer(struct compute_memory_pool* pool,
	struct pipe_context * pipe, int device_to_host,
	struct compute_memory_item* chunk, void* data,
	int offset_in_chunk, int size);

void compute_memory_transfer_direct(struct compute_memory_pool* pool, int chunk_to_data, struct compute_memory_item* chunk, struct r600_resource* data, int offset_in_chunk, int offset_in_data, int size); ///Transfer data between chunk<->data, it is for VRAM<->GART transfers

#endif

View file

@ -0,0 +1,38 @@
/*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* Authors:
* Adam Rak <adam.rak@streamnovation.com>
*/
/* X-macro table of per-shader compute resource groups: each entry is
 * DECL_COMPUTE_RESOURCE(name, max_instances).  Presumably the including
 * file defines DECL_COMPUTE_RESOURCE to generate enums/tables from this
 * list — confirm against evergreen_compute_internal.h. */
DECL_COMPUTE_RESOURCE(CONFIG, 1)
DECL_COMPUTE_RESOURCE(CONST_MEM, 16)
DECL_COMPUTE_RESOURCE(RAT, 12)
DECL_COMPUTE_RESOURCE(VERT, 16)
DECL_COMPUTE_RESOURCE(TEX, 16)
DECL_COMPUTE_RESOURCE(SAMPLER, 18)
DECL_COMPUTE_RESOURCE(LOOP, 32)
DECL_COMPUTE_RESOURCE(LDS, 1)
DECL_COMPUTE_RESOURCE(GDS, 1)
DECL_COMPUTE_RESOURCE(EXPORT, 1)
DECL_COMPUTE_RESOURCE(SHADER, 1)
DECL_COMPUTE_RESOURCE(TMPRING, 4)
DECL_COMPUTE_RESOURCE(DISPATCH, 1)

View file

@ -0,0 +1,814 @@
/*
* Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* Authors:
* Adam Rak <adam.rak@streamnovation.com>
*/
#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/u_double_list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "r600.h"
#include "evergreend.h"
#include "r600_resource.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "r600_hw_context_priv.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#ifdef HAVE_OPENCL
#include "llvm_wrapper.h"
#endif
/**
RAT0 is for global binding write
VTX1 is for global binding read
for writing images RAT1...
for reading images TEX2...
TEX2-RAT1 is paired
TEX2... consumes the same fetch resources, that VTX2... would consume
CONST0 and VTX0 is for parameters
CONST0 is binding smaller input parameter buffer, and for constant indexing,
also constant cached
VTX0 is for indirect/non-constant indexing, or if the input is bigger than
the constant cache can handle
RAT-s are limited to 12, so we can only bind at most 11 texture for writing
because we reserve RAT0 for global bindings. With byteaddressing enabled,
we should reserve another one too, leaving at most 10 image bindings for writing.
from Nvidia OpenCL:
CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
so 10 for writing is enough. 176 is the max for reading according to the docs
writable images should be listed first < 10, so their id corresponds to RAT(id+1)
writable images will consume TEX slots, VTX slots too because of linear indexing
*/
/* Resource vtable for PIPE_BIND_GLOBAL buffers that live in the compute
 * memory pool. */
const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle, /* get_handle */
	r600_compute_global_buffer_destroy, /* resource_destroy */
	r600_compute_global_get_transfer, /* get_transfer */
	r600_compute_global_transfer_destroy, /* transfer_destroy */
	r600_compute_global_transfer_map, /* transfer_map */
	r600_compute_global_transfer_flush_region,/* transfer_flush_region */
	r600_compute_global_transfer_unmap, /* transfer_unmap */
	r600_compute_global_transfer_inline_write /* transfer_inline_write */
};
/**
 * Creates a compute shader state object from the program blob in
 * cso->prog.  Returns NULL when the screen does not support compute or
 * on allocation failure.
 */
void *evergreen_create_compute_state(
	struct pipe_context *ctx_,
	const struct pipe_compute_state *cso) /* was "const const" */
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader;

#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header * header;
	const unsigned char * code;

	/* The bitcode follows immediately after the header. */
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
#endif

	if (!ctx->screen->screen.get_param(&ctx->screen->screen,
							PIPE_CAP_COMPUTE)) {
		fprintf(stderr, "Compute is not supported\n");
		return NULL;
	}

	shader = CALLOC_STRUCT(r600_pipe_compute);
	if (!shader)
		return NULL;

	shader->ctx = (struct r600_context*)ctx;
	/* Per-shader table of hardware resource state (see the
	 * DECL_COMPUTE_RESOURCE list). */
	shader->resources = (struct evergreen_compute_resource*)
			CALLOC(sizeof(struct evergreen_compute_resource),
			get_compute_resource_num());
	shader->local_size = cso->req_local_mem; ///TODO: assert it
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

#ifdef HAVE_OPENCL
	shader->mod = llvm_parse_bitcode(code, header->num_bytes);
	r600_compute_shader_create(ctx_, shader->mod, &shader->bc);
#endif
	return shader;
}
/**
 * Frees a state object created by evergreen_create_compute_state().
 *
 * NOTE(review): shader->mod, shader->shader_code_bo and
 * shader->kernel_param are not released here — confirm whether they are
 * owned elsewhere or leaked.
 */
void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
{
	struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;

	free(shader->resources);
	free(shader);
}
/**
 * Binds a compute shader: uploads its bytecode into a fresh VRAM buffer
 * and programs the LS-stage shader registers plus the loop constant.
 */
static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	ctx->cs_shader = (struct r600_pipe_compute *)state;

	/* The bytecode is uploaded only once per shader object. */
	assert(!ctx->cs_shader->shader_code_bo);

	ctx->cs_shader->shader_code_bo =
		r600_compute_buffer_alloc_vram(ctx->screen,
				ctx->cs_shader->bc.ndw * 4);

	void *p = ctx->ws->buffer_map(ctx->cs_shader->shader_code_bo->cs_buf,
				ctx->cs, PIPE_TRANSFER_WRITE);

	memcpy(p, ctx->cs_shader->bc.bytecode, ctx->cs_shader->bc.ndw * 4);

	ctx->ws->buffer_unmap(ctx->cs_shader->shader_code_bo->cs_buf);

	evergreen_compute_init_config(ctx);

	struct evergreen_compute_resource* res = get_empty_res(ctx->cs_shader,
						COMPUTE_RESOURCE_SHADER, 0);

	/* GPR budget for the LS stage, which runs compute shaders here. */
	evergreen_reg_set(res, R_008C0C_SQ_GPR_RESOURCE_MGMT_3,
			S_008C0C_NUM_LS_GPRS(ctx->cs_shader->bc.ngpr));

	///maybe we can use it later
	evergreen_reg_set(res, R_0286C8_SPI_THREAD_GROUPING, 0);
	///maybe we can use it later
	evergreen_reg_set(res, R_008C14_SQ_GLOBAL_GPR_RESOURCE_MGMT_2, 0);

	evergreen_reg_set(res, R_0288D4_SQ_PGM_RESOURCES_LS,
		S_0288D4_NUM_GPRS(ctx->cs_shader->bc.ngpr)
		| S_0288D4_STACK_SIZE(ctx->cs_shader->bc.nstack));
	evergreen_reg_set(res, R_0288D8_SQ_PGM_RESOURCES_LS_2, 0);
	/* Program start is offset 0 within the BO attached below. */
	evergreen_reg_set(res, R_0288D0_SQ_PGM_START_LS, 0);

	/* Attach the bytecode BO so the emit path relocates and syncs it. */
	res->bo = ctx->cs_shader->shader_code_bo;
	res->usage = RADEON_USAGE_READ;
	res->coher_bo_size = ctx->cs_shader->bc.ndw*4;
	res->flags = COMPUTE_RES_SH_FLUSH;

	/* We can't always determine the
	 * number of iterations in a loop before it's executed,
	 * so we just need to set up the loop counter to give us the maximum
	 * number of iterations possible.  Currently, loops in shader code
	 * ignore the loop counter and use a break instruction to exit the
	 * loop at the correct time.
	 */
	evergreen_set_loop_const(ctx->cs_shader,
		0, /* index */
		0xFFF, /* Maximum value of the loop counter (i.e. when the loop
			* counter reaches this value, the program will break
			* out of the loop. */
		0x0, /* Starting value of the loop counter. */
		0x1); /* Amount to increment the loop counter each iteration. */
}
/* The kernel parameters are stored a vtx buffer (ID=0), besides the explicit
* kernel parameters there are implicit parameters that need to be stored
* in the vertex buffer as well. Here is how these parameters are organized in
* the buffer:
*
* DWORDS 0-2: Number of work groups in each dimension (x,y,z)
* DWORDS 3-5: Number of global work items in each dimension (x,y,z)
* DWORDS 6-8: Number of work items within each work group in each dimension
* (x,y,z)
* DWORDS 9+ : Kernel parameters
*/
/**
 * Uploads the kernel arguments plus the implicit grid/block dimensions
 * into the parameter buffer, then binds it as vertex fetch resource 0
 * and constant cache 0.
 */
void evergreen_compute_upload_input(
	struct pipe_context *ctx_,
	const uint *block_layout,
	const uint *grid_layout,
	const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	int i;
	/* 9 implicit dwords (grid, global size, local size) = 36 bytes
	 * precede the explicit kernel parameters. */
	unsigned kernel_parameters_offset_bytes = 36;
	uint32_t * num_work_groups_start;
	uint32_t * global_size_start;
	uint32_t * local_size_start;
	uint32_t * kernel_parameters_start;

	if (ctx->cs_shader->input_size == 0) {
		return;
	}

	if (!ctx->cs_shader->kernel_param) {
		unsigned buffer_size = ctx->cs_shader->input_size;

		/* Add space for the grid dimensions */
		/* NOTE(review): the offset is already in bytes, so the extra
		 * "* sizeof(uint)" over-allocates by 4x — harmless but
		 * worth confirming the intended unit. */
		buffer_size += kernel_parameters_offset_bytes * sizeof(uint);
		ctx->cs_shader->kernel_param =
				r600_compute_buffer_alloc_vram(ctx->screen,
						buffer_size);
	}

	num_work_groups_start = ctx->ws->buffer_map(
		ctx->cs_shader->kernel_param->cs_buf,
		ctx->cs, PIPE_TRANSFER_WRITE);
	/* Each section is 3 uints wide (x, y, z). */
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
	local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);

	/* Copy the work group size */
	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = grid_layout[i] * block_layout[i];
	}
	/* Copy the local dimensions */
	memcpy(local_size_start, block_layout, 3 * sizeof(uint));
	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, input, ctx->cs_shader->input_size);

	/* Debug dump of the whole uploaded buffer. */
	for (i = 0; i < (kernel_parameters_offset_bytes / 4) +
			(ctx->cs_shader->input_size / 4); i++) {
		COMPUTE_DBG("input %i : %i\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->ws->buffer_unmap(ctx->cs_shader->kernel_param->cs_buf);

	///ID=0 is reserved for the parameters
	evergreen_set_vtx_resource(ctx->cs_shader,
		ctx->cs_shader->kernel_param, 0, 0, 0);
	///ID=0 is reserved for parameters
	evergreen_set_const_cache(ctx->cs_shader, 0,
		ctx->cs_shader->kernel_param, ctx->cs_shader->input_size, 0);
}
/**
 * Emits the register state and the DISPATCH_DIRECT packet for a grid
 * launch with the given block (thread group) and grid layouts.
 */
void evergreen_direct_dispatch(
	struct pipe_context *ctx_,
	const uint *block_layout, const uint *grid_layout)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	int group_size = 1;
	int grid_size = 1;
	int k;

	struct evergreen_compute_resource* res = get_empty_res(ctx->cs_shader,
		COMPUTE_RESOURCE_DISPATCH, 0);

	/* Compute dispatches are issued as point lists from (0,0,0). */
	evergreen_reg_set(res, R_008958_VGT_PRIMITIVE_TYPE, V_008958_DI_PT_POINTLIST);

	evergreen_reg_set(res, R_00899C_VGT_COMPUTE_START_X, 0);
	evergreen_reg_set(res, R_0089A0_VGT_COMPUTE_START_Y, 0);
	evergreen_reg_set(res, R_0089A4_VGT_COMPUTE_START_Z, 0);

	evergreen_reg_set(res, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, block_layout[0]);
	evergreen_reg_set(res, R_0286F0_SPI_COMPUTE_NUM_THREAD_Y, block_layout[1]);
	evergreen_reg_set(res, R_0286F4_SPI_COMPUTE_NUM_THREAD_Z, block_layout[2]);

	for (k = 0; k < 3; k++) {
		group_size *= block_layout[k];
		grid_size *= grid_layout[k];
	}

	evergreen_reg_set(res, R_008970_VGT_NUM_INDICES, group_size);
	evergreen_reg_set(res, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE, group_size);

	evergreen_emit_raw_value(res, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	evergreen_emit_raw_value(res, grid_layout[0]);
	evergreen_emit_raw_value(res, grid_layout[1]);
	evergreen_emit_raw_value(res, grid_layout[2]);
	///VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN
	evergreen_emit_raw_value(res, 1);
}
static void compute_emit_cs(struct r600_context *ctx)
{
struct radeon_winsys_cs *cs = ctx->cs;
int i;
r600_emit_atom(ctx, &ctx->start_cs_cmd.atom);
struct r600_resource *onebo = NULL;
for (i = 0; i < get_compute_resource_num(); i++) {
if (ctx->cs_shader->resources[i].enabled) {
int j;
COMPUTE_DBG("resnum: %i, cdw: %i\n", i, cs->cdw);
for (j = 0; j < ctx->cs_shader->resources[i].cs_end; j++) {
if (ctx->cs_shader->resources[i].do_reloc[j]) {
assert(ctx->cs_shader->resources[i].bo);
evergreen_emit_ctx_reloc(ctx,
ctx->cs_shader->resources[i].bo,
ctx->cs_shader->resources[i].usage);
}
cs->buf[cs->cdw++] = ctx->cs_shader->resources[i].cs[j];
}
if (ctx->cs_shader->resources[i].bo) {
onebo = ctx->cs_shader->resources[i].bo;
evergreen_emit_ctx_reloc(ctx,
ctx->cs_shader->resources[i].bo,
ctx->cs_shader->resources[i].usage);
///special case for textures
if (ctx->cs_shader->resources[i].do_reloc
[ctx->cs_shader->resources[i].cs_end] == 2) {
evergreen_emit_ctx_reloc(ctx,
ctx->cs_shader->resources[i].bo,
ctx->cs_shader->resources[i].usage);
}
evergreen_set_buffer_sync(ctx, ctx->cs_shader->resources[i].bo,
ctx->cs_shader->resources[i].coher_bo_size,
ctx->cs_shader->resources[i].flags,
ctx->cs_shader->resources[i].usage);
}
}
}
#if 0
COMPUTE_DBG("cdw: %i\n", cs->cdw);
for (i = 0; i < cs->cdw; i++) {
COMPUTE_DBG("%4i : 0x%08X\n", i, ctx->cs->buf[i]);
}
#endif
ctx->ws->cs_flush(ctx->cs, RADEON_FLUSH_ASYNC);
ctx->pm4_dirty_cdwords = 0;
ctx->flags = 0;
COMPUTE_DBG("shader started\n");
ctx->ws->buffer_wait(onebo->buf, 0);
COMPUTE_DBG("...\n");
r600_emit_atom(ctx, &ctx->start_cs_cmd.atom);
ctx->streamout_start = TRUE;
ctx->streamout_append_bitmask = ~0;
}
/**
 * Top-level grid launch: computes the LDS wave count, uploads kernel
 * inputs, emits the dispatch state and flushes the command stream.
 */
static void evergreen_launch_grid(
	struct pipe_context *ctx_,
	const uint *block_layout, const uint *grid_layout,
	uint32_t pc, const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	COMPUTE_DBG("PC: %i\n", pc);

	/* num_waves = ceil(threads_per_block / (16 * num_pipes)) */
	unsigned pipes = ctx->screen->info.r600_max_pipes;
	unsigned divisor = 16 * pipes;
	unsigned threads = block_layout[0] * block_layout[1] * block_layout[2];
	unsigned waves = (threads + divisor - 1) / divisor;

	COMPUTE_DBG("Using %u pipes, there are %u wavefronts per thread block\n",
							pipes, waves);

	evergreen_set_lds(ctx->cs_shader, 0, 0, waves);
	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
	evergreen_direct_dispatch(ctx_, block_layout, grid_layout);
	compute_emit_cs(ctx);
}
/**
 * Binds global-memory surfaces for the compute shader.  Writable
 * surfaces get RAT i+1 (RAT0 is reserved for the global pool); every
 * surface also gets vertex fetch resource i+2 (VTX0 = parameters,
 * VTX1 = global pool).
 */
static void evergreen_set_compute_resources(struct pipe_context * ctx_,
		unsigned start, unsigned count,
		struct pipe_surface ** surfaces)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_surface **resources = (struct r600_surface **)surfaces;
	unsigned i;

	for (i = 0; i < count; i++) {
		if (!resources[i])
			continue;

		/* The original declared a second, identical "buffer" inside
		 * the writable branch, shadowing this one — removed. */
		struct r600_resource_global *buffer =
			(struct r600_resource_global*)resources[i]->base.texture;

		if (resources[i]->base.writable) {
			/* Only 12 RATs exist and RAT0 is reserved. */
			assert(i+1 < 12);

			evergreen_set_rat(ctx->cs_shader, i+1,
			(struct r600_resource *)resources[i]->base.texture,
			buffer->chunk->start_in_dw*4,
			resources[i]->base.texture->width0);
		}

		evergreen_set_vtx_resource(ctx->cs_shader,
			(struct r600_resource *)resources[i]->base.texture, i+2,
			buffer->chunk->start_in_dw*4, resources[i]->base.writable);
	}
}
/**
 * Binds texture sampler views for the compute shader.  Views start at
 * fetch resource 2: FETCH0 = VTX0 (param buffer), FETCH1 = VTX1 (global
 * buffer pool), FETCH2... = TEX.
 */
static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
		unsigned start_slot, unsigned count,
		struct pipe_sampler_view **views)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_sampler_view **resource =
		(struct r600_pipe_sampler_view **)views;
	int slot;

	for (slot = 0; slot < count; slot++) {
		if (!resource[slot])
			continue;

		assert(slot+1 < 12);
		evergreen_set_tex_resource(ctx->cs_shader, resource[slot],
					   slot+2);
	}
}
/**
 * Binds sampler state objects for the compute shader, one per slot.
 */
static void evergreen_bind_compute_sampler_states(
	struct pipe_context *ctx_,
	unsigned start_slot,
	unsigned num_samplers,
	void **samplers_)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_sampler_state ** samplers =
		(struct compute_sampler_state **)samplers_;
	int slot;

	for (slot = 0; slot < num_samplers; slot++) {
		if (!samplers[slot])
			continue;
		evergreen_set_sampler_resource(ctx->cs_shader,
					       samplers[slot], slot);
	}
}
/**
 * Binds global buffers from the compute memory pool: finalizes pending
 * pool allocations, writes each buffer's byte offset into its handle,
 * and exposes the whole pool via RAT0 (write) and VTX1 (read).
 */
static void evergreen_set_global_binding(
	struct pipe_context *ctx_, unsigned first, unsigned n,
	struct pipe_resource **resources,
	uint32_t **handles)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;
	int k;

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	/* Give every pending pool allocation a real offset first. */
	compute_memory_finalize_pending(pool, ctx_);

	for (k = 0; k < n; k++) {
		assert(resources[k]->target == PIPE_BUFFER);
		assert(resources[k]->bind & PIPE_BIND_GLOBAL);

		*(handles[k]) = buffers[k]->chunk->start_in_dw * 4;
	}

	evergreen_set_rat(ctx->cs_shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	evergreen_set_vtx_resource(ctx->cs_shader, pool->bo, 1, 0, 1);
}
/**
 * Emit the one-time compute configuration state.
 *
 * Programs the SQ thread/stack/GPR management registers and the SPI/VGT/DB
 * state needed to run compute (LS) shaders.
 *
 * Fix: the original per-chip switch assigned identical num_temp_gprs (4)
 * and num_threads (128) in every case and only varied the stack depth;
 * the cases are collapsed accordingly.  A trailing duplicate write of
 * R_0286E8_SPI_COMPUTE_INPUT_CNTL (same value as the earlier write) was
 * also removed.
 */
void evergreen_compute_init_config(struct r600_context *ctx)
{
	struct evergreen_compute_resource* res =
		get_empty_res(ctx->cs_shader, COMPUTE_RESOURCE_CONFIG, 0);

	/* All supported evergreen parts use the same thread/GPR setup. */
	int num_threads = 128;
	int num_temp_gprs = 4;
	int num_stack_entries;
	enum radeon_family family;
	unsigned tmp;

	family = ctx->family;

	/* Only the stack depth differs per chip. */
	switch (family) {
	case CHIP_JUNIPER:
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
	case CHIP_SUMO2:
	case CHIP_BARTS:
		num_stack_entries = 512;
		break;
	case CHIP_CEDAR:
	case CHIP_REDWOOD:
	case CHIP_PALM:
	case CHIP_SUMO:
	case CHIP_TURKS:
	case CHIP_CAICOS:
	default:
		num_stack_entries = 256;
		break;
	}

	/* The vertex cache is not present on all parts. */
	tmp = 0x00000000;
	switch (family) {
	case CHIP_CEDAR:
	case CHIP_PALM:
	case CHIP_SUMO:
	case CHIP_SUMO2:
	case CHIP_CAICOS:
		break;
	default:
		tmp |= S_008C00_VC_ENABLE(1);
		break;
	}
	tmp |= S_008C00_EXPORT_SRC_C(1);
	tmp |= S_008C00_CS_PRIO(0);
	tmp |= S_008C00_LS_PRIO(0);
	tmp |= S_008C00_HS_PRIO(0);
	tmp |= S_008C00_PS_PRIO(0);
	tmp |= S_008C00_VS_PRIO(0);
	tmp |= S_008C00_GS_PRIO(0);
	tmp |= S_008C00_ES_PRIO(0);

	evergreen_reg_set(res, R_008C00_SQ_CONFIG, tmp);

	evergreen_reg_set(res, R_008C04_SQ_GPR_RESOURCE_MGMT_1,
		S_008C04_NUM_CLAUSE_TEMP_GPRS(num_temp_gprs));
	evergreen_reg_set(res, R_008C08_SQ_GPR_RESOURCE_MGMT_2, 0);
	evergreen_reg_set(res, R_008C10_SQ_GLOBAL_GPR_RESOURCE_MGMT_1, 0);
	evergreen_reg_set(res, R_008C14_SQ_GLOBAL_GPR_RESOURCE_MGMT_2, 0);
	evergreen_reg_set(res, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));

	/* workaround for hw issues with dyn gpr - must set all limits to 240
	 * instead of 0, 0x1e == 240/8 */
	evergreen_reg_set(res, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
		S_028838_PS_GPRS(0x1e) |
		S_028838_VS_GPRS(0x1e) |
		S_028838_GS_GPRS(0x1e) |
		S_028838_ES_GPRS(0x1e) |
		S_028838_HS_GPRS(0x1e) |
		S_028838_LS_GPRS(0x1e));

	/* Enable all threads on all SIMDs. */
	evergreen_reg_set(res, R_008E20_SQ_STATIC_THREAD_MGMT1, 0xFFFFFFFF);
	evergreen_reg_set(res, R_008E24_SQ_STATIC_THREAD_MGMT2, 0xFFFFFFFF);
	evergreen_reg_set(res, R_008E28_SQ_STATIC_THREAD_MGMT3, 0xFFFFFFFF);

	evergreen_reg_set(res, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 0);
	tmp = S_008C1C_NUM_LS_THREADS(num_threads);
	evergreen_reg_set(res, R_008C1C_SQ_THREAD_RESOURCE_MGMT_2, tmp);
	evergreen_reg_set(res, R_008C20_SQ_STACK_RESOURCE_MGMT_1, 0);
	evergreen_reg_set(res, R_008C24_SQ_STACK_RESOURCE_MGMT_2, 0);
	tmp = S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries);
	evergreen_reg_set(res, R_008C28_SQ_STACK_RESOURCE_MGMT_3, tmp);

	evergreen_reg_set(res, R_0286CC_SPI_PS_IN_CONTROL_0, S_0286CC_LINEAR_GRADIENT_ENA(1));
	evergreen_reg_set(res, R_0286D0_SPI_PS_IN_CONTROL_1, 0);
	evergreen_reg_set(res, R_0286E4_SPI_PS_IN_CONTROL_2, 0);
	evergreen_reg_set(res, R_0286D8_SPI_INPUT_Z, 0);
	evergreen_reg_set(res, R_0286E0_SPI_BARYC_CNTL, 1 << 20);

	/* Expose thread and thread-group IDs to the shader and keep the
	 * input indices unpacked. */
	tmp = S_0286E8_TID_IN_GROUP_ENA | S_0286E8_TGID_ENA | S_0286E8_DISABLE_INDEX_PACK;
	evergreen_reg_set(res, R_0286E8_SPI_COMPUTE_INPUT_CNTL, tmp);

	tmp = S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1);
	evergreen_reg_set(res, R_028A40_VGT_GS_MODE, tmp);

	evergreen_reg_set(res, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	/* Depth and color output are unused in compute mode. */
	evergreen_reg_set(res, R_028800_DB_DEPTH_CONTROL, 0);
	evergreen_reg_set(res, R_02880C_DB_SHADER_CONTROL, 0);
	evergreen_reg_set(res, R_028000_DB_RENDER_CONTROL, S_028000_COLOR_DISABLE(1));
	evergreen_reg_set(res, R_02800C_DB_RENDER_OVERRIDE, 0);
}
void evergreen_init_compute_state_functions(struct r600_context *ctx)
{
ctx->context.create_compute_state = evergreen_create_compute_state;
ctx->context.delete_compute_state = evergreen_delete_compute_state;
ctx->context.bind_compute_state = evergreen_bind_compute_state;
// ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
ctx->context.set_compute_resources = evergreen_set_compute_resources;
ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view;
ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states;
ctx->context.set_global_binding = evergreen_set_global_binding;
ctx->context.launch_grid = evergreen_launch_grid;
}
/**
 * Create a PIPE_BIND_GLOBAL buffer backed by the compute memory pool.
 *
 * Returns NULL if either the wrapper struct or a chunk of the pool
 * cannot be allocated.
 *
 * Fixes: the CALLOC result was dereferenced without a NULL check, and
 * `screen` was assigned before the `*templ` struct copy, which clobbered
 * it (the copy now happens first).
 */
struct pipe_resource *r600_compute_global_buffer_create(
	struct pipe_screen *screen,
	const struct pipe_resource *templ)
{
	struct r600_resource_global* result;
	struct r600_screen* rscreen;
	int size_in_dw;

	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	if (result == NULL) {
		return NULL;
	}

	rscreen = (struct r600_screen*)screen;

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	result->base.b.b = *templ;
	result->base.b.b.screen = screen;
	pipe_reference_init(&result->base.b.b.reference, 1);

	/* The pool is managed in dwords; round the size up. */
	size_in_dw = (templ->width0+3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL)
	{
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}
/* Destroy a global buffer: give its chunk back to the pool and free the
 * wrapper struct. */
void r600_compute_global_buffer_destroy(
	struct pipe_screen *screen,
	struct pipe_resource *res)
{
	struct r600_resource_global* buffer;
	struct r600_screen* rscreen;

	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	buffer = (struct r600_resource_global*)res;
	rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}
/**
 * Map a global buffer for CPU access.
 *
 * Maps the pool's backing BO and returns a pointer adjusted to this
 * buffer's chunk plus transfer->box.x bytes.  Returns NULL if the map
 * fails.
 */
void* r600_compute_global_transfer_map(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
	assert(transfer->box.x >= 0);
	assert(transfer->box.y == 0);
	assert(transfer->box.z == 0);

	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)transfer->resource;

	uint32_t* map;
	/* NOTE(review): the whole pool BO is mapped, not just this chunk. */
	///TODO: do it better, mapping is not possible if the pool is too big
	if (!(map = ctx->ws->buffer_map(buffer->chunk->pool->bo->cs_buf,
			ctx->cs, transfer->usage))) {
		return NULL;
	}

	COMPUTE_DBG("buffer start: %lli\n", buffer->chunk->start_in_dw);

	/* start_in_dw indexes dwords (map is uint32_t*); box.x is in bytes. */
	return ((char*)(map + buffer->chunk->start_in_dw)) + transfer->box.x;
}
/* Unmap a global buffer (unmaps the whole pool backing BO). */
void r600_compute_global_transfer_unmap(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	struct r600_context *rctx;
	struct r600_resource_global* buffer;

	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);

	rctx = (struct r600_context *)ctx_;
	buffer = (struct r600_resource_global*)transfer->resource;

	rctx->ws->buffer_unmap(buffer->chunk->pool->bo->cs_buf);
}
/**
 * Allocate and fill a pipe_transfer object for a global buffer.
 *
 * Finalizes pending pool allocations first so the buffer has a valid
 * chunk before it is mapped.
 *
 * Fix: the original cast ctx_ twice into two identical locals (ctx and
 * rctx); a single local is used now.
 */
struct pipe_transfer * r600_compute_global_get_transfer(
	struct pipe_context *ctx_,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;
	struct pipe_transfer *transfer;

	compute_memory_finalize_pending(pool, ctx_);

	assert(resource->target == PIPE_BUFFER);

	transfer = util_slab_alloc(&ctx->pool_transfers);

	transfer->resource = resource;
	transfer->level = level;
	transfer->usage = usage;
	transfer->box = *box;
	/* Note strides are zero, this is ok for buffers, but not for
	 * textures 2d & higher at least.
	 */
	transfer->stride = 0;
	transfer->layer_stride = 0;
	transfer->data = NULL;

	return transfer;
}
/* Return a transfer object to the per-context slab allocator. */
void r600_compute_global_transfer_destroy(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	util_slab_free(&ctx->pool_transfers, transfer);
}
/* Flushing a mapped region back to the GPU is not implemented yet;
 * reaching this is a driver bug. */
void r600_compute_global_transfer_flush_region(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer,
	const struct pipe_box *box)
{
	assert(0 && "TODO");
}
/* Inline (transfer-less) writes to global buffers are not implemented
 * yet; reaching this is a driver bug. */
void r600_compute_global_transfer_inline_write(
	struct pipe_context *pipe,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	const void *data,
	unsigned stride,
	unsigned layer_stride)
{
	assert(0 && "TODO");
}

View file

@ -0,0 +1,69 @@
/*
* Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* Authors:
* Adam Rak <adam.rak@streamnovation.com>
*/
#ifndef EVERGREEN_COMPUTE_H
#define EVERGREEN_COMPUTE_H

#include "r600.h"
#include "r600_pipe.h"

struct evergreen_compute_resource;

/* Compute state objects and dispatch.
 * Fix: the duplicated `const const` qualifier on the pipe_compute_state
 * parameter has been reduced to a single `const`. */
void *evergreen_create_compute_state(struct pipe_context *ctx, const struct pipe_compute_state *cso);
void evergreen_delete_compute_state(struct pipe_context *ctx, void *state);
void evergreen_direct_dispatch( struct pipe_context *context, const uint *block_layout, const uint *grid_layout);
void evergreen_compute_upload_input(struct pipe_context *context, const uint *block_layout, const uint *grid_layout, const void *input);
void evergreen_compute_init_config(struct r600_context *rctx);
void evergreen_init_compute_state_functions(struct r600_context *rctx);

/* Global (PIPE_BIND_GLOBAL) buffer management backed by the compute pool. */
struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen, const struct pipe_resource *templ);
void r600_compute_global_buffer_destroy(struct pipe_screen *screen, struct pipe_resource *res);
void* r600_compute_global_transfer_map(struct pipe_context *ctx, struct pipe_transfer* transfer);
void r600_compute_global_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer* transfer);
struct pipe_transfer * r600_compute_global_get_transfer(struct pipe_context *, struct pipe_resource *, unsigned level,
		unsigned usage, const struct pipe_box *);
void r600_compute_global_transfer_destroy(struct pipe_context *, struct pipe_transfer *);
void r600_compute_global_transfer_flush_region( struct pipe_context *, struct pipe_transfer *, const struct pipe_box *);
void r600_compute_global_transfer_inline_write( struct pipe_context *, struct pipe_resource *, unsigned level,
		unsigned usage, const struct pipe_box *, const void *data, unsigned stride, unsigned layer_stride);
/**
 * Print a debug message when the R600_COMPUTE_DEBUG environment option
 * is enabled.
 *
 * Fix: `check_debug` was never set, so the option was re-read on every
 * call; the result is now cached after the first lookup.
 */
static inline void COMPUTE_DBG(const char *fmt, ...)
{
	static bool check_debug = false, debug = false;

	if (!check_debug) {
		debug = debug_get_bool_option("R600_COMPUTE_DEBUG", FALSE);
		check_debug = true;
	}

	if (debug) {
		va_list ap;
		va_start(ap, fmt);
		_debug_vprintf(fmt, ap);
		va_end(ap);
	}
}
#endif

View file

@ -0,0 +1,830 @@
/*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* Authors:
* Adam Rak <adam.rak@streamnovation.com>
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/u_double_list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "r600.h"
#include "r600_resource.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreend.h"
#include "evergreen_compute_internal.h"
#include "r600_hw_context_priv.h"
/**
 * Total number of compute resource slots: the sum of the per-resource
 * counts declared by the X-macro list in compute_resource.def.
 */
int get_compute_resource_num(void)
{
	int num = 0;
#define DECL_COMPUTE_RESOURCE(name, n) num += n;
#include "compute_resource.def"
#undef DECL_COMPUTE_RESOURCE
	return num;
}
/* Append one raw dword to the resource's local command buffer. */
void evergreen_emit_raw_value(
	struct evergreen_compute_resource* res,
	unsigned value)
{
	res->cs[res->cs_end++] = value;
}
/* Append one raw dword directly to the context's command stream. */
void evergreen_emit_ctx_value(struct r600_context *ctx, unsigned value)
{
	ctx->cs->buf[ctx->cs->cdw++] = value;
}
/* Emit a multi-dword register write: `size` bytes from `array` go to
 * consecutive registers starting at `index`. */
void evergreen_mult_reg_set_(
	struct evergreen_compute_resource* res,
	int index,
	u32* array,
	int size)
{
	int dw;

	evergreen_emit_raw_reg_set(res, index, size / 4);

	/* One dword per 4 bytes of payload. */
	for (dw = 0; dw * 4 < size; dw++) {
		res->cs[res->cs_end++] = array[dw];
	}
}
/* Emit a single-register write: packet header plus one value dword. */
void evergreen_reg_set(
	struct evergreen_compute_resource* res,
	unsigned index,
	unsigned value)
{
	evergreen_emit_raw_reg_set(res, index, 1);
	res->cs[res->cs_end++] = value;
}
/**
 * Look up and reset the resource slot for (res_code, offset_index).
 *
 * The base index and count for each resource code come from the X-macro
 * list in compute_resource.def.  The returned slot is marked enabled with
 * an empty command buffer and no relocations.
 *
 * Fix: the deprecated bzero() call was replaced by standard memset().
 */
struct evergreen_compute_resource* get_empty_res(
	struct r600_pipe_compute* pipe,
	enum evergreen_compute_resources res_code,
	int offset_index)
{
	int code_index = -1;
	int code_size = -1;

	{
		int i = 0;
#define DECL_COMPUTE_RESOURCE(name, n) if (COMPUTE_RESOURCE_ ## name == res_code) {code_index = i; code_size = n;} i += n;
#include "compute_resource.def"
#undef DECL_COMPUTE_RESOURCE
	}

	assert(code_index != -1 && "internal error: resouce index not found");
	assert(offset_index < code_size && "internal error: overindexing resource");

	int index = code_index + offset_index;

	struct evergreen_compute_resource* res = &pipe->resources[index];

	res->enabled = true;
	res->bo = NULL;
	res->cs_end = 0;
	memset(&res->do_reloc, 0, sizeof(res->do_reloc));

	return res;
}
/**
 * Emit the header for a write of `num` consecutive register dwords.
 *
 * Chooses the SET_* packet type from the register range `index` falls in.
 * Registers outside all known ranges are written with a PKT0, whose header
 * is only one dword; cs_end is decremented there so the unconditional
 * `+= 2` below advances by exactly one dword in that case.
 */
void evergreen_emit_raw_reg_set(
	struct evergreen_compute_resource* res,
	unsigned index,
	int num)
{
	res->enabled = 1;
	int cs_end = res->cs_end;

	if (index >= EVERGREEN_CONFIG_REG_OFFSET
			&& index < EVERGREEN_CONFIG_REG_END) {
		res->cs[cs_end] = PKT3C(PKT3_SET_CONFIG_REG, num, 0);
		res->cs[cs_end+1] = (index - EVERGREEN_CONFIG_REG_OFFSET) >> 2;
	} else if (index >= EVERGREEN_CONTEXT_REG_OFFSET
			&& index < EVERGREEN_CONTEXT_REG_END) {
		res->cs[cs_end] = PKT3C(PKT3_SET_CONTEXT_REG, num, 0);
		res->cs[cs_end+1] = (index - EVERGREEN_CONTEXT_REG_OFFSET) >> 2;
	} else if (index >= EVERGREEN_RESOURCE_OFFSET
			&& index < EVERGREEN_RESOURCE_END) {
		res->cs[cs_end] = PKT3C(PKT3_SET_RESOURCE, num, 0);
		res->cs[cs_end+1] = (index - EVERGREEN_RESOURCE_OFFSET) >> 2;
	} else if (index >= EVERGREEN_SAMPLER_OFFSET
			&& index < EVERGREEN_SAMPLER_END) {
		res->cs[cs_end] = PKT3C(PKT3_SET_SAMPLER, num, 0);
		res->cs[cs_end+1] = (index - EVERGREEN_SAMPLER_OFFSET) >> 2;
	} else if (index >= EVERGREEN_CTL_CONST_OFFSET
			&& index < EVERGREEN_CTL_CONST_END) {
		res->cs[cs_end] = PKT3C(PKT3_SET_CTL_CONST, num, 0);
		res->cs[cs_end+1] = (index - EVERGREEN_CTL_CONST_OFFSET) >> 2;
	} else if (index >= EVERGREEN_LOOP_CONST_OFFSET
			&& index < EVERGREEN_LOOP_CONST_END) {
		res->cs[cs_end] = PKT3C(PKT3_SET_LOOP_CONST, num, 0);
		res->cs[cs_end+1] = (index - EVERGREEN_LOOP_CONST_OFFSET) >> 2;
	} else if (index >= EVERGREEN_BOOL_CONST_OFFSET
			&& index < EVERGREEN_BOOL_CONST_END) {
		res->cs[cs_end] = PKT3C(PKT3_SET_BOOL_CONST, num, 0);
		res->cs[cs_end+1] = (index - EVERGREEN_BOOL_CONST_OFFSET) >> 2;
	} else {
		/* PKT0 encodes the register offset in its header dword. */
		res->cs[cs_end] = PKT0(index, num-1);
		res->cs_end--;
	}

	res->cs_end += 2;
}
/* Request that a relocation be emitted at the current command-buffer
 * position when this resource is flushed to the real CS. */
void evergreen_emit_force_reloc(struct evergreen_compute_resource* res)
{
	res->do_reloc[res->cs_end] += 1;
}
/**
 * Emit a register-write header of `num` dwords directly into the context
 * command stream (same range-to-packet mapping as
 * evergreen_emit_raw_reg_set, but writing to ctx->cs).
 */
void evergreen_emit_ctx_reg_set(
	struct r600_context *ctx,
	unsigned index,
	int num)
{
	if (index >= EVERGREEN_CONFIG_REG_OFFSET
			&& index < EVERGREEN_CONFIG_REG_END) {
		ctx->cs->buf[ctx->cs->cdw++] = PKT3C(PKT3_SET_CONFIG_REG, num, 0);
		ctx->cs->buf[ctx->cs->cdw++] = (index - EVERGREEN_CONFIG_REG_OFFSET) >> 2;
	} else if (index >= EVERGREEN_CONTEXT_REG_OFFSET
			&& index < EVERGREEN_CONTEXT_REG_END) {
		ctx->cs->buf[ctx->cs->cdw++] = PKT3C(PKT3_SET_CONTEXT_REG, num, 0);
		ctx->cs->buf[ctx->cs->cdw++] = (index - EVERGREEN_CONTEXT_REG_OFFSET) >> 2;
	} else if (index >= EVERGREEN_RESOURCE_OFFSET
			&& index < EVERGREEN_RESOURCE_END) {
		ctx->cs->buf[ctx->cs->cdw++] = PKT3C(PKT3_SET_RESOURCE, num, 0);
		ctx->cs->buf[ctx->cs->cdw++] = (index - EVERGREEN_RESOURCE_OFFSET) >> 2;
	} else if (index >= EVERGREEN_SAMPLER_OFFSET
			&& index < EVERGREEN_SAMPLER_END) {
		ctx->cs->buf[ctx->cs->cdw++] = PKT3C(PKT3_SET_SAMPLER, num, 0);
		ctx->cs->buf[ctx->cs->cdw++] = (index - EVERGREEN_SAMPLER_OFFSET) >> 2;
	} else if (index >= EVERGREEN_CTL_CONST_OFFSET
			&& index < EVERGREEN_CTL_CONST_END) {
		ctx->cs->buf[ctx->cs->cdw++] = PKT3C(PKT3_SET_CTL_CONST, num, 0);
		ctx->cs->buf[ctx->cs->cdw++] = (index - EVERGREEN_CTL_CONST_OFFSET) >> 2;
	} else if (index >= EVERGREEN_LOOP_CONST_OFFSET
			&& index < EVERGREEN_LOOP_CONST_END) {
		ctx->cs->buf[ctx->cs->cdw++] = PKT3C(PKT3_SET_LOOP_CONST, num, 0);
		ctx->cs->buf[ctx->cs->cdw++] = (index - EVERGREEN_LOOP_CONST_OFFSET) >> 2;
	} else if (index >= EVERGREEN_BOOL_CONST_OFFSET
			&& index < EVERGREEN_BOOL_CONST_END) {
		ctx->cs->buf[ctx->cs->cdw++] = PKT3C(PKT3_SET_BOOL_CONST, num, 0);
		ctx->cs->buf[ctx->cs->cdw++] = (index - EVERGREEN_BOOL_CONST_OFFSET) >> 2;
	} else {
		/* Unknown range: fall back to a one-dword PKT0 header. */
		ctx->cs->buf[ctx->cs->cdw++] = PKT0(index, num-1);
	}
}
/* Emit a NOP packet followed by a relocation index for `bo` into the
 * context command stream.  The reloc index is fetched into a temporary
 * first because r600_context_bo_reloc may touch CS state. */
void evergreen_emit_ctx_reloc(
	struct r600_context *ctx,
	struct r600_resource *bo,
	enum radeon_bo_usage usage)
{
	assert(bo);

	ctx->cs->buf[ctx->cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	u32 rr = r600_context_bo_reloc(ctx, bo, usage);
	ctx->cs->buf[ctx->cs->cdw++] = rr;
}
/**
 * Emit a SURFACE_SYNC packet flushing the requested caches for `bo`.
 *
 * `size` is the byte range to make coherent (0 or 0xffffffff means the
 * full address space, in which case no relocation is emitted); `flags`
 * selects which caches to flush (TC/VC/SH and, for CB flushes, which
 * color buffer via bits 8..11).
 */
void evergreen_set_buffer_sync(
	struct r600_context *ctx,
	struct r600_resource* bo,
	int size,
	int flags,
	enum radeon_bo_usage usage)
{
	assert(bo);
	int32_t cp_coher_size = 0;

	/* CP_COHER_SIZE is in 256-byte units. */
	if (size == 0xffffffff || size == 0) {
		cp_coher_size = 0xffffffff;
	}
	else {
		cp_coher_size = ((size + 255) >> 8);
	}

	uint32_t sync_flags = 0;

	if ((flags & COMPUTE_RES_TC_FLUSH) == COMPUTE_RES_TC_FLUSH) {
		sync_flags |= S_0085F0_TC_ACTION_ENA(1);
	}

	if ((flags & COMPUTE_RES_VC_FLUSH) == COMPUTE_RES_VC_FLUSH) {
		sync_flags |= S_0085F0_VC_ACTION_ENA(1);
	}

	if ((flags & COMPUTE_RES_SH_FLUSH) == COMPUTE_RES_SH_FLUSH) {
		sync_flags |= S_0085F0_SH_ACTION_ENA(1);
	}

	if ((flags & COMPUTE_RES_CB_FLUSH(0)) == COMPUTE_RES_CB_FLUSH(0)) {
		sync_flags |= S_0085F0_CB_ACTION_ENA(1);

		/* Bits 8..11 of `flags` carry the color buffer index. */
		switch((flags >> 8) & 0xF) {
		case 0:
			sync_flags |= S_0085F0_CB0_DEST_BASE_ENA(1);
			break;
		case 1:
			sync_flags |= S_0085F0_CB1_DEST_BASE_ENA(1);
			break;
		case 2:
			sync_flags |= S_0085F0_CB2_DEST_BASE_ENA(1);
			break;
		case 3:
			sync_flags |= S_0085F0_CB3_DEST_BASE_ENA(1);
			break;
		case 4:
			sync_flags |= S_0085F0_CB4_DEST_BASE_ENA(1);
			break;
		case 5:
			sync_flags |= S_0085F0_CB5_DEST_BASE_ENA(1);
			break;
		case 6:
			sync_flags |= S_0085F0_CB6_DEST_BASE_ENA(1);
			break;
		case 7:
			sync_flags |= S_0085F0_CB7_DEST_BASE_ENA(1);
			break;
		case 8:
			sync_flags |= S_0085F0_CB8_DEST_BASE_ENA(1);
			break;
		case 9:
			sync_flags |= S_0085F0_CB9_DEST_BASE_ENA(1);
			break;
		case 10:
			sync_flags |= S_0085F0_CB10_DEST_BASE_ENA(1);
			break;
		case 11:
			sync_flags |= S_0085F0_CB11_DEST_BASE_ENA(1);
			break;
		default:
			assert(0);
		}
	}

	int32_t poll_interval = 10;

	/* SURFACE_SYNC payload: flags, size, base (0), poll interval. */
	ctx->cs->buf[ctx->cs->cdw++] = PKT3(PKT3_SURFACE_SYNC, 3, 0);
	ctx->cs->buf[ctx->cs->cdw++] = sync_flags;
	ctx->cs->buf[ctx->cs->cdw++] = cp_coher_size;
	ctx->cs->buf[ctx->cs->cdw++] = 0;
	ctx->cs->buf[ctx->cs->cdw++] = poll_interval;

	if (cp_coher_size != 0xffffffff) {
		evergreen_emit_ctx_reloc(ctx, bo, usage);
	}
}
/* Translate the BO's pipe format into the CB hardware format/number type.
 * Returns 1 on success, 0 for (not yet supported) formats. */
int evergreen_compute_get_gpu_format(
	struct number_type_and_format* fmt,
	struct r600_resource *bo)
{
	switch (bo->b.b.format) {
	case PIPE_FORMAT_R8_UNORM:
	case PIPE_FORMAT_R32_UNORM:
		fmt->format = V_028C70_COLOR_32;
		fmt->number_type = V_028C70_NUMBER_UNORM;
		break;

	case PIPE_FORMAT_R32_FLOAT:
		fmt->format = V_028C70_COLOR_32_FLOAT;
		fmt->number_type = V_028C70_NUMBER_FLOAT;
		break;

	case PIPE_FORMAT_R32G32B32A32_FLOAT:
		fmt->format = V_028C70_COLOR_32_32_32_32_FLOAT;
		fmt->number_type = V_028C70_NUMBER_FLOAT;
		break;

	///TODO: other formats...
	default:
		return 0;
	}

	/* Common to all supported formats. */
	fmt->num_format_all = 0;
	return 1;
}
/**
 * Bind `bo` as Random Access Target (RAT) `id`.
 *
 * Programs the CB_COLOR* register block for the RAT: RATs 0-7 use the
 * full 0x3c-dword register stride, RATs 8-11 the shorter 0x1c stride
 * (no CMASK/FMASK).  `start` is the byte offset into the BO (must be
 * 256-byte aligned), `size` the byte size (dword aligned).
 */
void evergreen_set_rat(
	struct r600_pipe_compute *pipe,
	int id,
	struct r600_resource* bo,
	int start,
	int size)
{
	assert(id < 12);
	assert((size & 3) == 0);
	assert((start & 0xFF) == 0);

	int offset;
	COMPUTE_DBG("bind rat: %i \n", id);

	/* Register stride differs between RATs 0-7 and 8-11. */
	if (id < 8) {
		offset = id*0x3c;
	}
	else {
		offset = 8*0x3c + (id-8)*0x1c;
	}

	int linear = 0;
	if (bo->b.b.height0 <= 1 && bo->b.b.depth0 <= 1
			&& bo->b.b.target == PIPE_BUFFER) {
		linear = 1;
	}

	struct evergreen_compute_resource* res =
		get_empty_res(pipe, COMPUTE_RESOURCE_RAT, id);

	evergreen_emit_force_reloc(res);

	evergreen_reg_set(res, R_028C64_CB_COLOR0_PITCH, 0); ///TODO: for 2D?
	evergreen_reg_set(res, R_028C68_CB_COLOR0_SLICE, 0);

	struct number_type_and_format fmt;

	/* Typeless BOs are treated as raw 32-bit words. */
	///default config
	if (bo->b.b.format == PIPE_FORMAT_NONE) {
		fmt.format = V_028C70_COLOR_32;
		fmt.number_type = V_028C70_NUMBER_FLOAT;
	} else {
		evergreen_compute_get_gpu_format(&fmt, bo);
	}

	if (linear) {
		evergreen_reg_set(res,
			R_028C70_CB_COLOR0_INFO, S_028C70_RAT(1)
			| S_028C70_ARRAY_MODE(V_028C70_ARRAY_LINEAR_ALIGNED)
			| S_028C70_FORMAT(fmt.format)
			| S_028C70_NUMBER_TYPE(fmt.number_type)
		);

		evergreen_emit_force_reloc(res);
	} else {
		assert(0 && "TODO");
		///TODO
//		evergreen_reg_set(res, R_028C70_CB_COLOR0_INFO, S_028C70_RAT(1) | S_028C70_ARRAY_MODE(????));
//		evergreen_emit_force_reloc(res);
	}

	evergreen_reg_set(res, R_028C74_CB_COLOR0_ATTRIB, S_028C74_NON_DISP_TILING_ORDER(1));
	evergreen_emit_force_reloc(res);

	if (linear) {
		/* XXX: Why are we using size instead of bo->b.b.b.width0 ? */
		evergreen_reg_set(res, R_028C78_CB_COLOR0_DIM, size);
	} else {
		evergreen_reg_set(res, R_028C78_CB_COLOR0_DIM,
			S_028C78_WIDTH_MAX(bo->b.b.width0)
			| S_028C78_HEIGHT_MAX(bo->b.b.height0));
	}

	/* Only RATs 0-7 have CMASK/FMASK registers. */
	if (id < 8) {
		evergreen_reg_set(res, R_028C7C_CB_COLOR0_CMASK, 0);
		evergreen_emit_force_reloc(res);
		evergreen_reg_set(res, R_028C84_CB_COLOR0_FMASK, 0);
		evergreen_emit_force_reloc(res);
	}

	/* Base address is in 256-byte units. */
	evergreen_reg_set(res, R_028C60_CB_COLOR0_BASE + offset, start >> 8);

	res->bo = bo;
	res->usage = RADEON_USAGE_READWRITE;
	res->coher_bo_size = size;
	res->flags = COMPUTE_RES_CB_FLUSH(id);
}
/**
 * Program local data share (LDS) allocation for the LS stage.
 *
 * `size` is packed into the low bits of SQ_LDS_ALLOC, `num_waves` into
 * bits 14+.
 */
void evergreen_set_lds(
	struct r600_pipe_compute *pipe,
	int num_lds,
	int size,
	int num_waves)
{
	struct evergreen_compute_resource* res =
		get_empty_res(pipe, COMPUTE_RESOURCE_LDS, 0);

	evergreen_reg_set(res, R_008E2C_SQ_LDS_RESOURCE_MGMT,
		S_008E2C_NUM_LS_LDS(num_lds));
	evergreen_reg_set(res, CM_R_0288E8_SQ_LDS_ALLOC, size | num_waves << 14);
}
/* Program the global data share (GDS) window: base address, size, and
 * one ordered wave per shader engine. */
void evergreen_set_gds(
	struct r600_pipe_compute *pipe,
	uint32_t addr,
	uint32_t size)
{
	struct evergreen_compute_resource* res =
		get_empty_res(pipe, COMPUTE_RESOURCE_GDS, 0);

	evergreen_reg_set(res, R_028728_GDS_ORDERED_WAVE_PER_SE, 1);
	evergreen_reg_set(res, R_028720_GDS_ADDR_BASE, addr);
	evergreen_reg_set(res, R_028724_GDS_ADDR_SIZE, size);
}
/**
 * Program the SX memory export window into `bo`.
 *
 * A zero `size` disables the export; otherwise the BO is recorded for
 * relocation with write usage.
 */
void evergreen_set_export(
	struct r600_pipe_compute *pipe,
	struct r600_resource* bo,
	int offset, int size)
{
#define SX_MEMORY_EXPORT_BASE 0x9010
#define SX_MEMORY_EXPORT_SIZE 0x9014

	struct evergreen_compute_resource* res =
		get_empty_res(pipe, COMPUTE_RESOURCE_EXPORT, 0);

	evergreen_reg_set(res, SX_MEMORY_EXPORT_SIZE, size);

	if (size) {
		evergreen_reg_set(res, SX_MEMORY_EXPORT_BASE, offset);
		res->bo = bo;
		res->usage = RADEON_USAGE_WRITE;
		res->coher_bo_size = size;
		res->flags = 0;
	}
}
/**
 * Set loop constant `id` for the compute stage.
 *
 * The dword packs count (12 bits), init (8 bits at 12) and inc (8 bits
 * at 24).
 */
void evergreen_set_loop_const(
	struct r600_pipe_compute *pipe,
	int id, int count, int init, int inc) {
	struct evergreen_compute_resource* res =
		get_empty_res(pipe, COMPUTE_RESOURCE_LOOP, id);

	assert(id < 32);
	assert(count <= 0xFFF);
	assert(init <= 0xFF);
	assert(inc <= 0xFF);

	/* Compute shaders use LOOP_CONST registers SQ_LOOP_CONST_160 to
	 * SQ_LOOP_CONST_191 */
	evergreen_reg_set(res, R_03A200_SQ_LOOP_CONST_0 + (160 * 4) + (id * 4),
		count | init << 12 | inc << 24);
}
/**
 * Program the LS temporary ring for shader engine `se`.
 *
 * Selects the SE via GRBM_GFX_INDEX, programs the ring size (and base,
 * when non-zero), then restores broadcast writes to all SEs.
 *
 * Fix: the two consecutive identical `if (size)` blocks of the original
 * were merged into one (statement order preserved).
 */
void evergreen_set_tmp_ring(
	struct r600_pipe_compute *pipe,
	struct r600_resource* bo,
	int offset, int size, int se)
{
#define SQ_LSTMP_RING_BASE 0x00008e10
#define SQ_LSTMP_RING_SIZE 0x00008e14
#define GRBM_GFX_INDEX 0x802C
#define INSTANCE_INDEX(x) ((x) << 0)
#define SE_INDEX(x) ((x) << 16)
#define INSTANCE_BROADCAST_WRITES (1 << 30)
#define SE_BROADCAST_WRITES (1 << 31)

	struct evergreen_compute_resource* res =
		get_empty_res(pipe, COMPUTE_RESOURCE_TMPRING, se);

	/* Address only the requested shader engine. */
	evergreen_reg_set(res,
		GRBM_GFX_INDEX,INSTANCE_INDEX(0)
		| SE_INDEX(se)
		| INSTANCE_BROADCAST_WRITES);
	evergreen_reg_set(res, SQ_LSTMP_RING_SIZE, size);

	if (size) {
		assert(bo);

		evergreen_reg_set(res, SQ_LSTMP_RING_BASE, offset);
		res->bo = bo;
		res->usage = RADEON_USAGE_WRITE;
		res->coher_bo_size = 0;
		res->flags = 0;
		evergreen_emit_force_reloc(res);
	}

	/* Restore broadcast writes to all SEs and instances. */
	evergreen_reg_set(res,
		GRBM_GFX_INDEX,INSTANCE_INDEX(0)
		| SE_INDEX(0)
		| INSTANCE_BROADCAST_WRITES
		| SE_BROADCAST_WRITES);
}
/* Return the byte-swap mode required for `colorformat` on big-endian
 * hosts; little-endian builds never swap. */
static uint32_t r600_colorformat_endian_swap(uint32_t colorformat)
{
	if (!R600_BIG_ENDIAN)
		return ENDIAN_NONE;

	switch (colorformat) {
	/* Sub-byte and 8-bit formats need no swapping. */
	case V_028C70_COLOR_4_4:
	case V_028C70_COLOR_8:
		return ENDIAN_NONE;

	/* Formats made of 16-bit units. */
	case V_028C70_COLOR_5_6_5:
	case V_028C70_COLOR_1_5_5_5:
	case V_028C70_COLOR_4_4_4_4:
	case V_028C70_COLOR_16:
	case V_028C70_COLOR_8_8:
	case V_028C70_COLOR_16_16_16_16:
	case V_028C70_COLOR_16_16_16_16_FLOAT:
		return ENDIAN_8IN16;

	/* Formats made of 32-bit units. */
	case V_028C70_COLOR_8_8_8_8:
	case V_028C70_COLOR_2_10_10_10:
	case V_028C70_COLOR_8_24:
	case V_028C70_COLOR_24_8:
	case V_028C70_COLOR_32_FLOAT:
	case V_028C70_COLOR_16_16_FLOAT:
	case V_028C70_COLOR_16_16:
	case V_028C70_COLOR_32_32_FLOAT:
	case V_028C70_COLOR_32_32:
	case V_028C70_COLOR_X24_8_32_FLOAT:
	case V_028C70_COLOR_32_32_32_FLOAT:
	case V_028C70_COLOR_32_32_32_32_FLOAT:
	case V_028C70_COLOR_32_32_32_32:
		return ENDIAN_8IN32;

	default:
		return ENDIAN_NONE; /* Unsupported. */
	}
}
/* Map a pipe texture target to the SQ_TEX_DIM hardware encoding.
 * Unknown targets fall back to 1D. */
static unsigned r600_tex_dim(unsigned dim)
{
	switch (dim) {
	case PIPE_TEXTURE_1D_ARRAY:
		return V_030000_SQ_TEX_DIM_1D_ARRAY;
	case PIPE_TEXTURE_2D:
	case PIPE_TEXTURE_RECT:
		return V_030000_SQ_TEX_DIM_2D;
	case PIPE_TEXTURE_2D_ARRAY:
		return V_030000_SQ_TEX_DIM_2D_ARRAY;
	case PIPE_TEXTURE_3D:
		return V_030000_SQ_TEX_DIM_3D;
	case PIPE_TEXTURE_CUBE:
		return V_030000_SQ_TEX_DIM_CUBEMAP;
	case PIPE_TEXTURE_1D:
	default:
		return V_030000_SQ_TEX_DIM_1D;
	}
}
/**
 * Bind `bo` as vertex/fetch resource `id` for the compute shader.
 *
 * Emits a SET_RESOURCE packet describing the buffer (base address, size,
 * format, destination swizzle) and records the BO for relocation plus
 * TC/VC cache flushing.
 *
 * Fix: the debug printf passed the 64-bit `offset` to a "%i" specifier
 * (undefined varargs behavior); it now uses "%llu" with an explicit cast.
 */
void evergreen_set_vtx_resource(
	struct r600_pipe_compute *pipe,
	struct r600_resource* bo,
	int id, uint64_t offset, int writable)
{
	assert(id < 16);
	uint32_t sq_vtx_constant_word2, sq_vtx_constant_word3, sq_vtx_constant_word4;
	struct number_type_and_format fmt;

	fmt.format = 0;

	assert(bo->b.b.height0 <= 1);
	assert(bo->b.b.depth0 <= 1);

	int e = evergreen_compute_get_gpu_format(&fmt, bo);
	assert(e && "unknown format");
	(void)e; /* silence unused-variable warning in NDEBUG builds */

	struct evergreen_compute_resource* res =
		get_empty_res(pipe, COMPUTE_RESOURCE_VERT, id);

	unsigned size = bo->b.b.width0;
	unsigned stride = 1;

//	size = (size * util_format_get_blockwidth(bo->b.b.b.format) *
//	util_format_get_blocksize(bo->b.b.b.format));

	COMPUTE_DBG("id: %i vtx size: %i byte, width0: %i elem\n",
		id, size, bo->b.b.width0);

	sq_vtx_constant_word2 =
		S_030008_BASE_ADDRESS_HI(offset >> 32) |
		S_030008_STRIDE(stride) |
		S_030008_DATA_FORMAT(fmt.format) |
		S_030008_NUM_FORMAT_ALL(fmt.num_format_all) |
		S_030008_ENDIAN_SWAP(0);

	COMPUTE_DBG("%08X %llu %i %i %i\n", sq_vtx_constant_word2,
		(unsigned long long)offset, stride, fmt.format,
		fmt.num_format_all);

	sq_vtx_constant_word3 =
		S_03000C_DST_SEL_X(0) |
		S_03000C_DST_SEL_Y(1) |
		S_03000C_DST_SEL_Z(2) |
		S_03000C_DST_SEL_W(3);

	sq_vtx_constant_word4 = 0;

	/* SET_RESOURCE with 8 payload dwords; fetch resources start at
	 * resource slot 816. */
	evergreen_emit_raw_value(res, PKT3C(PKT3_SET_RESOURCE, 8, 0));
	evergreen_emit_raw_value(res, (id+816)*32 >> 2);
	evergreen_emit_raw_value(res, (unsigned)((offset) & 0xffffffff));
	evergreen_emit_raw_value(res, size - 1);
	evergreen_emit_raw_value(res, sq_vtx_constant_word2);
	evergreen_emit_raw_value(res, sq_vtx_constant_word3);
	evergreen_emit_raw_value(res, sq_vtx_constant_word4);
	evergreen_emit_raw_value(res, 0);
	evergreen_emit_raw_value(res, 0);
	evergreen_emit_raw_value(res, S_03001C_TYPE(V_03001C_SQ_TEX_VTX_VALID_BUFFER));

	res->bo = bo;
	if (writable) {
		res->usage = RADEON_USAGE_READWRITE;
	}
	else {
		res->usage = RADEON_USAGE_READ;
	}
	res->coher_bo_size = size;
	res->flags = COMPUTE_RES_TC_FLUSH | COMPUTE_RES_VC_FLUSH;
}
/**
 * Bind a sampler view as texture fetch resource `id`.
 *
 * Translates the pipe format to the hardware texture format and emits a
 * SET_RESOURCE packet with the texture geometry, tiling, swizzle and
 * endian-swap state; the backing BO is recorded for relocation and TC
 * flush.
 */
void evergreen_set_tex_resource(
	struct r600_pipe_compute *pipe,
	struct r600_pipe_sampler_view* view,
	int id)
{
	struct evergreen_compute_resource* res =
		get_empty_res(pipe, COMPUTE_RESOURCE_TEX, id);
	struct r600_resource_texture *tmp =
		(struct r600_resource_texture*)view->base.texture;

	unsigned format, endian;
	uint32_t word4 = 0, yuv_format = 0, pitch = 0;
	unsigned char swizzle[4], array_mode = 0, tile_type = 0;
	unsigned height, depth;

	/* Identity swizzle. */
	swizzle[0] = 0;
	swizzle[1] = 1;
	swizzle[2] = 2;
	swizzle[3] = 3;

	format = r600_translate_texformat((struct pipe_screen *)pipe->ctx->screen,
		view->base.format, swizzle, &word4, &yuv_format);

	if (format == ~0) {
		format = 0;
	}

	endian = r600_colorformat_endian_swap(format);

	height = view->base.texture->height0;
	depth = view->base.texture->depth0;

	/* Pitch in blocks, aligned to 8 (hardware requirement). */
	pitch = align(tmp->pitch_in_blocks[0] *
		util_format_get_blockwidth(tmp->real_format), 8);
	array_mode = tmp->array_mode[0];
	tile_type = tmp->tile_type;

	assert(view->base.texture->target != PIPE_TEXTURE_1D_ARRAY);
	assert(view->base.texture->target != PIPE_TEXTURE_2D_ARRAY);

	/* SET_RESOURCE with 8 payload dwords; fetch resources start at
	 * resource slot 816. */
	evergreen_emit_raw_value(res, PKT3C(PKT3_SET_RESOURCE, 8, 0));
	evergreen_emit_raw_value(res, (id+816)*32 >> 2); ///TODO: check this line
	evergreen_emit_raw_value(res,
		(S_030000_DIM(r600_tex_dim(view->base.texture->target)) |
		S_030000_PITCH((pitch / 8) - 1) |
		S_030000_NON_DISP_TILING_ORDER(tile_type) |
		S_030000_TEX_WIDTH(view->base.texture->width0 - 1)));
	evergreen_emit_raw_value(res, (S_030004_TEX_HEIGHT(height - 1) |
		S_030004_TEX_DEPTH(depth - 1) |
		S_030004_ARRAY_MODE(array_mode)));
	/* Base and mip address, both in 256-byte units (level 0 only). */
	evergreen_emit_raw_value(res, tmp->offset[0] >> 8);
	evergreen_emit_raw_value(res, tmp->offset[0] >> 8);
	evergreen_emit_raw_value(res, (word4 |
		S_030010_SRF_MODE_ALL(V_030010_SRF_MODE_ZERO_CLAMP_MINUS_ONE) |
		S_030010_ENDIAN_SWAP(endian) |
		S_030010_BASE_LEVEL(0)));
	evergreen_emit_raw_value(res, (S_030014_LAST_LEVEL(0) |
		S_030014_BASE_ARRAY(0) |
		S_030014_LAST_ARRAY(0)));
	evergreen_emit_raw_value(res, (S_030018_MAX_ANISO(4 /* max 16 samples */)));
	evergreen_emit_raw_value(res,
		S_03001C_TYPE(V_03001C_SQ_TEX_VTX_VALID_TEXTURE)
		| S_03001C_DATA_FORMAT(format));

	res->bo = (struct r600_resource*)view->base.texture;
	res->usage = RADEON_USAGE_READ;
	res->coher_bo_size = tmp->offset[0] + util_format_get_blockwidth(tmp->real_format)*view->base.texture->width0*height*depth;
	res->flags = COMPUTE_RES_TC_FLUSH;

	/* Two relocations: base address and mip address. */
	evergreen_emit_force_reloc(res);
	evergreen_emit_force_reloc(res);
}
/**
 * Emit the state for one compute texture sampler into the resource's
 * command buffer as a SET_SAMPLER packet (header + 3 state dwords).
 *
 * \param pipe     compute shader whose resource table receives the packet
 * \param sampler  gallium sampler state to translate into SQ_TEX_SAMPLER words
 * \param id       sampler slot index
 */
void evergreen_set_sampler_resource(
	struct r600_pipe_compute *pipe,
	struct compute_sampler_state *sampler,
	int id)
{
	struct evergreen_compute_resource* res =
		get_empty_res(pipe, COMPUTE_RESOURCE_SAMPLER, id);

	/* When anisotropic filtering is requested, OR 2 into the XY filter
	 * fields; presumably this selects the anisotropic variant of the
	 * hw filter encoding — TODO confirm against r600_tex_filter(). */
	unsigned aniso_flag_offset = sampler->state.max_anisotropy > 1 ? 2 : 0;

	evergreen_emit_raw_value(res, PKT3C(PKT3_SET_SAMPLER, 3, 0));
	/* Register offset within the sampler state block: 3 dwords per
	 * sampler; the +90 bias looks like the start index of the compute
	 * (LS) sampler range — NOTE(review): verify against hw docs. */
	evergreen_emit_raw_value(res, (id + 90)*3);
	/* SQ_TEX_SAMPLER_WORD0: wrap modes, filters, border color type. */
	evergreen_emit_raw_value(res,
		S_03C000_CLAMP_X(r600_tex_wrap(sampler->state.wrap_s)) |
		S_03C000_CLAMP_Y(r600_tex_wrap(sampler->state.wrap_t)) |
		S_03C000_CLAMP_Z(r600_tex_wrap(sampler->state.wrap_r)) |
		S_03C000_XY_MAG_FILTER(r600_tex_filter(sampler->state.mag_img_filter) | aniso_flag_offset) |
		S_03C000_XY_MIN_FILTER(r600_tex_filter(sampler->state.min_img_filter) | aniso_flag_offset) |
		S_03C000_BORDER_COLOR_TYPE(V_03C000_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK)
		);
	/* SQ_TEX_SAMPLER_WORD1: min/max LOD, clamped to [0,15] in 4.8 fixed point. */
	evergreen_emit_raw_value(res,
		S_03C004_MIN_LOD(S_FIXED(CLAMP(sampler->state.min_lod, 0, 15), 8)) |
		S_03C004_MAX_LOD(S_FIXED(CLAMP(sampler->state.max_lod, 0, 15), 8))
		);
	/* SQ_TEX_SAMPLER_WORD2: LOD bias in [-16,16] fixed point, cube wrap
	 * behavior, and TYPE=1 (enabled). */
	evergreen_emit_raw_value(res,
		S_03C008_LOD_BIAS(S_FIXED(CLAMP(sampler->state.lod_bias, -16, 16), 8)) |
		(sampler->state.seamless_cube_map ? 0 : S_03C008_DISABLE_CUBE_WRAP(1)) |
		S_03C008_TYPE(1)
		);
}
/**
 * Bind a buffer as an ALU constant cache for the compute (LS) stage.
 *
 * Programs the size and base-address registers for constant buffer
 * \p cache_id and records \p cbo so a relocation + SH cache flush is
 * emitted when the state is flushed.
 *
 * \param cache_id  constant buffer slot (must be < 16)
 * \param size      size value written to the SIZE register; asserted
 *                  < 0x200 — NOTE(review): units (dwords vs. 16-dword
 *                  blocks) not evident from this code, confirm.
 * \param offset    byte offset of the constants within \p cbo; must be
 *                  256-byte aligned (the register holds offset >> 8).
 */
void evergreen_set_const_cache(
	struct r600_pipe_compute *pipe,
	int cache_id,
	struct r600_resource* cbo,
	int size, int offset)
{
	/* LS-stage constant cache registers; slot N lives at base + N*4. */
	#define SQ_ALU_CONST_BUFFER_SIZE_LS_0 0x00028fc0
	#define SQ_ALU_CONST_CACHE_LS_0 0x00028f40

	struct evergreen_compute_resource* res =
		get_empty_res(pipe, COMPUTE_RESOURCE_CONST_MEM, cache_id);

	assert(size < 0x200);
	assert((offset & 0xFF) == 0); /* base register is in 256-byte units */
	assert(cache_id < 16);

	evergreen_reg_set(res, SQ_ALU_CONST_BUFFER_SIZE_LS_0 + cache_id*4, size);
	evergreen_reg_set(res, SQ_ALU_CONST_CACHE_LS_0 + cache_id*4, offset >> 8);

	res->bo = cbo;
	res->usage = RADEON_USAGE_READ;
	res->coher_bo_size = size;
	res->flags = COMPUTE_RES_SH_FLUSH;
}
/**
 * Allocate a VRAM buffer of \p size bytes for internal compute use.
 * Thin wrapper around pipe_buffer_create() using the CUSTOM bind flag.
 */
struct r600_resource* r600_compute_buffer_alloc_vram(
	struct r600_screen *screen,
	unsigned size)
{
	struct pipe_resource *buf;

	assert(size);

	buf = pipe_buffer_create((struct pipe_screen*) screen,
				 PIPE_BIND_CUSTOM,
				 PIPE_USAGE_IMMUTABLE,
				 size);

	return (struct r600_resource *)buf;
}

View file

@ -0,0 +1,119 @@
/*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* Authors:
* Adam Rak <adam.rak@streamnovation.com>
*/
#ifndef EVERGREEN_COMPUTE_INTERNAL_H
#define EVERGREEN_COMPUTE_INTERNAL_H
#include "compute_memory_pool.h"
/* Resource-slot identifiers, generated X-macro style from
 * compute_resource.def: each DECL_COMPUTE_RESOURCE(name, n) entry
 * becomes a COMPUTE_RESOURCE_<name> enumerator.
 * NOTE(review): __COMPUTE_RESOURCE_END__ uses a double-underscore
 * identifier, which is reserved for the implementation. */
enum evergreen_compute_resources
{
#define DECL_COMPUTE_RESOURCE(name, n) COMPUTE_RESOURCE_ ## name ,
#include "compute_resource.def"
#undef DECL_COMPUTE_RESOURCE
__COMPUTE_RESOURCE_END__
};
typedef unsigned u32;

/* Cache-flush request flags stored in evergreen_compute_resource::flags.
 * TC = texture cache, VC = vertex cache, SH = shader (constant) cache,
 * CB = color buffer cache (the CB index is carried in bits 8+). */
#define COMPUTE_RES_TC_FLUSH      0xF0001
#define COMPUTE_RES_VC_FLUSH      0xF0002
#define COMPUTE_RES_SH_FLUSH      0xF0004
/* Bug fix: the argument must be parenthesized — with the old expansion
 * `(0xF0008 | x << 8)`, an argument such as `a | b` expanded to
 * `0xF0008 | a | (b << 8)` instead of `0xF0008 | ((a | b) << 8)`. */
#define COMPUTE_RES_CB_FLUSH(x)   (0xF0008 | ((x) << 8))
#define COMPUTE_RES_FULL_FLUSH    0xF0010
/* One entry of a compute shader's resource table: a pre-built run of
 * command-stream dwords plus the buffer they reference and the cache
 * flush that must accompany it. */
struct evergreen_compute_resource {
	int enabled;          /* slot in use */
	int do_reloc[256];    /* per-dword: emit a relocation for cs[i] — TODO confirm index semantics */
	u32 cs[256];          /* raw command-stream dwords for this resource */
	int cs_end;           /* number of valid dwords in cs[] */
	struct r600_resource *bo;  /* buffer referenced by the packets (may be NULL) */
	int coher_bo_size;    /* byte range of bo that must be made coherent */
	enum radeon_bo_usage usage;
	int flags; ///flags for COMPUTE_RES_*_FLUSH
};

/* Gallium sampler state paired with the r600 pipe state it generates. */
struct compute_sampler_state {
	struct r600_pipe_state base;
	struct pipe_sampler_state state;
};

/* Hardware format triple used when translating a resource's format. */
struct number_type_and_format {
	unsigned format;
	unsigned number_type;
	unsigned num_format_all;
};

/* A compiled compute shader and the state needed to dispatch it. */
struct r600_pipe_compute {
	struct r600_context *ctx;
	struct r600_bytecode bc;
	struct tgsi_token *tokens;
	struct evergreen_compute_resource *resources;
	unsigned local_size;    /* bytes of local (LDS) memory — TODO confirm units */
	unsigned private_size;
	unsigned input_size;    /* bytes of kernel input arguments */
#ifdef HAVE_OPENCL
	LLVMModuleRef mod;      /* LLVM IR module the kernel was compiled from */
#endif
	struct r600_resource *kernel_param;   /* buffer holding kernel arguments */
	struct r600_resource *shader_code_bo; /* buffer holding compiled bytecode */
};
int evergreen_compute_get_gpu_format(struct number_type_and_format* fmt, struct r600_resource *bo); ///get hw format from resource, return 0 on failure, nonzero on success
void evergreen_emit_raw_reg_set(struct evergreen_compute_resource* res, unsigned index, int num);
void evergreen_emit_ctx_reg_set(struct r600_context *ctx, unsigned index, int num);
void evergreen_emit_raw_value(struct evergreen_compute_resource* res, unsigned value);
void evergreen_emit_ctx_value(struct r600_context *ctx, unsigned value);
void evergreen_mult_reg_set_(struct evergreen_compute_resource* res, int index, u32* array, int size);
void evergreen_emit_ctx_reloc(struct r600_context *ctx, struct r600_resource *bo, enum radeon_bo_usage usage);
void evergreen_reg_set(struct evergreen_compute_resource* res, unsigned index, unsigned value);
void evergreen_emit_force_reloc(struct evergreen_compute_resource* res);
void evergreen_set_buffer_sync(struct r600_context *ctx, struct r600_resource* bo, int size, int flags, enum radeon_bo_usage usage);
struct evergreen_compute_resource* get_empty_res(struct r600_pipe_compute*, enum evergreen_compute_resources res_code, int index);
int get_compute_resource_num(void);
/* NOTE(review): sizeof(array) yields a BYTE count and is only correct when
 * `array` is a true array (not a pointer parameter, which would give the
 * pointer size). Confirm evergreen_mult_reg_set_() expects bytes, not a
 * dword count. */
#define evergreen_mult_reg_set(res, index, array) evergreen_mult_reg_set_(res, index, array, sizeof(array))
void evergreen_set_rat(struct r600_pipe_compute *pipe, int id, struct r600_resource* bo, int start, int size);
void evergreen_set_lds(struct r600_pipe_compute *pipe, int num_lds, int size, int num_waves);
void evergreen_set_gds(struct r600_pipe_compute *pipe, uint32_t addr, uint32_t size);
void evergreen_set_export(struct r600_pipe_compute *pipe, struct r600_resource* bo, int offset, int size);
void evergreen_set_loop_const(struct r600_pipe_compute *pipe, int id, int count, int init, int inc);
void evergreen_set_tmp_ring(struct r600_pipe_compute *pipe, struct r600_resource* bo, int offset, int size, int se);
void evergreen_set_vtx_resource(struct r600_pipe_compute *pipe, struct r600_resource* bo, int id, uint64_t offset, int writable);
void evergreen_set_tex_resource(struct r600_pipe_compute *pipe, struct r600_pipe_sampler_view* view, int id);
void evergreen_set_sampler_resource(struct r600_pipe_compute *pipe, struct compute_sampler_state *sampler, int id);
void evergreen_set_const_cache(struct r600_pipe_compute *pipe, int cache_id, struct r600_resource* cbo, int size, int offset);
struct r600_resource* r600_compute_buffer_alloc_vram(struct r600_screen *screen, unsigned size);
#endif

View file

@ -28,6 +28,7 @@
#include "util/u_memory.h"
#include "util/u_framebuffer.h"
#include "util/u_dual_blend.h"
#include "evergreen_compute.h"
static uint32_t eg_num_banks(uint32_t nbanks)
{
@ -1881,6 +1882,7 @@ void evergreen_init_state_functions(struct r600_context *rctx)
rctx->context.create_stream_output_target = r600_create_so_target;
rctx->context.stream_output_target_destroy = r600_so_target_destroy;
rctx->context.set_stream_output_targets = r600_set_so_targets;
evergreen_init_compute_state_functions(rctx);
}
static void cayman_init_atom_start_cs(struct r600_context *rctx)

View file

@ -61,6 +61,8 @@
#define R600_TEXEL_PITCH_ALIGNMENT_MASK 0x7
#define PKT3_NOP 0x10
#define PKT3_DISPATCH_DIRECT 0x15
#define PKT3_DISPATCH_INDIRECT 0x16
#define PKT3_INDIRECT_BUFFER_END 0x17
#define PKT3_SET_PREDICATION 0x20
#define PKT3_REG_RMW 0x21
@ -114,6 +116,11 @@
#define PKT3_PREDICATE(x) (((x) >> 0) & 0x1)
#define PKT0(index, count) (PKT_TYPE_S(0) | PKT0_BASE_INDEX_S(index) | PKT_COUNT_S(count))
#define RADEON_CP_PACKET3_COMPUTE_MODE 0x00000002
/*Evergreen Compute packet3*/
#define PKT3C(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PREDICATE(predicate) | RADEON_CP_PACKET3_COMPUTE_MODE)
/* Registers */
#define R_0084FC_CP_STRMOUT_CNTL 0x000084FC
#define S_0084FC_OFFSET_UPDATE_DONE(x) (((x) & 0x1) << 0)
@ -241,6 +248,15 @@
#define G_008CF0_ALU_UPDATE_FIFO_HIWATER(x) (((x) >> 24) & 0x1F)
#define C_008CF0_ALU_UPDATE_FIFO_HIWATER(x) 0xE0FFFFFF
#define R_008E20_SQ_STATIC_THREAD_MGMT1 0x8E20
#define R_008E24_SQ_STATIC_THREAD_MGMT2 0x8E24
#define R_008E28_SQ_STATIC_THREAD_MGMT3 0x8E28
#define R_00899C_VGT_COMPUTE_START_X 0x0000899C
#define R_0089A0_VGT_COMPUTE_START_Y 0x000089A0
#define R_0089A4_VGT_COMPUTE_START_Z 0x000089A4
#define R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE 0x000089AC
#define R_009100_SPI_CONFIG_CNTL 0x00009100
#define R_00913C_SPI_CONFIG_CNTL_1 0x0000913C
#define S_00913C_VTX_DONE_DELAY(x) (((x) & 0xF) << 0)
@ -397,6 +413,11 @@
#define G_028410_ALPHA_TEST_BYPASS(x) (((x) >> 8) & 0x1)
#define C_028410_ALPHA_TEST_BYPASS 0xFFFFFEFF
#define R_0286EC_SPI_COMPUTE_NUM_THREAD_X 0x0286EC
#define R_0286F0_SPI_COMPUTE_NUM_THREAD_Y 0x0286F0
#define R_0286F4_SPI_COMPUTE_NUM_THREAD_Z 0x0286F4
#define R_028B74_VGT_DISPATCH_INITIATOR 0x028B74
#define R_028800_DB_DEPTH_CONTROL 0x028800
#define S_028800_STENCIL_ENABLE(x) (((x) & 0x1) << 0)
#define G_028800_STENCIL_ENABLE(x) (((x) >> 0) & 0x1)
@ -747,6 +768,8 @@
#define S_028A40_CUT_MODE(x) (((x) & 0x3) << 3)
#define G_028A40_CUT_MODE(x) (((x) >> 3) & 0x3)
#define C_028A40_CUT_MODE 0xFFFFFFE7
#define S_028A40_COMPUTE_MODE(x) (x << 14)
#define S_028A40_PARTIAL_THD_AT_EOI(x) (x << 17)
#define R_028A6C_VGT_GS_OUT_PRIM_TYPE 0x028A6C
#define S_028A6C_OUTPRIM_TYPE(x) (((x) & 0x3F) << 0)
#define V_028A6C_OUTPRIM_TYPE_POINTLIST 0
@ -1434,6 +1457,50 @@
#define G_028848_ALLOW_DOUBLE_DENORM_OUT(x) (((x) >> 7) & 0x1)
#define C_028848_ALLOW_DOUBLE_DENORM_OUT 0xFFFFFF7F
#define R_0288D4_SQ_PGM_RESOURCES_LS 0x0288d4
#define S_0288D4_NUM_GPRS(x) (((x) & 0xFF) << 0)
#define G_0288D4_NUM_GPRS(x) (((x) >> 0) & 0xFF)
#define C_0288D4_NUM_GPRS 0xFFFFFF00
#define S_0288D4_STACK_SIZE(x) (((x) & 0xFF) << 8)
#define G_0288D4_STACK_SIZE(x) (((x) >> 8) & 0xFF)
#define C_0288D4_STACK_SIZE 0xFFFF00FF
#define S_0288D4_DX10_CLAMP(x) (((x) & 0x1) << 21)
#define G_0288D4_DX10_CLAMP(x) (((x) >> 21) & 0x1)
#define C_0288D4_DX10_CLAMP 0xFFDFFFFF
#define S_0288D4_PRIME_CACHE_ON_DRAW(x) (((x) & 0x1) << 23)
#define G_0288D4_PRIME_CACHE_ON_DRAW(x) (((x) >> 23) & 0x1)
#define S_0288D4_UNCACHED_FIRST_INST(x) (((x) & 0x1) << 28)
#define G_0288D4_UNCACHED_FIRST_INST(x) (((x) >> 28) & 0x1)
#define C_0288D4_UNCACHED_FIRST_INST 0xEFFFFFFF
#define S_0288D4_CLAMP_CONSTS(x) (((x) & 0x1) << 31)
#define G_0288D4_CLAMP_CONSTS(x) (((x) >> 31) & 0x1)
#define C_0288D4_CLAMP_CONSTS 0x7FFFFFFF
#define R_0288D8_SQ_PGM_RESOURCES_LS_2 0x0288d8
#define R_0288D4_SQ_PGM_RESOURCES_LS 0x0288d4
#define S_0288D4_NUM_GPRS(x) (((x) & 0xFF) << 0)
#define G_0288D4_NUM_GPRS(x) (((x) >> 0) & 0xFF)
#define C_0288D4_NUM_GPRS 0xFFFFFF00
#define S_0288D4_STACK_SIZE(x) (((x) & 0xFF) << 8)
#define G_0288D4_STACK_SIZE(x) (((x) >> 8) & 0xFF)
#define C_0288D4_STACK_SIZE 0xFFFF00FF
#define S_0288D4_DX10_CLAMP(x) (((x) & 0x1) << 21)
#define G_0288D4_DX10_CLAMP(x) (((x) >> 21) & 0x1)
#define C_0288D4_DX10_CLAMP 0xFFDFFFFF
#define S_0288D4_PRIME_CACHE_ON_DRAW(x) (((x) & 0x1) << 23)
#define G_0288D4_PRIME_CACHE_ON_DRAW(x) (((x) >> 23) & 0x1)
#define S_0288D4_UNCACHED_FIRST_INST(x) (((x) & 0x1) << 28)
#define G_0288D4_UNCACHED_FIRST_INST(x) (((x) >> 28) & 0x1)
#define C_0288D4_UNCACHED_FIRST_INST 0xEFFFFFFF
#define S_0288D4_CLAMP_CONSTS(x) (((x) & 0x1) << 31)
#define G_0288D4_CLAMP_CONSTS(x) (((x) >> 31) & 0x1)
#define C_0288D4_CLAMP_CONSTS 0x7FFFFFFF
#define R_0288D8_SQ_PGM_RESOURCES_LS_2 0x0288d8
#define R_028644_SPI_PS_INPUT_CNTL_0 0x028644
#define S_028644_SEMANTIC(x) (((x) & 0xFF) << 0)
#define G_028644_SEMANTIC(x) (((x) >> 0) & 0xFF)
@ -1710,6 +1777,12 @@
#define R_0286DC_SPI_FOG_CNTL 0x000286DC
#define R_0286E4_SPI_PS_IN_CONTROL_2 0x000286E4
#define R_0286E8_SPI_COMPUTE_INPUT_CNTL 0x000286E8
#define S_0286E8_TID_IN_GROUP_ENA 1
#define S_0286E8_TGID_ENA 2
#define S_0286E8_DISABLE_INDEX_PACK 4
#define R_028720_GDS_ADDR_BASE 0x00028720
#define R_028724_GDS_ADDR_SIZE 0x00028724
#define R_028728_GDS_ORDERED_WAVE_PER_SE 0x00028728
#define R_028784_CB_BLEND1_CONTROL 0x00028784
#define R_028788_CB_BLEND2_CONTROL 0x00028788
#define R_02878C_CB_BLEND3_CONTROL 0x0002878C
@ -1736,6 +1809,7 @@
#define C_02884C_EXPORT_Z 0xFFFFFFFE
#define R_02885C_SQ_PGM_START_VS 0x0002885C
#define R_0288A4_SQ_PGM_START_FS 0x000288A4
#define R_0288D0_SQ_PGM_START_LS 0x000288d0
#define R_0288A8_SQ_PGM_RESOURCES_FS 0x000288A8
#define R_0288EC_SQ_LDS_ALLOC_PS 0x000288EC
#define R_028900_SQ_ESGS_RING_ITEMSIZE 0x00028900

View file

@ -0,0 +1,19 @@
#include <llvm/ADT/OwningPtr.h>
#include <llvm/ADT/StringRef.h>
#include <llvm/LLVMContext.h>
#include <llvm/Support/IRReader.h>
#include <llvm/Support/MemoryBuffer.h>
#include <llvm/Support/SourceMgr.h>
#include "llvm_wrapper.h"
/* Parse an LLVM IR/bitcode blob into an LLVMModuleRef usable from C.
 * The input bytes are copied, so the caller keeps ownership of `bitcode`.
 * Returns NULL (wrapped) if parsing fails; the diagnostic in `Err` is
 * discarded — NOTE(review): consider reporting it to the caller.
 * ParseIR takes ownership of the MemoryBuffer (LLVM 3.1 API) — TODO
 * confirm, a double-free would result otherwise. */
extern "C" LLVMModuleRef llvm_parse_bitcode(const unsigned char * bitcode, unsigned bitcode_len)
{
	llvm::OwningPtr<llvm::Module> M;
	llvm::StringRef str((const char*)bitcode, bitcode_len);
	llvm::MemoryBuffer* buffer = llvm::MemoryBuffer::getMemBufferCopy(str);
	llvm::SMDiagnostic Err;
	M.reset(llvm::ParseIR(buffer, Err, llvm::getGlobalContext()));
	/* take() releases ownership from the OwningPtr so the module
	 * survives this scope; the C caller must dispose of it. */
	return wrap(M.take());
}

View file

@ -0,0 +1,16 @@
#ifndef LLVM_WRAPPER_H
#define LLVM_WRAPPER_H

#include <llvm-c/Core.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Parse an LLVM IR/bitcode blob into a module handle. The input is
 * copied; the caller retains ownership of `bitcode` and must dispose
 * of the returned module. Returns NULL on parse failure. */
LLVMModuleRef llvm_parse_bitcode(const unsigned char * bitcode, unsigned bitcode_len);

#ifdef __cplusplus
}
#endif

#endif

View file

@ -2,7 +2,7 @@
#ifndef R600_LLVM_H
#define R600_LLVM_H
#ifdef R600_USE_LLVM
#if defined R600_USE_LLVM || defined HAVE_OPENCL
#include "radeon_llvm.h"
#include <llvm-c/Core.h>
@ -24,6 +24,6 @@ unsigned r600_llvm_compile(
enum radeon_family family,
unsigned dump);
#endif /* R600_USE_LLVM */
#endif /* defined R600_USE_LLVM || defined HAVE_OPENCL */
#endif /* R600_LLVM_H */

View file

@ -382,6 +382,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
case PIPE_CAP_USER_INDEX_BUFFERS:
case PIPE_CAP_USER_CONSTANT_BUFFERS:
case PIPE_CAP_COMPUTE:
return 1;
case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
@ -409,7 +410,6 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
case PIPE_CAP_VERTEX_COLOR_CLAMPED:
case PIPE_CAP_USER_VERTEX_BUFFERS:
case PIPE_CAP_COMPUTE:
return 0;
/* Stream output. */
@ -491,6 +491,7 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
{
case PIPE_SHADER_FRAGMENT:
case PIPE_SHADER_VERTEX:
case PIPE_SHADER_COMPUTE:
break;
case PIPE_SHADER_GEOMETRY:
/* XXX: support and enable geometry programs */
@ -538,8 +539,12 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
return rscreen->glsl_feature_level >= 130;
case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
return 16;
case PIPE_SHADER_CAP_PREFERRED_IR:
return PIPE_SHADER_IR_TGSI;
case PIPE_SHADER_CAP_PREFERRED_IR:
if (shader == PIPE_SHADER_COMPUTE) {
return PIPE_SHADER_IR_LLVM;
} else {
return PIPE_SHADER_IR_TGSI;
}
}
return 0;
}
@ -569,6 +574,81 @@ static int r600_get_video_param(struct pipe_screen *screen,
}
}
/**
 * pipe_screen::get_compute_param implementation.
 *
 * Writes the requested capability value into *ret (when ret != NULL)
 * and returns the number of bytes the capability occupies; returns 0
 * for unknown capabilities.
 */
static int r600_get_compute_param(struct pipe_screen *screen,
        enum pipe_compute_cap param,
        void *ret)
{
	uint64_t *out = ret;

	//TODO: select these params by asic
	switch (param) {
	case PIPE_COMPUTE_CAP_IR_TARGET:
		if (ret) {
			strcpy(ret, "r600--");
		}
		return 7 * sizeof(char); /* "r600--" + NUL */

	case PIPE_COMPUTE_CAP_GRID_DIMENSION:
		if (out) {
			out[0] = 3;
		}
		return 1 * sizeof(uint64_t);

	case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
		if (out) {
			out[0] = 65535;
			out[1] = 65535;
			out[2] = 1;
		}
		return 3 * sizeof(uint64_t);

	case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
		if (out) {
			out[0] = 256;
			out[1] = 256;
			out[2] = 256;
		}
		return 3 * sizeof(uint64_t);

	case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
		if (out) {
			out[0] = 256;
		}
		return sizeof(uint64_t);

	case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
		if (out) {
			/* XXX: This is what the proprietary driver reports, we
			 * may want to use a different value. */
			out[0] = 201326592;
		}
		return sizeof(uint64_t);

	case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
		if (out) {
			out[0] = 1024;
		}
		return sizeof(uint64_t);

	case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
		if (out) {
			/* XXX: This is what the proprietary driver reports, we
			 * may want to use a different value. */
			out[0] = 32768;
		}
		return sizeof(uint64_t);

	default:
		fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param);
		return 0;
	}
}
static void r600_destroy_screen(struct pipe_screen* pscreen)
{
struct r600_screen *rscreen = (struct r600_screen *)pscreen;
@ -576,6 +656,10 @@ static void r600_destroy_screen(struct pipe_screen* pscreen)
if (rscreen == NULL)
return;
if (rscreen->global_pool) {
compute_memory_pool_delete(rscreen->global_pool);
}
if (rscreen->fences.bo) {
struct r600_fence_block *entry, *tmp;
@ -833,6 +917,8 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws)
rscreen->screen.get_shader_param = r600_get_shader_param;
rscreen->screen.get_paramf = r600_get_paramf;
rscreen->screen.get_video_param = r600_get_video_param;
rscreen->screen.get_compute_param = r600_get_compute_param;
if (rscreen->chip_class >= EVERGREEN) {
rscreen->screen.is_format_supported = evergreen_is_format_supported;
} else {
@ -857,5 +943,7 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws)
rscreen->use_surface_alloc = debug_get_bool_option("R600_SURF", TRUE);
rscreen->glsl_feature_level = debug_get_bool_option("R600_GLSL130", TRUE) ? 130 : 120;
rscreen->global_pool = compute_memory_pool_new(1024*16, rscreen);
return &rscreen->screen;
}

View file

@ -28,8 +28,11 @@
#include "util/u_slab.h"
#include "r600.h"
#include "r600_llvm.h"
#include "r600_public.h"
#include "r600_shader.h"
#include "r600_resource.h"
#include "evergreen_compute.h"
#define R600_MAX_CONST_BUFFERS 2
#define R600_MAX_CONST_BUFFER_SIZE 4096
@ -98,9 +101,16 @@ enum r600_pipe_state_id {
R600_PIPE_STATE_RESOURCE,
R600_PIPE_STATE_POLYGON_OFFSET,
R600_PIPE_STATE_FETCH_SHADER,
R600_PIPE_STATE_SPI,
R600_PIPE_NSTATES
};
struct compute_memory_pool;
void compute_memory_pool_delete(struct compute_memory_pool* pool);
struct compute_memory_pool* compute_memory_pool_new(
int64_t initial_size_in_dw,
struct r600_screen *rscreen);
struct r600_pipe_fences {
struct r600_resource *bo;
unsigned *data;
@ -123,6 +133,12 @@ struct r600_screen {
bool use_surface_alloc;
int glsl_feature_level;
/*for compute global memory binding, we allocate stuff here, instead of
* buffers.
* XXX: Not sure if this is the best place for global_pool. Also,
* it's not thread safe, so it won't work with multiple contexts. */
struct compute_memory_pool *global_pool;
};
struct r600_pipe_sampler_view {
@ -257,6 +273,7 @@ struct r600_context {
struct pipe_clip_state clip;
struct r600_pipe_shader *ps_shader;
struct r600_pipe_shader *vs_shader;
struct r600_pipe_compute *cs_shader;
struct r600_pipe_rasterizer *rasterizer;
struct r600_pipe_state vgt;
struct r600_pipe_state spi;
@ -266,7 +283,9 @@ struct r600_context {
unsigned saved_render_cond_mode;
/* shader information */
boolean two_side;
boolean spi_dirty;
unsigned sprite_coord_enable;
boolean flatshade;
boolean export_16bpc;
unsigned alpha_ref;
boolean alpha_ref_dirty;
@ -412,6 +431,10 @@ void r600_init_context_resource_functions(struct r600_context *r600);
/* r600_shader.c */
int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_pipe_shader *shader);
#ifdef HAVE_OPENCL
int r600_compute_shader_create(struct pipe_context * ctx,
LLVMModuleRef mod, struct r600_bytecode * bytecode);
#endif
void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader);
int r600_find_vs_semantic_index(struct r600_shader *vs,
struct r600_shader *ps, int id);

View file

@ -27,7 +27,12 @@ static struct pipe_resource *r600_resource_create(struct pipe_screen *screen,
const struct pipe_resource *templ)
{
if (templ->target == PIPE_BUFFER) {
return r600_buffer_create(screen, templ);
if (templ->bind & PIPE_BIND_GLOBAL) {
return r600_compute_global_buffer_create(screen, templ);
}
else {
return r600_buffer_create(screen, templ);
}
} else {
return r600_texture_create(screen, templ);
}
@ -44,12 +49,21 @@ static struct pipe_resource *r600_resource_from_handle(struct pipe_screen * scre
}
}
void r600_resource_destroy(struct pipe_screen *screen, struct pipe_resource *res)
{
if (res->target == PIPE_BUFFER && (res->bind & PIPE_BIND_GLOBAL)) {
r600_compute_global_buffer_destroy(screen, res);
} else {
u_resource_destroy_vtbl(screen, res);
}
}
void r600_init_screen_resource_functions(struct pipe_screen *screen)
{
screen->resource_create = r600_resource_create;
screen->resource_from_handle = r600_resource_from_handle;
screen->resource_get_handle = u_resource_get_handle_vtbl;
screen->resource_destroy = u_resource_destroy_vtbl;
screen->resource_destroy = r600_resource_destroy;
}
void r600_init_context_resource_functions(struct r600_context *r600)

View file

@ -34,6 +34,13 @@ struct r600_transfer {
unsigned offset;
};
struct compute_memory_item;
struct r600_resource_global {
struct r600_resource base;
struct compute_memory_item *chunk;
};
struct r600_resource_texture {
struct r600_resource resource;
@ -65,6 +72,7 @@ struct r600_surface {
unsigned aligned_height;
};
void r600_resource_destroy(struct pipe_screen *screen, struct pipe_resource *res);
void r600_init_screen_resource_functions(struct pipe_screen *screen);
/* r600_texture */

View file

@ -225,6 +225,37 @@ static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
* struct r600_bytecode.
*/
static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
unsigned char * bytes, unsigned num_bytes);
#ifdef HAVE_OPENCL
/**
 * Compile an LLVM module into r600 bytecode for a compute shader.
 *
 * Runs the LLVM backend to produce a serialized byte stream, then
 * deserializes it into \p bytecode and builds the final binary.
 * Set R600_DUMP_SHADERS=1 in the environment to dump the result.
 *
 * Returns 1 unconditionally.
 * NOTE(review): the return values of r600_llvm_compile() and
 * r600_bytecode_build() are ignored, so compile failures are silent;
 * also `bytes` (allocated by the compile step — TODO confirm who owns
 * it) does not appear to be freed here.
 */
int r600_compute_shader_create(struct pipe_context * ctx,
	LLVMModuleRef mod,  struct r600_bytecode * bytecode)
{
	struct r600_context *r600_ctx = (struct r600_context *)ctx;
	unsigned char * bytes;          /* serialized bytecode from the LLVM backend */
	unsigned byte_count;
	struct r600_shader_ctx shader_ctx;
	unsigned dump = 0;

	if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
		dump = 1;
	}

	r600_llvm_compile(mod, &bytes, &byte_count, r600_ctx->family , dump);
	shader_ctx.bc = bytecode;
	r600_bytecode_init(shader_ctx.bc, r600_ctx->chip_class, r600_ctx->family);
	shader_ctx.bc->type = TGSI_PROCESSOR_COMPUTE;
	r600_bytecode_from_byte_stream(&shader_ctx, bytes, byte_count);
	r600_bytecode_build(shader_ctx.bc);
	if (dump) {
		r600_bytecode_dump(shader_ctx.bc);
	}
	return 1;
}
#endif /* HAVE_OPENCL */
static unsigned r600_src_from_byte_stream(unsigned char * bytes,
unsigned bytes_read, struct r600_bytecode_alu * alu, unsigned src_idx)
{

View file

@ -916,6 +916,10 @@ void* r600_texture_transfer_map(struct pipe_context *ctx,
unsigned offset = 0;
char *map;
if ((transfer->resource->bind & PIPE_BIND_GLOBAL) && transfer->resource->target == PIPE_BUFFER) {
return r600_compute_global_transfer_map(ctx, transfer);
}
if (rtransfer->staging) {
buf = ((struct r600_resource *)rtransfer->staging)->cs_buf;
} else {
@ -945,6 +949,10 @@ void r600_texture_transfer_unmap(struct pipe_context *ctx,
struct r600_context *rctx = (struct r600_context*)ctx;
struct radeon_winsys_cs_handle *buf;
if ((transfer->resource->bind & PIPE_BIND_GLOBAL) && transfer->resource->target == PIPE_BUFFER) {
return r600_compute_global_transfer_unmap(ctx, transfer);
}
if (rtransfer->staging) {
buf = ((struct r600_resource *)rtransfer->staging)->cs_buf;
} else {