r600g: add async for staging buffer upload v2

v2: Add virtual address to dma src/dst offset for cayman Signed-off-by: Jerome Glisse <jglisse@redhat.com>
2025-12-23 06:50:11 +01:00 · 2013-01-07 17:45:59 -05:00 · 2013-01-07 17:45:59 -05:00 · 325422c494
commit 325422c494
parent bff07638a8
12 changed files with 595 additions and 17 deletions
--- a/src/gallium/drivers/r600/evergreen_hw_context.c
+++ b/src/gallium/drivers/r600/evergreen_hw_context.c
@ -26,6 +26,7 @@
 #include "r600_hw_context_priv.h"
 #include "evergreend.h"
 #include "util/u_memory.h"
+#include "util/u_math.h"

 static const struct r600_reg cayman_config_reg_list[] = {
 	{R_009100_SPI_CONFIG_CNTL, REG_FLAG_ENABLE_ALWAYS | REG_FLAG_FLUSH_CHANGE, 0},
@ -238,3 +239,48 @@ void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_en
 		r600_write_context_reg(cs, R_028B94_VGT_STRMOUT_CONFIG, S_028B94_STREAMOUT_0_EN(0));
 	}
 }
+
+void evergreen_dma_copy(struct r600_context *rctx,
+		struct pipe_resource *dst,
+		struct pipe_resource *src,
+		unsigned long dst_offset,
+		unsigned long src_offset,
+		unsigned long size)
+{
+	struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
+	unsigned i, ncopy, csize, sub_cmd, shift;
+	struct r600_resource *rdst = (struct r600_resource*)dst;
+	struct r600_resource *rsrc = (struct r600_resource*)src;
+
+	/* make sure that the dma ring is only one active */
+	rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC);
+	dst_offset += r600_resource_va(&rctx->screen->screen, dst);
+	src_offset += r600_resource_va(&rctx->screen->screen, src);
+
+	/* see if we use dword or byte copy */
+	if (!(dst_offset & 0x3) && !(src_offset & 0x3) && !(size & 0x3)) {
+		size >>= 2;
+		sub_cmd = 0x00;
+		shift = 2;
+	} else {
+		sub_cmd = 0x40;
+		shift = 0;
+	}
+	ncopy = (size / 0x000fffff) + !!(size % 0x000fffff);
+
+	r600_need_dma_space(rctx, ncopy * 5);
+	for (i = 0; i < ncopy; i++) {
+		csize = size < 0x000fffff ? size : 0x000fffff;
+		/* emit reloc before writting cs so that cs is always in consistent state */
+		r600_context_bo_reloc(rctx, &rctx->rings.dma, rsrc, RADEON_USAGE_READ);
+		r600_context_bo_reloc(rctx, &rctx->rings.dma, rdst, RADEON_USAGE_WRITE);
+		cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, sub_cmd, csize);
+		cs->buf[cs->cdw++] = dst_offset & 0xffffffff;
+		cs->buf[cs->cdw++] = src_offset & 0xffffffff;
+		cs->buf[cs->cdw++] = (dst_offset >> 32UL) & 0xff;
+		cs->buf[cs->cdw++] = (src_offset >> 32UL) & 0xff;
+		dst_offset += csize << shift;
+		src_offset += csize << shift;
+		size -= csize;
+	}
+}
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@ -30,6 +30,20 @@
 #include "util/u_framebuffer.h"
 #include "util/u_dual_blend.h"
 #include "evergreen_compute.h"
+#include "util/u_math.h"
+
+static INLINE unsigned evergreen_array_mode(unsigned mode)
+{
+	switch (mode) {
+	case RADEON_SURF_MODE_LINEAR_ALIGNED:	return V_028C70_ARRAY_LINEAR_ALIGNED;
+		break;
+	case RADEON_SURF_MODE_1D:		return V_028C70_ARRAY_1D_TILED_THIN1;
+		break;
+	case RADEON_SURF_MODE_2D:		return V_028C70_ARRAY_2D_TILED_THIN1;
+	default:
+	case RADEON_SURF_MODE_LINEAR:		return V_028C70_ARRAY_LINEAR_GENERAL;
+	}
+}

 static uint32_t eg_num_banks(uint32_t nbanks)
 {
@ -3445,3 +3459,190 @@ void evergreen_update_db_shader_control(struct r600_context * rctx)
 		rctx->db_misc_state.atom.dirty = true;
 	}
 }
+
+static void evergreen_dma_copy_tile(struct r600_context *rctx,
+				struct pipe_resource *dst,
+				unsigned dst_level,
+				unsigned dst_x,
+				unsigned dst_y,
+				unsigned dst_z,
+				struct pipe_resource *src,
+				unsigned src_level,
+				unsigned src_x,
+				unsigned src_y,
+				unsigned src_z,
+				unsigned copy_height,
+				unsigned pitch,
+				unsigned bpp)
+{
+	struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
+	struct r600_texture *rsrc = (struct r600_texture*)src;
+	struct r600_texture *rdst = (struct r600_texture*)dst;
+	unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size;
+	unsigned ncopy, height, cheight, detile, i, x, y, z, src_mode, dst_mode;
+	unsigned sub_cmd, bank_h, bank_w, mt_aspect, nbanks, tile_split;
+	unsigned long base, addr;
+
+	/* make sure that the dma ring is only one active */
+	rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC);
+
+	dst_mode = rdst->surface.level[dst_level].mode;
+	src_mode = rsrc->surface.level[src_level].mode;
+	/* downcast linear aligned to linear to simplify test */
+	src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
+	dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
+	assert(dst_mode != src_mode);
+
+	y = 0;
+	sub_cmd = 0x8;
+	lbpp = util_logbase2(bpp);
+	pitch_tile_max = ((pitch / bpp) >> 3) - 1;
+	nbanks = eg_num_banks(rctx->screen->tiling_info.num_banks);
+
+	if (dst_mode == RADEON_SURF_MODE_LINEAR) {
+		/* T2L */
+		array_mode = evergreen_array_mode(src_mode);
+		slice_tile_max = (((pitch * rsrc->surface.level[src_level].npix_y) >> 6) / bpp) - 1;
+		/* linear height must be the same as the slice tile max height, it's ok even
+		 * if the linear destination/source have smaller heigh as the size of the
+		 * dma packet will be using the copy_height which is always smaller or equal
+		 * to the linear height
+		 */
+		height = rsrc->surface.level[src_level].npix_y;
+		detile = 1;
+		x = src_x;
+		y = src_y;
+		z = src_z;
+		base = rsrc->surface.level[src_level].offset;
+		addr = rdst->surface.level[dst_level].offset;
+		addr += rdst->surface.level[dst_level].slice_size * dst_z;
+		addr += dst_y * pitch + dst_x * bpp;
+		bank_h = eg_bank_wh(rsrc->surface.bankh);
+		bank_w = eg_bank_wh(rsrc->surface.bankw);
+		mt_aspect = eg_macro_tile_aspect(rsrc->surface.mtilea);
+		tile_split = eg_tile_split(rsrc->surface.tile_split);
+		base += r600_resource_va(&rctx->screen->screen, src);
+		addr += r600_resource_va(&rctx->screen->screen, dst);
+	} else {
+		/* L2T */
+		array_mode = evergreen_array_mode(dst_mode);
+		slice_tile_max = (((pitch * rdst->surface.level[dst_level].npix_y) >> 6) / bpp) - 1;
+		/* linear height must be the same as the slice tile max height, it's ok even
+		 * if the linear destination/source have smaller heigh as the size of the
+		 * dma packet will be using the copy_height which is always smaller or equal
+		 * to the linear height
+		 */
+		height = rdst->surface.level[dst_level].npix_y;
+		detile = 0;
+		x = dst_x;
+		y = dst_y;
+		z = dst_z;
+		base = rdst->surface.level[dst_level].offset;
+		addr = rsrc->surface.level[src_level].offset;
+		addr += rsrc->surface.level[src_level].slice_size * src_z;
+		addr += src_y * pitch + src_x * bpp;
+		bank_h = eg_bank_wh(rdst->surface.bankh);
+		bank_w = eg_bank_wh(rdst->surface.bankw);
+		mt_aspect = eg_macro_tile_aspect(rdst->surface.mtilea);
+		tile_split = eg_tile_split(rdst->surface.tile_split);
+		base += r600_resource_va(&rctx->screen->screen, dst);
+		addr += r600_resource_va(&rctx->screen->screen, src);
+	}
+
+	size = (copy_height * pitch) >> 2;
+	ncopy = (size / 0x000fffff) + !!(size % 0x000fffff);
+	r600_need_dma_space(rctx, ncopy * 9);
+
+	for (i = 0; i < ncopy; i++) {
+		cheight = copy_height;
+		if (((cheight * pitch) >> 2) > 0x000fffff) {
+			cheight = (0x000fffff << 2) / pitch;
+		}
+		size = (cheight * pitch) >> 2;
+		/* emit reloc before writting cs so that cs is always in consistent state */
+		r600_context_bo_reloc(rctx, &rctx->rings.dma, &rsrc->resource, RADEON_USAGE_READ);
+		r600_context_bo_reloc(rctx, &rctx->rings.dma, &rdst->resource, RADEON_USAGE_WRITE);
+		cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, sub_cmd, size);
+		cs->buf[cs->cdw++] = base >> 8;
+		cs->buf[cs->cdw++] = (detile << 31) | (array_mode << 27) |
+					(lbpp << 24) | (bank_h << 21) |
+					(bank_w << 18) | (mt_aspect << 16);
+		cs->buf[cs->cdw++] = (pitch_tile_max << 0) | ((height - 1) << 16);
+		cs->buf[cs->cdw++] = (slice_tile_max << 0);
+		cs->buf[cs->cdw++] = (x << 0) | (z << 18);
+		cs->buf[cs->cdw++] = (y << 0) | (tile_split << 21) | (nbanks << 25);
+		cs->buf[cs->cdw++] = addr & 0xfffffffc;
+		cs->buf[cs->cdw++] = (addr >> 32UL) & 0xff;
+		copy_height -= cheight;
+		addr += cheight * pitch;
+		y += cheight;
+	}
+}
+
+boolean evergreen_dma_blit(struct pipe_context *ctx,
+			struct pipe_resource *dst,
+			unsigned dst_level,
+			unsigned dst_x, unsigned dst_y, unsigned dst_z,
+			struct pipe_resource *src,
+			unsigned src_level,
+			const struct pipe_box *src_box)
+{
+	struct r600_context *rctx = (struct r600_context *)ctx;
+	struct r600_texture *rsrc = (struct r600_texture*)src;
+	struct r600_texture *rdst = (struct r600_texture*)dst;
+	unsigned dst_pitch, src_pitch, bpp, dst_mode, src_mode, copy_height;
+	unsigned src_w, dst_w;
+
+	if (rctx->rings.dma.cs == NULL) {
+		return FALSE;
+	}
+	if (src->format != dst->format) {
+		return FALSE;
+	}
+
+	bpp = rdst->surface.bpe;
+	dst_pitch = rdst->surface.level[dst_level].pitch_bytes;
+	src_pitch = rsrc->surface.level[src_level].pitch_bytes;
+	src_w = rsrc->surface.level[src_level].npix_x;
+	dst_w = rdst->surface.level[dst_level].npix_x;
+	copy_height = src_box->height / rsrc->surface.blk_h;
+
+	dst_mode = rdst->surface.level[dst_level].mode;
+	src_mode = rsrc->surface.level[src_level].mode;
+	/* downcast linear aligned to linear to simplify test */
+	src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
+	dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
+
+	if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w) {
+		/* FIXME evergreen can do partial blit */
+		return FALSE;
+	}
+	/* the x test here are currently useless (because we don't support partial blit)
+	 * but keep them around so we don't forget about those
+	 */
+	if ((src_pitch & 0x7) || (src_box->x & 0x7) || (dst_x & 0x7) || (src_box->y & 0x7) || (dst_y & 0x7)) {
+		return FALSE;
+	}
+
+	if (src_mode == dst_mode) {
+		unsigned long dst_offset, src_offset;
+		/* simple dma blit would do NOTE code here assume :
+		 *   src_box.x/y == 0
+		 *   dst_x/y == 0
+		 *   dst_pitch == src_pitch
+		 */
+		src_offset= rsrc->surface.level[src_level].offset;
+		src_offset += rsrc->surface.level[src_level].slice_size * src_box->z;
+		src_offset += src_box->y * src_pitch + src_box->x * bpp;
+		dst_offset = rdst->surface.level[dst_level].offset;
+		dst_offset += rdst->surface.level[dst_level].slice_size * dst_z;
+		dst_offset += dst_y * dst_pitch + dst_x * bpp;
+		evergreen_dma_copy(rctx, dst, src, dst_offset, src_offset,
+					src_box->height * src_pitch);
+	} else {
+		evergreen_dma_copy_tile(rctx, dst, dst_level, dst_x, dst_y, dst_z,
+					src, src_level, src_box->x, src_box->y, src_box->z,
+					copy_height, dst_pitch, bpp);
+	}
+	return TRUE;
+}
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@ -2317,4 +2317,19 @@
 #define   G_028AA8_SWITCH_ON_EOP(x)                    (((x) >> 17) & 0x1)
 #define   C_028AA8_SWITCH_ON_EOP                       0xFFFDFFFF

+/* async DMA packets */
+#define DMA_PACKET(cmd, sub_cmd, n) ((((cmd) & 0xF) << 28) |    \
+                                    (((sub_cmd) & 0xFF) << 20) |\
+                                    (((n) & 0xFFFFF) << 0))
+/* async DMA Packet types */
+#define    DMA_PACKET_WRITE                     0x2
+#define    DMA_PACKET_COPY                      0x3
+#define    DMA_PACKET_INDIRECT_BUFFER           0x4
+#define    DMA_PACKET_SEMAPHORE                 0x5
+#define    DMA_PACKET_FENCE                     0x6
+#define    DMA_PACKET_TRAP                      0x7
+#define    DMA_PACKET_SRBM_WRITE                0x9
+#define    DMA_PACKET_CONSTANT_FILL             0xd
+#define    DMA_PACKET_NOP                       0xf
+
 #endif
--- a/src/gallium/drivers/r600/r600.h
+++ b/src/gallium/drivers/r600/r600.h
@ -170,6 +170,33 @@ void r600_flush_emit(struct r600_context *ctx);
 void r600_context_streamout_begin(struct r600_context *ctx);
 void r600_context_streamout_end(struct r600_context *ctx);
 void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in);
+void r600_need_dma_space(struct r600_context *ctx, unsigned num_dw);
+void r600_dma_copy(struct r600_context *rctx,
+		struct pipe_resource *dst,
+		struct pipe_resource *src,
+		unsigned long dst_offset,
+		unsigned long src_offset,
+		unsigned long size);
+boolean r600_dma_blit(struct pipe_context *ctx,
+			struct pipe_resource *dst,
+			unsigned dst_level,
+			unsigned dst_x, unsigned dst_y, unsigned dst_z,
+			struct pipe_resource *src,
+			unsigned src_level,
+			const struct pipe_box *src_box);
+void evergreen_dma_copy(struct r600_context *rctx,
+		struct pipe_resource *dst,
+		struct pipe_resource *src,
+		unsigned long dst_offset,
+		unsigned long src_offset,
+		unsigned long size);
+boolean evergreen_dma_blit(struct pipe_context *ctx,
+			struct pipe_resource *dst,
+			unsigned dst_level,
+			unsigned dst_x, unsigned dst_y, unsigned dst_z,
+			struct pipe_resource *src,
+			unsigned src_level,
+			const struct pipe_box *src_box);
 void r600_context_block_emit_dirty(struct r600_context *ctx, struct r600_block *block, unsigned pkt_flags);
 void r600_cp_dma_copy_buffer(struct r600_context *rctx,
 			     struct pipe_resource *dst, uint64_t dst_offset,
--- a/src/gallium/drivers/r600/r600_buffer.c
+++ b/src/gallium/drivers/r600/r600_buffer.c
@ -27,6 +27,7 @@
 #include "r600_pipe.h"
 #include "util/u_upload_mgr.h"
 #include "util/u_memory.h"
+#include "util/u_surface.h"

 static void r600_buffer_destroy(struct pipe_screen *screen,
 				struct pipe_resource *buf)
@ -179,13 +180,27 @@ static void r600_buffer_transfer_unmap(struct pipe_context *pipe,
 	struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;

 	if (rtransfer->staging) {
-		struct pipe_box box;
-		u_box_1d(rtransfer->offset + transfer->box.x % R600_MAP_BUFFER_ALIGNMENT,
-			 transfer->box.width, &box);
+		struct pipe_resource *dst, *src;
+		unsigned soffset, doffset, size;

+		dst = transfer->resource;
+		src = &rtransfer->staging->b.b;
+		size = transfer->box.width;
+		doffset = transfer->box.x;
+		soffset = rtransfer->offset + transfer->box.x % R600_MAP_BUFFER_ALIGNMENT;
 		/* Copy the staging buffer into the original one. */
-		r600_copy_buffer(pipe, transfer->resource, transfer->box.x,
-				 &rtransfer->staging->b.b, &box);
+		if (rctx->rings.dma.cs && !(size % 4) && !(doffset % 4) && !(soffset)) {
+			if (rctx->screen->chip_class >= EVERGREEN) {
+				evergreen_dma_copy(rctx, dst, src, doffset, soffset, size);
+			} else {
+				r600_dma_copy(rctx, dst, src, doffset, soffset, size);
+			}
+		} else {
+			struct pipe_box box;
+
+			u_box_1d(soffset, size, &box);
+			r600_copy_buffer(pipe, dst, doffset, src, &box);
+		}
 		pipe_resource_reference((struct pipe_resource**)&rtransfer->staging, NULL);
 	}
 	util_slab_free(&rctx->pool_transfers, transfer);
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@ -762,8 +762,6 @@ void r600_context_flush(struct r600_context *ctx, unsigned flags)
 		}
 	}
 #endif
-
-	r600_begin_new_cs(ctx);
 }

 void r600_begin_new_cs(struct r600_context *ctx)
@ -1129,3 +1127,49 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
 		dst_offset += byte_count;
 	}
 }
+
+void r600_need_dma_space(struct r600_context *ctx, unsigned num_dw)
+{
+	/* The number of dwords we already used in the DMA so far. */
+	num_dw += ctx->rings.dma.cs->cdw;
+	/* Flush if there's not enough space. */
+	if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
+		ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
+	}
+}
+
+void r600_dma_copy(struct r600_context *rctx,
+		struct pipe_resource *dst,
+		struct pipe_resource *src,
+		unsigned long dst_offset,
+		unsigned long src_offset,
+		unsigned long size)
+{
+	struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
+	unsigned i, ncopy, csize, shift;
+	struct r600_resource *rdst = (struct r600_resource*)dst;
+	struct r600_resource *rsrc = (struct r600_resource*)src;
+
+	/* make sure that the dma ring is only one active */
+	rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC);
+
+	size >>= 2;
+	shift = 2;
+	ncopy = (size / 0xffff) + !!(size % 0xffff);
+
+	r600_need_dma_space(rctx, ncopy * 5);
+	for (i = 0; i < ncopy; i++) {
+		csize = size < 0xffff ? size : 0xffff;
+		/* emit reloc before writting cs so that cs is always in consistent state */
+		r600_context_bo_reloc(rctx, &rctx->rings.dma, rsrc, RADEON_USAGE_READ);
+		r600_context_bo_reloc(rctx, &rctx->rings.dma, rdst, RADEON_USAGE_WRITE);
+		cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize);
+		cs->buf[cs->cdw++] = dst_offset & 0xfffffffc;
+		cs->buf[cs->cdw++] = src_offset & 0xfffffffc;
+		cs->buf[cs->cdw++] = (dst_offset >> 32UL) & 0xff;
+		cs->buf[cs->cdw++] = (src_offset >> 32UL) & 0xff;
+		dst_offset += csize << shift;
+		src_offset += csize << shift;
+		size -= csize;
+	}
+}
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@ -30,6 +30,7 @@
 #include "util/u_memory.h"
 #include "util/u_simple_shaders.h"
 #include "util/u_upload_mgr.h"
+#include "util/u_math.h"
 #include "vl/vl_decoder.h"
 #include "vl/vl_video_buffer.h"
 #include "os/os_time.h"
@ -128,12 +129,13 @@ static void r600_flush(struct pipe_context *ctx, unsigned flags)
 	}

 	r600_context_flush(rctx, flags);
+	rctx->rings.gfx.flushing = false;
+	r600_begin_new_cs(rctx);

 	/* Re-enable render condition. */
 	if (render_cond) {
 		ctx->render_condition(ctx, render_cond, render_cond_mode);
 	}
-	rctx->rings.gfx.flushing = false;
 }

 static void r600_flush_from_st(struct pipe_context *ctx,
@ -1111,8 +1113,10 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws)

 	if (rscreen->chip_class >= EVERGREEN) {
 		rscreen->screen.is_format_supported = evergreen_is_format_supported;
+		rscreen->dma_blit = &evergreen_dma_blit;
 	} else {
 		rscreen->screen.is_format_supported = r600_is_format_supported;
+		rscreen->dma_blit = &r600_dma_blit;
 	}
 	rscreen->screen.is_video_format_supported = vl_video_buffer_is_format_supported;
 	rscreen->screen.context_create = r600_create_context;
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@ -220,6 +220,14 @@ enum r600_msaa_texture_mode {
 	MSAA_TEXTURE_COMPRESSED
 };

+typedef boolean (*r600g_dma_blit_t)(struct pipe_context *ctx,
+				struct pipe_resource *dst,
+				unsigned dst_level,
+				unsigned dst_x, unsigned dst_y, unsigned dst_z,
+				struct pipe_resource *src,
+				unsigned src_level,
+				const struct pipe_box *src_box);
+
 struct r600_screen {
 	struct pipe_screen		screen;
 	struct radeon_winsys		*ws;
@ -243,6 +251,7 @@ struct r600_screen {
 	uint32_t			*trace_ptr;
 	unsigned			cs_count;
 #endif
+	r600g_dma_blit_t		dma_blit;
 };

 struct r600_pipe_sampler_view {
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@ -2945,3 +2945,193 @@ void r600_update_db_shader_control(struct r600_context * rctx)
 		rctx->db_misc_state.atom.dirty = true;
 	}
 }
+
+static INLINE unsigned r600_array_mode(unsigned mode)
+{
+	switch (mode) {
+	case RADEON_SURF_MODE_LINEAR_ALIGNED:	return V_0280A0_ARRAY_LINEAR_ALIGNED;
+		break;
+	case RADEON_SURF_MODE_1D:		return V_0280A0_ARRAY_1D_TILED_THIN1;
+		break;
+	case RADEON_SURF_MODE_2D:		return V_0280A0_ARRAY_2D_TILED_THIN1;
+	default:
+	case RADEON_SURF_MODE_LINEAR:		return V_0280A0_ARRAY_LINEAR_GENERAL;
+	}
+}
+
+static boolean r600_dma_copy_tile(struct r600_context *rctx,
+				struct pipe_resource *dst,
+				unsigned dst_level,
+				unsigned dst_x,
+				unsigned dst_y,
+				unsigned dst_z,
+				struct pipe_resource *src,
+				unsigned src_level,
+				unsigned src_x,
+				unsigned src_y,
+				unsigned src_z,
+				unsigned copy_height,
+				unsigned pitch,
+				unsigned bpp)
+{
+	struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
+	struct r600_texture *rsrc = (struct r600_texture*)src;
+	struct r600_texture *rdst = (struct r600_texture*)dst;
+	unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size;
+	unsigned ncopy, height, cheight, detile, i, x, y, z, src_mode, dst_mode;
+	unsigned long base, addr;
+
+	/* make sure that the dma ring is only one active */
+	rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC);
+
+	dst_mode = rdst->surface.level[dst_level].mode;
+	src_mode = rsrc->surface.level[src_level].mode;
+	/* downcast linear aligned to linear to simplify test */
+	src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
+	dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
+	assert(dst_mode != src_mode);
+
+	y = 0;
+	lbpp = util_logbase2(bpp);
+	pitch_tile_max = ((pitch / bpp) >> 3) - 1;
+
+	if (dst_mode == RADEON_SURF_MODE_LINEAR) {
+		/* T2L */
+		array_mode = r600_array_mode(src_mode);
+		slice_tile_max = (((pitch * rsrc->surface.level[src_level].npix_y) >> 6) / bpp) - 1;
+		/* linear height must be the same as the slice tile max height, it's ok even
+		 * if the linear destination/source have smaller heigh as the size of the
+		 * dma packet will be using the copy_height which is always smaller or equal
+		 * to the linear height
+		 */
+		height = rsrc->surface.level[src_level].npix_y;
+		detile = 1;
+		x = src_x;
+		y = src_y;
+		z = src_z;
+		base = rsrc->surface.level[src_level].offset;
+		addr = rdst->surface.level[dst_level].offset;
+		addr += rdst->surface.level[dst_level].slice_size * dst_z;
+		addr += dst_y * pitch + dst_x * bpp;
+	} else {
+		/* L2T */
+		array_mode = r600_array_mode(dst_mode);
+		slice_tile_max = (((pitch * rdst->surface.level[dst_level].npix_y) >> 6) / bpp) - 1;
+		/* linear height must be the same as the slice tile max height, it's ok even
+		 * if the linear destination/source have smaller heigh as the size of the
+		 * dma packet will be using the copy_height which is always smaller or equal
+		 * to the linear height
+		 */
+		height = rdst->surface.level[dst_level].npix_y;
+		detile = 0;
+		x = dst_x;
+		y = dst_y;
+		z = dst_z;
+		base = rdst->surface.level[dst_level].offset;
+		addr = rsrc->surface.level[src_level].offset;
+		addr += rsrc->surface.level[src_level].slice_size * src_z;
+		addr += src_y * pitch + src_x * bpp;
+	}
+	/* check that we are in dw/base alignment constraint */
+	if ((addr & 0x3) || (base & 0xff)) {
+		return FALSE;
+	}
+
+	size = (copy_height * pitch) >> 2;
+	ncopy = (size / 0x0000ffff) + !!(size % 0x0000ffff);
+	r600_need_dma_space(rctx, ncopy * 7);
+	for (i = 0; i < ncopy; i++) {
+		cheight = copy_height;
+		if (((cheight * pitch) >> 2) > 0x0000ffff) {
+			cheight = (0x0000ffff << 2) / pitch;
+		}
+		size = (cheight * pitch) >> 2;
+		/* emit reloc before writting cs so that cs is always in consistent state */
+		r600_context_bo_reloc(rctx, &rctx->rings.dma, &rsrc->resource, RADEON_USAGE_READ);
+		r600_context_bo_reloc(rctx, &rctx->rings.dma, &rdst->resource, RADEON_USAGE_WRITE);
+		cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, 1, 0, size);
+		cs->buf[cs->cdw++] = base >> 8;
+		cs->buf[cs->cdw++] = (detile << 31) | (array_mode << 27) |
+					(lbpp << 24) | ((height - 1) << 10) |
+					pitch_tile_max;
+		cs->buf[cs->cdw++] = (slice_tile_max << 12) | (z << 0);
+		cs->buf[cs->cdw++] = (x << 3) | (y << 17);
+		cs->buf[cs->cdw++] = addr & 0xfffffffc;
+		cs->buf[cs->cdw++] = (addr >> 32UL) & 0xff;
+		copy_height -= cheight;
+		addr += cheight * pitch;
+		y += cheight;
+	}
+	return TRUE;
+}
+
+boolean r600_dma_blit(struct pipe_context *ctx,
+			struct pipe_resource *dst,
+			unsigned dst_level,
+			unsigned dst_x, unsigned dst_y, unsigned dst_z,
+			struct pipe_resource *src,
+			unsigned src_level,
+			const struct pipe_box *src_box)
+{
+	struct r600_context *rctx = (struct r600_context *)ctx;
+	struct r600_texture *rsrc = (struct r600_texture*)src;
+	struct r600_texture *rdst = (struct r600_texture*)dst;
+	unsigned dst_pitch, src_pitch, bpp, dst_mode, src_mode, copy_height;
+	unsigned src_w, dst_w;
+
+	if (rctx->rings.dma.cs == NULL) {
+		return FALSE;
+	}
+	if (src->format != dst->format) {
+		return FALSE;
+	}
+
+	bpp = rdst->surface.bpe;
+	dst_pitch = rdst->surface.level[dst_level].pitch_bytes;
+	src_pitch = rsrc->surface.level[src_level].pitch_bytes;
+	src_w = rsrc->surface.level[src_level].npix_x;
+	dst_w = rdst->surface.level[dst_level].npix_x;
+	copy_height = src_box->height / rsrc->surface.blk_h;
+
+	dst_mode = rdst->surface.level[dst_level].mode;
+	src_mode = rsrc->surface.level[src_level].mode;
+	/* downcast linear aligned to linear to simplify test */
+	src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
+	dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
+
+	if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w) {
+		/* strick requirement on r6xx/r7xx */
+		return FALSE;
+	}
+	/* lot of constraint on alignment this should capture them all */
+	if ((src_pitch & 0x7) || (src_box->y & 0x7) || (dst_y & 0x7)) {
+		return FALSE;
+	}
+
+	if (src_mode == dst_mode) {
+		unsigned long dst_offset, src_offset, size;
+
+		/* simple dma blit would do NOTE code here assume :
+		 *   src_box.x/y == 0
+		 *   dst_x/y == 0
+		 *   dst_pitch == src_pitch
+		 */
+		src_offset= rsrc->surface.level[src_level].offset;
+		src_offset += rsrc->surface.level[src_level].slice_size * src_box->z;
+		src_offset += src_box->y * src_pitch + src_box->x * bpp;
+		dst_offset = rdst->surface.level[dst_level].offset;
+		dst_offset += rdst->surface.level[dst_level].slice_size * dst_z;
+		dst_offset += dst_y * dst_pitch + dst_x * bpp;
+		size = src_box->height * src_pitch;
+		/* must be dw aligned */
+		if ((dst_offset & 0x3) || (src_offset & 0x3) || (size & 0x3)) {
+			return FALSE;
+		}
+		r600_dma_copy(rctx, dst, src, dst_offset, src_offset, size);
+	} else {
+		return r600_dma_copy_tile(rctx, dst, dst_level, dst_x, dst_y, dst_z,
+					src, src_level, src_box->x, src_box->y, src_box->z,
+					copy_height, dst_pitch, bpp);
+	}
+	return TRUE;
+}
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@ -1273,6 +1273,9 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 		return;
 	}

+	/* make sure that the gfx ring is only one active */
+	rctx->rings.dma.flush(rctx, RADEON_FLUSH_ASYNC);
+
 	if (!r600_update_derived_state(rctx)) {
 		/* useless to render because current rendering command
 		 * can't be achieved
@ -1280,9 +1283,6 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 		return;
 	}

-	/* make sure that the gfx ring is only one active */
-	rctx->rings.dma.flush(rctx, RADEON_FLUSH_ASYNC);
-
 	if (info.indexed) {
 		/* Initialize the index buffer struct. */
 		pipe_resource_reference(&ib.buffer, rctx->index_buffer.buffer);
--- a/src/gallium/drivers/r600/r600_texture.c
+++ b/src/gallium/drivers/r600/r600_texture.c
@ -35,13 +35,19 @@
 /* Copy from a full GPU texture to a transfer's staging one. */
 static void r600_copy_to_staging_texture(struct pipe_context *ctx, struct r600_transfer *rtransfer)
 {
+	struct r600_context *rctx = (struct r600_context*)ctx;
 	struct pipe_transfer *transfer = (struct pipe_transfer*)rtransfer;
 	struct pipe_resource *dst = &rtransfer->staging->b.b;
 	struct pipe_resource *src = transfer->resource;

 	if (src->nr_samples <= 1) {
-		ctx->resource_copy_region(ctx, dst, 0, 0, 0, 0,
-					  src, transfer->level, &transfer->box);
+		if (!rctx->screen->dma_blit(ctx, dst, 0, 0, 0, 0,
+					    src, transfer->level,
+					    &transfer->box)) {
+			/* async dma could not be use */
+			ctx->resource_copy_region(ctx, dst, 0, 0, 0, 0,
+						  src, transfer->level, &transfer->box);
+		}
 	} else {
 		/* Resolve the resource. */
 		struct pipe_blit_info blit;
@ -66,16 +72,22 @@ static void r600_copy_to_staging_texture(struct pipe_context *ctx, struct r600_t
 /* Copy from a transfer's staging texture to a full GPU one. */
 static void r600_copy_from_staging_texture(struct pipe_context *ctx, struct r600_transfer *rtransfer)
 {
+	struct r600_context *rctx = (struct r600_context*)ctx;
 	struct pipe_transfer *transfer = (struct pipe_transfer*)rtransfer;
 	struct pipe_resource *texture = transfer->resource;
 	struct pipe_box sbox;

 	u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox);

-	ctx->resource_copy_region(ctx, texture, transfer->level,
-				  transfer->box.x, transfer->box.y, transfer->box.z,
-				  &rtransfer->staging->b.b,
-				  0, &sbox);
+	if (!rctx->screen->dma_blit(ctx, texture, transfer->level,
+				    transfer->box.x, transfer->box.y, transfer->box.z,
+				    &rtransfer->staging->b.b, 0, &sbox)) {
+		/* async dma could not be use */
+		ctx->resource_copy_region(ctx, texture, transfer->level,
+					  transfer->box.x, transfer->box.y, transfer->box.z,
+					  &rtransfer->staging->b.b,
+					  0, &sbox);
+	}
 }

 unsigned r600_texture_get_offset(struct r600_texture *rtex,
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@ -3681,4 +3681,19 @@
 #define SQ_TEX_INST_SAMPLE_C_G_LB	0x1E
 #define SQ_TEX_INST_SAMPLE_C_G_LZ	0x1F

+/* async DMA packets */
+#define DMA_PACKET(cmd, t, s, n)	((((cmd) & 0xF) << 28) |	\
+					(((t) & 0x1) << 23) |		\
+					(((s) & 0x1) << 22) |		\
+					(((n) & 0xFFFF) << 0))
+/* async DMA Packet types */
+#define DMA_PACKET_WRITE		0x2
+#define DMA_PACKET_COPY			0x3
+#define DMA_PACKET_INDIRECT_BUFFER	0x4
+#define DMA_PACKET_SEMAPHORE		0x5
+#define DMA_PACKET_FENCE		0x6
+#define DMA_PACKET_TRAP			0x7
+#define DMA_PACKET_CONSTANT_FILL	0xd /* 7xx only */
+#define DMA_PACKET_NOP			0xf
+
 #endif