nv04-nv40: new 2D: add new Gallium-independent 2D engine

This patch adds a brand-new nv04-nv40 2D engine module.
It should correctly implement all operations involving swizzled and 3D-swizzled surfaces.

This code is independent from the Gallium framework and can thus be reused in the DDX and classic Mesa drivers (it's only likely to be useful in the latter, though).

Currently, surface_copy and surface_fill are broken for 3D textures, for swizzled source textures, and possibly for some misaligned cases.

The code is based around the new nv04_region structure, which encapsulates the information from pipe_surface needed for the 2D engine and CPU copies.
The use of nv04_region makes the code independent of the Gallium framework and allows the nv04_region to be transformed without clobbering the pipe_surface it was derived from.
The existing M2MF, blitter, and SWIZZLED_SURFACE paths have been improved and a new CPU path has been added.
There is also support to tell the caller to use the 3D engine.

The main feature of the copy/fill setup algorithm is linearization/contiguous-linearization of swizzled surfaces.
The idea of linearization is that some swizzled surfaces are laid out like linear ones (1xN, 2xN, Nx1) and can thus be used as such (e.g. useful for copying single pixels).
Also, some rectangles (e.g. the whole surface) are contiguous in memory. If both the source and destination rectangles are swizzled but contiguous, then they can be regarded as both linear: this is the idea of "contiguous linearization".
This, for instance, allows to use the 2D engine to duplicate the content of a swizzled surface to another swizzled surface, by pretending they are actually linear.
After linearization, the result may not be 64-byte aligned. Another transformation is done to enlarge the linear surface so that it becomes 64-byte aligned.
This is also used to 64-byte align swizzled texture mipmaps.

The inner loop of the CPU path is as optimized as possible without using SSE/SSE2.
Future improvements could include SSE/SSE2 support, and possibly a faster coordinate swizzling algorithm (which is however not used in the inner loop).
It may be a good idea to autogenerate swizzling code at least for all possible POT 2D texture dimensions  (less than 256), maybe for all 3D ones too (less than 4096).
Also, it would be a very good idea to make a copy with the GPU first if the source surface is in uncached memory.
This commit is contained in:
Luca Barbieri 2010-01-19 18:51:10 +01:00
parent 23639dc046
commit 24a4ea003f
4 changed files with 1478 additions and 0 deletions

View file

@ -5,6 +5,7 @@ LIBNAME = nvfx
C_SOURCES = \
nv04_surface_2d.c \
nv04_2d.c \
nvfx_buffer.c \
nvfx_context.c \
nvfx_clear.c \

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,87 @@
/**************************************************************************
*
* Copyright 2009 Ben Skeggs
* Copyright 2009 Younes Manton
* Copyright 2010 Luca Barbieri
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
* AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
**************************************************************************/
/* this code has no Mesa or Gallium dependency and can be reused in the classic Mesa driver or DDX */
#ifndef __NV04_2D_H__
#define __NV04_2D_H__
struct nv04_2d_context;
struct nouveau_channel;
struct nouveau_bo;
// Describes a surface inside a buffer object plus a position within it, as
// consumed by the nv04_region_* functions declared in this header.
// NOTE: all functions taking this as a parameter will CLOBBER it (except for ->bo)
struct nv04_region {
struct nouveau_bo* bo; // backing buffer object; the one field the NOTE above says is preserved
int offset; // presumably the byte offset of the surface within bo -- TODO confirm
unsigned pitch; // 0 -> swizzled; otherwise the row stride in bytes of a linear surface
unsigned bpps; // bpp shift (0, 1, 2; 3, 4 for fp/compressed), i.e. log2 of the bytes per element
unsigned x, y, z; // position within the surface; for linear surfaces the CPU loops use x in elements and y in rows (z: presumably the slice of a 3D surface -- TODO confirm)
unsigned w, h, d; // presumably the surface width/height/depth (rectangle sizes are passed separately to each function) -- TODO confirm
};
/*
 * Buffer-object to buffer-object copy entry point.
 * NOTE(review): the implementation is not visible here. Judging by the name
 * and parameters, this presumably copies `size` bytes from srcbo + srcoff to
 * dstbo + dstoff using the 2D context `ctx` -- confirm against nv04_2d.c.
 */
void
nv04_memcpy(struct nv04_2d_context *ctx,
struct nouveau_bo* dstbo, int dstoff,
struct nouveau_bo* srcbo, int srcoff,
unsigned size);
/*
 * Takes a region and a rectangle size (w, h) and returns an unsigned value.
 * NOTE(review): semantics are not visible here; the name suggests the offset
 * at which the w x h rectangle starts inside rgn->bo -- confirm against the
 * implementation. Per the note on struct nv04_region, *rgn may be clobbered.
 */
unsigned
nv04_region_begin(struct nv04_region* rgn, unsigned w, unsigned h);
/*
 * Takes a region and a rectangle size (w, h) and returns an unsigned value.
 * NOTE(review): semantics are not visible here; the name suggests the offset
 * just past the end of the w x h rectangle inside rgn->bo -- confirm against
 * the implementation. Per the note on struct nv04_region, *rgn may be clobbered.
 */
unsigned
nv04_region_end(struct nv04_region* rgn, unsigned w, unsigned h);
/*
 * Counterpart of nv04_2d_context_init().
 * NOTE(review): presumably releases everything owned by pctx and pctx itself;
 * whether a NULL pctx is accepted cannot be determined from this header.
 */
void
nv04_2d_context_takedown(struct nv04_2d_context *pctx);
/*
 * Creates a 2D engine context for the given channel and returns a pointer to it.
 * NOTE(review): presumably returns NULL on failure and the result is to be
 * released with nv04_2d_context_takedown() -- confirm against the implementation.
 */
struct nv04_2d_context *
nv04_2d_context_init(struct nouveau_channel* chan);
/*
 * CPU copy entry point: takes a destination region, a source region and a
 * rectangle size (w, h). It needs no nv04_2d_context.
 * NOTE(review): presumably copies a w x h rectangle from src to dst with the
 * CPU, handling swizzled layouts. Both regions may be clobbered (see the note
 * on struct nv04_region).
 */
void
nv04_region_copy_cpu(struct nv04_region* dst, struct nv04_region* src, int w, int h);
/*
 * CPU fill entry point: takes a destination region, a rectangle size (w, h)
 * and a fill value. It needs no nv04_2d_context.
 * NOTE(review): presumably writes `value` to every element of the w x h
 * rectangle at dst; how `value` is truncated for bpps < 2 is not visible here.
 * dst may be clobbered (see the note on struct nv04_region).
 */
void
nv04_region_fill_cpu(struct nv04_region* dst, int w, int h, unsigned value);
/*
 * 2D-engine copy entry point: takes a context, destination and source
 * regions, a rectangle size (w, h), two format codes and two placement flags.
 * NOTE(review): the meaning of the int return value is not visible here; the
 * commit description mentions support for telling the caller to use the 3D
 * engine, so a non-zero result presumably means "not handled, fall back" --
 * confirm. cs2d_format / sifm_format are presumably hardware format codes
 * for the CONTEXT_SURFACES_2D and SCALED_IMAGE_FROM_MEMORY objects; the
 * dst_to_gpu / src_on_gpu flags presumably describe where the data lives or
 * will be consumed -- confirm against the implementation. dst and src may be
 * clobbered (see the note on struct nv04_region).
 */
int
nv04_region_copy_2d(struct nv04_2d_context *ctx,
struct nv04_region* dst, struct nv04_region* src,
int w, int h,
int cs2d_format, int sifm_format,
int dst_to_gpu, int src_on_gpu);
/*
 * 2D-engine fill entry point: takes a context, a destination region, a
 * rectangle size (w, h) and a fill value.
 * NOTE(review): the meaning of the int return value is not visible here;
 * presumably it mirrors nv04_region_copy_2d() -- confirm against the
 * implementation. dst may be clobbered (see the note on struct nv04_region).
 */
int
nv04_region_fill_2d(struct nv04_2d_context *ctx,
struct nv04_region *dst,
int w, int h,
unsigned value);
#endif

View file

@ -0,0 +1,70 @@
#ifndef T
/*
 * First-level inclusion (T is not defined yet): pick the element type from
 * dst->bpps and include "nv04_2d_loops.h" (presumably this very file) again
 * with T defined, so that the loop code selected by the including site
 * (e.g. SWIZZLED_COPY_LOOPS) is emitted as the body of each branch.
 *
 * Each nested inclusion must expand to exactly one statement, otherwise the
 * if / else-if chain below does not parse.
 *
 * Only dst->bpps is examined; src is presumably required to have the same
 * element size -- TODO confirm at the call site.
 */
{
if(dst->bpps == 0)
/* 1-byte elements */
#define T uint8_t
#include "nv04_2d_loops.h"
#undef T
else if(dst->bpps == 1)
/* 2-byte elements */
#define T uint16_t
#include "nv04_2d_loops.h"
#undef T
else if(dst->bpps == 2)
/* 4-byte elements */
#define T uint32_t
#include "nv04_2d_loops.h"
#undef T
else
/* any other bpps is not handled by these loops; with NDEBUG this becomes a silent no-op */
assert(0);
}
#else
#ifdef SWIZZLED_COPY_LOOPS
/*
 * Copy loops for element type T where at least one side is swizzled
 * (pitch == 0 means swizzled, see struct nv04_region).
 *
 * The including code must provide: dst, src, the base pointers mdst / msrc,
 * the per-column tables dswx / sswx and per-row tables dswy / sswy (used here
 * as offsets in units of T), dir, h, the indices ix / iy and the LOOP_X /
 * LOOP_Y macros. None of these are defined in this file.
 *
 * There is no linear-to-linear branch here: when dst is linear, src is
 * treated as swizzled without being checked.
 */
{
if(!dst->pitch)
{
/* destination is swizzled */
if(!src->pitch)
{
/* swizzled -> swizzled: both sides are addressed as base + row offset + column offset */
LOOP_Y
{
T* pdst = (T*)mdst + dswy[iy];
T* psrc = (T*)msrc + sswy[iy];
LOOP_X
{
/* debug-only bounds checks: the end of the accessed element must lie within the mapped bo */
assert((char*)&psrc[sswx[ix] + 1] <= ((char*)src->bo->map + src->bo->size));
assert((char*)&pdst[dswx[ix] + 1] <= ((char*)dst->bo->map + dst->bo->size));
pdst[dswx[ix]] = psrc[sswx[ix]];
}
}
}
else
{
/*
 * linear -> swizzled: psrc starts at column src->x of the first row to be
 * processed (row src->y when dir > 0, otherwise the last row src->y + h - 1).
 */
T* psrc = (T*)(msrc + ((dir > 0) ? src->y : (src->y + h - 1)) * src->pitch) + src->x;
LOOP_Y
{
T* pdst = (T*)mdst + dswy[iy];
LOOP_X
{
/* debug-only bounds checks, as above */
assert((char*)&psrc[ix + 1] <= ((char*)src->bo->map + src->bo->size));
assert((char*)&pdst[dswx[ix] + 1] <= ((char*)dst->bo->map + dst->bo->size));
pdst[dswx[ix]] = psrc[ix];
}
/* step one source row in the direction of travel; pitch is applied in bytes */
psrc = (T*)((char*)psrc + dir * src->pitch);
}
}
}
else
{
/*
 * swizzled -> linear: pdst starts at column dst->x of the first row to be
 * processed (row dst->y when dir > 0, otherwise the last row dst->y + h - 1).
 */
T* pdst = (T*)(mdst + ((dir > 0) ? dst->y : (dst->y + h - 1)) * dst->pitch) + dst->x;
LOOP_Y
{
T* psrc = (T*)msrc + sswy[iy];
LOOP_X
{
/* debug-only bounds checks, as above */
assert((char*)&psrc[sswx[ix] + 1] <= ((char*)src->bo->map + src->bo->size));
assert((char*)&pdst[ix + 1] <= ((char*)dst->bo->map + dst->bo->size));
pdst[ix] = psrc[sswx[ix]];
}
/* step one destination row in the direction of travel; pitch is applied in bytes */
pdst = (T*)((char*)pdst + dir * dst->pitch);
}
}
}
#endif
#endif