1/* NV Acceleration functions */
2
3/* Author:
4   Rudolf Cornelissen 8/2003-6/2010.
5
6   This code was possible thanks to:
7    - the Linux XFree86 NV driver,
8    - the Linux UtahGLX 3D driver.
9*/
10
11#define MODULE_BIT 0x00080000
12
13#include "nv_std.h"
14
15/*acceleration notes*/
16
17/*functions Be's app_server uses:
18fill span (horizontal only)
19fill rectangle (these 2 are very similar)
20invert rectangle
21blit
22*/
23
24static void nv_init_for_3D_dma(void);
25static void nv_start_dma(void);
26static status_t nv_acc_fifofree_dma(uint16 cmd_size);
27static void nv_acc_cmd_dma(uint32 cmd, uint16 offset, uint16 size);
28static void nv_acc_set_ch_dma(uint16 ch, uint32 handle);
29
30/* used to track engine DMA stalls */
31static uint8 err;
32
33/* wait until engine completely idle */
34status_t nv_acc_wait_idle_dma()
35{
36	/* we'd better check for timeouts on the DMA engine as it's theoretically
37	 * breakable by malfunctioning software */
38	uint16 cnt = 0;
39
40	/* wait until all upcoming commands are in execution at least. Do this until
41	 * we hit a timeout; abort if we failed at least three times before:
42	 * if DMA stalls, we have to forget about it alltogether at some point, or
43	 * the system will almost come to a complete halt.. */
44	/* note:
45	 * it doesn't matter which FIFO channel's DMA registers we access, they are in
46	 * fact all the same set. It also doesn't matter if the channel was assigned a
47	 * command or not. */
48	while ((NV_REG32(NVACC_FIFO + NV_GENERAL_DMAGET) != (si->engine.dma.put << 2)) &&
49			(cnt < 10000) && (err < 3))
50	{
51		/* snooze a bit so I do not hammer the bus */
52		snooze (100);
53		cnt++;
54	}
55
56	/* log timeout if we had one */
57	if (cnt == 10000)
58	{
59		if (err < 3) err++;
60		LOG(4,("ACC_DMA: wait_idle; DMA timeout #%d, engine trouble!\n", err));
61	}
62
63	/* wait until execution completed */
64	while (ACCR(STATUS))
65	{
66		/* snooze a bit so I do not hammer the bus */
67		snooze (100);
68	}
69
70	return B_OK;
71}
72
/* Required initialization of the acceleration engine.
 * AFAIK this must be done again for every new screen mode. */
75status_t nv_acc_init_dma()
76{
77	uint32 cnt, tmp;
78	uint32 surf_depth, cmd_depth;
79	/* reset the engine DMA stalls counter */
80	err = 0;
81
82	/* a hanging engine only recovers from a complete power-down/power-up cycle */
83	NV_REG32(NV32_PWRUPCTRL) = 0xffff00ff;
84	snooze(1000);
85	NV_REG32(NV32_PWRUPCTRL) = 0xffffffff;
86
87	/* don't try this on NV20 and later.. */
88	/* note:
89	 * the specific register that's responsible for the speedfix on NV18 is
90	 * $00400ed8: bit 6 needs to be zero for fastest rendering (confirmed). */
91	/* note also:
92	 * on NV28 the following ranges could be reset (confirmed):
93	 * $00400000 upto/incl. $004002fc;
94	 * $00400400 upto/incl. $004017fc;
95	 * $0040180c upto/incl. $00401948;
96	 * $00401994 upto/incl. $00401a80;
97	 * $00401a94 upto/incl. $00401ffc.
98	 * The intermediate ranges hang the engine upon resetting. */
99	if (si->ps.card_arch < NV20A)
100	{
101		/* actively reset the PGRAPH registerset (acceleration engine) */
102		for (cnt = 0x00400000; cnt < 0x00402000; cnt +=4)
103		{
104			NV_REG32(cnt) = 0x00000000;
105		}
106	}
107
108	/* setup PTIMER: */
109	LOG(4,("ACC_DMA: timer numerator $%08x, denominator $%08x\n", ACCR(PT_NUMERATOR), ACCR(PT_DENOMINATR)));
110
111	/* The NV28 BIOS programs PTIMER like this (see coldstarting in nv_info.c) */
112	//ACCW(PT_NUMERATOR, (si->ps.std_engine_clock * 20));
113	//ACCW(PT_DENOMINATR, 0x00000271);
114	/* Nouveau (march 2009) mentions something like: writing 8 and 3 to these regs breaks the timings
115	 * on the LVDS hardware sequencing microcode. A correct solution involves calculations with the GPU PLL. */
116
117	/* For now use BIOS pre-programmed values if there */
118	if (!ACCR(PT_NUMERATOR) || !ACCR(PT_DENOMINATR)) {
119		/* set timer numerator to 8 (in b0-15) */
120		ACCW(PT_NUMERATOR, 0x00000008);
121		/* set timer denominator to 3 (in b0-15) */
122		ACCW(PT_DENOMINATR, 0x00000003);
123	}
124
125	/* disable timer-alarm INT requests (b0) */
126	ACCW(PT_INTEN, 0x00000000);
127	/* reset timer-alarm INT status bit (b0) */
128	ACCW(PT_INTSTAT, 0xffffffff);
129
130	/* enable PRAMIN write access on pre NV10 before programming it! */
131	if (si->ps.card_arch == NV04A)
132	{
133		/* set framebuffer config: type = notiling, PRAMIN write access enabled */
134		NV_REG32(NV32_PFB_CONFIG_0) = 0x00001114;
135	}
136	else
137	{
138		/* setup acc engine 'source' tile adressranges */
139		if ((si->ps.card_type <= NV40) || (si->ps.card_type == NV45))
140		{
141			ACCW(NV10_FBTIL0AD, 0);
142			ACCW(NV10_FBTIL1AD, 0);
143			ACCW(NV10_FBTIL2AD, 0);
144			ACCW(NV10_FBTIL3AD, 0);
145			ACCW(NV10_FBTIL4AD, 0);
146			ACCW(NV10_FBTIL5AD, 0);
147			ACCW(NV10_FBTIL6AD, 0);
148			ACCW(NV10_FBTIL7AD, 0);
149			ACCW(NV10_FBTIL0ED, (si->ps.memory_size - 1));
150			ACCW(NV10_FBTIL1ED, (si->ps.memory_size - 1));
151			ACCW(NV10_FBTIL2ED, (si->ps.memory_size - 1));
152			ACCW(NV10_FBTIL3ED, (si->ps.memory_size - 1));
153			ACCW(NV10_FBTIL4ED, (si->ps.memory_size - 1));
154			ACCW(NV10_FBTIL5ED, (si->ps.memory_size - 1));
155			ACCW(NV10_FBTIL6ED, (si->ps.memory_size - 1));
156			ACCW(NV10_FBTIL7ED, (si->ps.memory_size - 1));
157		}
158		else
159		{
160			/* NV41, 43, 44, G70 and up */
161			ACCW(NV41_FBTIL0AD, 0);
162			ACCW(NV41_FBTIL1AD, 0);
163			ACCW(NV41_FBTIL2AD, 0);
164			ACCW(NV41_FBTIL3AD, 0);
165			ACCW(NV41_FBTIL4AD, 0);
166			ACCW(NV41_FBTIL5AD, 0);
167			ACCW(NV41_FBTIL6AD, 0);
168			ACCW(NV41_FBTIL7AD, 0);
169			ACCW(NV41_FBTIL8AD, 0);
170			ACCW(NV41_FBTIL9AD, 0);
171			ACCW(NV41_FBTILAAD, 0);
172			ACCW(NV41_FBTILBAD, 0);
173			ACCW(NV41_FBTIL0ED, (si->ps.memory_size - 1));
174			ACCW(NV41_FBTIL1ED, (si->ps.memory_size - 1));
175			ACCW(NV41_FBTIL2ED, (si->ps.memory_size - 1));
176			ACCW(NV41_FBTIL3ED, (si->ps.memory_size - 1));
177			ACCW(NV41_FBTIL4ED, (si->ps.memory_size - 1));
178			ACCW(NV41_FBTIL5ED, (si->ps.memory_size - 1));
179			ACCW(NV41_FBTIL6ED, (si->ps.memory_size - 1));
180			ACCW(NV41_FBTIL7ED, (si->ps.memory_size - 1));
181			ACCW(NV41_FBTIL8ED, (si->ps.memory_size - 1));
182			ACCW(NV41_FBTIL9ED, (si->ps.memory_size - 1));
183			ACCW(NV41_FBTILAED, (si->ps.memory_size - 1));
184			ACCW(NV41_FBTILBED, (si->ps.memory_size - 1));
185
186			if (si->ps.card_type >= G70)
187			{
188				ACCW(G70_FBTILCAD, 0);
189				ACCW(G70_FBTILDAD, 0);
190				ACCW(G70_FBTILEAD, 0);
191				ACCW(G70_FBTILCED, (si->ps.memory_size - 1));
192				ACCW(G70_FBTILDED, (si->ps.memory_size - 1));
193				ACCW(G70_FBTILEED, (si->ps.memory_size - 1));
194			}
195		}
196	}
197
198	/*** PRAMIN ***/
199	/* first clear the entire RAMHT (hash-table) space to a defined state. It turns
200	 * out at least NV11 will keep the previously programmed handles over resets and
201	 * power-outages upto about 15 seconds!! Faulty entries might well hang the
202	 * engine (confirmed on NV11).
203	 * Note:
204	 * this behaviour is not very strange: even very old DRAM chips are known to be
205	 * able to do this, even though you should refresh them every few milliseconds or
206	 * so. (Large memory cell capacitors, though different cells vary a lot in their
207	 * capacity.)
208	 * Of course data validity is not certain by a long shot over this large
209	 * amount of time.. */
210	for(cnt = 0; cnt < 0x0400; cnt++)
211		NV_REG32(NVACC_HT_HANDL_00 + (cnt << 2)) = 0;
212	/* RAMHT (hash-table) space SETUP FIFO HANDLES */
213	/* note:
214	 * 'instance' tells you where the engine command is stored in 'PR_CTXx_x' sets
215	 * below: instance being b4-19 with baseadress NV_PRAMIN_CTX_0 (0x00700000).
216	 * That command is linked to the handle noted here. This handle is then used to
217	 * tell the FIFO to which engine command it is connected!
218	 * (CTX registers are actually a sort of RAM space.) */
219	if (si->ps.card_arch >= NV40A)
220	{
221		/* (first set) */
222		ACCW(HT_HANDL_00, (0x80000000 | NV10_CONTEXT_SURFACES_2D)); /* 32bit handle (not used) */
223		ACCW(HT_VALUE_00, 0x0010114c); /* instance $114c, engine = acc engine, CHID = $00 */
224
225		ACCW(HT_HANDL_01, (0x80000000 | NV_IMAGE_BLIT)); /* 32bit handle */
226		ACCW(HT_VALUE_01, 0x00101148); /* instance $1148, engine = acc engine, CHID = $00 */
227
228		ACCW(HT_HANDL_02, (0x80000000 | NV4_GDI_RECTANGLE_TEXT)); /* 32bit handle */
229		ACCW(HT_VALUE_02, 0x0010114a); /* instance $114a, engine = acc engine, CHID = $00 */
230
231		/* (second set) */
232		ACCW(HT_HANDL_10, (0x80000000 | NV_ROP5_SOLID)); /* 32bit handle */
233		ACCW(HT_VALUE_10, 0x00101142); /* instance $1142, engine = acc engine, CHID = $00 */
234
235		ACCW(HT_HANDL_11, (0x80000000 | NV_IMAGE_BLACK_RECTANGLE)); /* 32bit handle */
236		ACCW(HT_VALUE_11, 0x00101144); /* instance $1144, engine = acc engine, CHID = $00 */
237
238		ACCW(HT_HANDL_12, (0x80000000 | NV_IMAGE_PATTERN)); /* 32bit handle */
239		ACCW(HT_VALUE_12, 0x00101146); /* instance $1146, engine = acc engine, CHID = $00 */
240
241		ACCW(HT_HANDL_13, (0x80000000 | NV_SCALED_IMAGE_FROM_MEMORY)); /* 32bit handle */
242		ACCW(HT_VALUE_13, 0x0010114e); /* instance $114e, engine = acc engine, CHID = $00 */
243	}
244	else
245	{
246		/* (first set) */
247		ACCW(HT_HANDL_00, (0x80000000 | NV4_SURFACE)); /* 32bit handle */
248		ACCW(HT_VALUE_00, 0x80011145); /* instance $1145, engine = acc engine, CHID = $00 */
249
250		ACCW(HT_HANDL_01, (0x80000000 | NV_IMAGE_BLIT)); /* 32bit handle */
251		ACCW(HT_VALUE_01, 0x80011146); /* instance $1146, engine = acc engine, CHID = $00 */
252
253		ACCW(HT_HANDL_02, (0x80000000 | NV4_GDI_RECTANGLE_TEXT)); /* 32bit handle */
254		ACCW(HT_VALUE_02, 0x80011147); /* instance $1147, engine = acc engine, CHID = $00 */
255
256		ACCW(HT_HANDL_03, (0x80000000 | NV4_CONTEXT_SURFACES_ARGB_ZS)); /* 32bit handle (3D) */
257		ACCW(HT_VALUE_03, 0x80011148); /* instance $1148, engine = acc engine, CHID = $00 */
258
259		/* NV4_ and NV10_DX5_TEXTURE_TRIANGLE should be identical */
260		ACCW(HT_HANDL_04, (0x80000000 | NV4_DX5_TEXTURE_TRIANGLE)); /* 32bit handle (3D) */
261		ACCW(HT_VALUE_04, 0x80011149); /* instance $1149, engine = acc engine, CHID = $00 */
262
263		/* NV4_ and NV10_DX6_MULTI_TEXTURE_TRIANGLE should be identical */
264		ACCW(HT_HANDL_05, (0x80000000 | NV4_DX6_MULTI_TEXTURE_TRIANGLE)); /* 32bit handle (not used) */
265		ACCW(HT_VALUE_05, 0x8001114a); /* instance $114a, engine = acc engine, CHID = $00 */
266
267		ACCW(HT_HANDL_06, (0x80000000 | NV1_RENDER_SOLID_LIN)); /* 32bit handle (not used) */
268		ACCW(HT_VALUE_06, 0x8001114c); /* instance $114c, engine = acc engine, CHID = $00 */
269
270		/* (second set) */
271		ACCW(HT_HANDL_10, (0x80000000 | NV_ROP5_SOLID)); /* 32bit handle */
272		ACCW(HT_VALUE_10, 0x80011142); /* instance $1142, engine = acc engine, CHID = $00 */
273
274		ACCW(HT_HANDL_11, (0x80000000 | NV_IMAGE_BLACK_RECTANGLE)); /* 32bit handle */
275		ACCW(HT_VALUE_11, 0x80011143); /* instance $1143, engine = acc engine, CHID = $00 */
276
277		ACCW(HT_HANDL_12, (0x80000000 | NV_IMAGE_PATTERN)); /* 32bit handle */
278		ACCW(HT_VALUE_12, 0x80011144); /* instance $1144, engine = acc engine, CHID = $00 */
279
280		ACCW(HT_HANDL_13, (0x80000000 | NV_SCALED_IMAGE_FROM_MEMORY)); /* 32bit handle */
281		ACCW(HT_VALUE_13, 0x8001114b); /* instance $114b, engine = acc engine, CHID = $00 */
282
283		//2007 3D tests..
284		if (si->ps.card_type == NV15)
285		{
286			ACCW(HT_HANDL_14, (0x80000000 | NV_TCL_PRIMITIVE_3D)); /* 32bit handle */
287			ACCW(HT_VALUE_14, 0x8001114d); /* instance $114d, engine = acc engine, CHID = $00 */
288		}
289
290	}
291
292	/* program CTX registers: CTX1 is mostly done later (colorspace dependant) */
293	/* note:
294	 * CTX determines which HT handles point to what engine commands. */
295	/* note also:
296	 * CTX registers are in fact in the same GPU internal RAM space as the engine's
297	 * hashtable. This means that stuff programmed in here also survives resets and
298	 * power-outages! (confirmed NV11) */
299	if (si->ps.card_arch >= NV40A)
300	{
301		/* setup a DMA define for use by command defines below. */
302		ACCW(PR_CTX0_R, 0x00003000); /* DMA page table present and of linear type;
303									  * DMA target node is NVM (non-volatile memory?)
304									  * (instead of doing PCI or AGP transfers) */
305		ACCW(PR_CTX1_R, (si->ps.memory_size - 1)); /* DMA limit: size is all cardRAM */
306		ACCW(PR_CTX2_R, ((0x00000000 & 0xfffff000) | 0x00000002));
307									 /* DMA access type is READ_AND_WRITE;
308									  * memory starts at start of cardRAM (b12-31):
309									  * It's adress needs to be at a 4kb boundary! */
310		ACCW(PR_CTX3_R, 0x00000002); /* unknown (looks like this is rubbish/not needed?) */
311		/* setup set '0' for cmd NV_ROP5_SOLID */
312		ACCW(PR_CTX0_0, 0x02080043); /* NVclass $043, patchcfg ROP_AND, nv10+: little endian */
313		ACCW(PR_CTX1_0, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
314		ACCW(PR_CTX2_0, 0x00000000); /* DMA0 and DMA1 instance invalid */
315		ACCW(PR_CTX3_0, 0x00000000); /* method traps disabled */
316		ACCW(PR_CTX0_1, 0x00000000); /* extra */
317		ACCW(PR_CTX1_1, 0x00000000); /* extra */
318		/* setup set '1' for cmd NV_IMAGE_BLACK_RECTANGLE */
319		ACCW(PR_CTX0_2, 0x02080019); /* NVclass $019, patchcfg ROP_AND, nv10+: little endian */
320		ACCW(PR_CTX1_2, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
321		ACCW(PR_CTX2_2, 0x00000000); /* DMA0 and DMA1 instance invalid */
322		ACCW(PR_CTX3_2, 0x00000000); /* method traps disabled */
323		ACCW(PR_CTX0_3, 0x00000000); /* extra */
324		ACCW(PR_CTX1_3, 0x00000000); /* extra */
325		/* setup set '2' for cmd NV_IMAGE_PATTERN */
326		ACCW(PR_CTX0_4, 0x02080018); /* NVclass $018, patchcfg ROP_AND, nv10+: little endian */
327		ACCW(PR_CTX1_4, 0x02000000); /* colorspace not set, notify instance is $0200 (b16-31) */
328		ACCW(PR_CTX2_4, 0x00000000); /* DMA0 and DMA1 instance invalid */
329		ACCW(PR_CTX3_4, 0x00000000); /* method traps disabled */
330		ACCW(PR_CTX0_5, 0x00000000); /* extra */
331		ACCW(PR_CTX1_5, 0x00000000); /* extra */
332		/* setup set '4' for cmd NV12_IMAGE_BLIT */
333		ACCW(PR_CTX0_6, 0x0208009f); /* NVclass $09f, patchcfg ROP_AND, nv10+: little endian */
334		ACCW(PR_CTX1_6, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
335		ACCW(PR_CTX2_6, 0x00001140); /* DMA0 instance is $1140, DMA1 instance invalid */
336		ACCW(PR_CTX3_6, 0x00001140); /* method trap 0 is $1140, trap 1 disabled */
337		ACCW(PR_CTX0_7, 0x00000000); /* extra */
338		ACCW(PR_CTX1_7, 0x00000000); /* extra */
339		/* setup set '5' for cmd NV4_GDI_RECTANGLE_TEXT */
340		ACCW(PR_CTX0_8, 0x0208004a); /* NVclass $04a, patchcfg ROP_AND, nv10+: little endian */
341		ACCW(PR_CTX1_8, 0x02000000); /* colorspace not set, notify instance is $0200 (b16-31) */
342		ACCW(PR_CTX2_8, 0x00000000); /* DMA0 and DMA1 instance invalid */
343		ACCW(PR_CTX3_8, 0x00000000); /* method traps disabled */
344		ACCW(PR_CTX0_9, 0x00000000); /* extra */
345		ACCW(PR_CTX1_9, 0x00000000); /* extra */
346		/* setup set '6' for cmd NV10_CONTEXT_SURFACES_2D */
347		ACCW(PR_CTX0_A, 0x02080062); /* NVclass $062, nv10+: little endian */
348		ACCW(PR_CTX1_A, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
349		ACCW(PR_CTX2_A, 0x00001140); /* DMA0 instance is $1140, DMA1 instance invalid */
350		ACCW(PR_CTX3_A, 0x00001140); /* method trap 0 is $1140, trap 1 disabled */
351		ACCW(PR_CTX0_B, 0x00000000); /* extra */
352		ACCW(PR_CTX1_B, 0x00000000); /* extra */
353		/* setup set '7' for cmd NV_SCALED_IMAGE_FROM_MEMORY */
354		ACCW(PR_CTX0_C, 0x02080077); /* NVclass $077, nv10+: little endian */
355		ACCW(PR_CTX1_C, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
356		ACCW(PR_CTX2_C, 0x00001140); /* DMA0 instance is $1140, DMA1 instance invalid */
357		ACCW(PR_CTX3_C, 0x00001140); /* method trap 0 is $1140, trap 1 disabled */
358		ACCW(PR_CTX0_D, 0x00000000); /* extra */
359		ACCW(PR_CTX1_D, 0x00000000); /* extra */
360		/* setup DMA set pointed at by PF_CACH1_DMAI */
361		ACCW(PR_CTX0_E, 0x00003002); /* DMA page table present and of linear type;
362									  * DMA class is $002 (b0-11);
363									  * DMA target node is NVM (non-volatile memory?)
364									  * (instead of doing PCI or AGP transfers) */
365		ACCW(PR_CTX1_E, 0x00007fff); /* DMA limit: tablesize is 32k bytes */
366		ACCW(PR_CTX2_E, (((si->ps.memory_size - 1) & 0xffff8000) | 0x00000002));
367									 /* DMA access type is READ_AND_WRITE;
368									  * table is located at end of cardRAM (b12-31):
369									  * It's adress needs to be at a 4kb boundary! */
370	}
371	else
372	{
373		/* setup a DMA define for use by command defines below. */
374		ACCW(PR_CTX0_R, 0x00003000); /* DMA page table present and of linear type;
375									  * DMA target node is NVM (non-volatile memory?)
376									  * (instead of doing PCI or AGP transfers) */
377		ACCW(PR_CTX1_R, (si->ps.memory_size - 1)); /* DMA limit: size is all cardRAM */
378		ACCW(PR_CTX2_R, ((0x00000000 & 0xfffff000) | 0x00000002));
379									 /* DMA access type is READ_AND_WRITE;
380									  * memory starts at start of cardRAM (b12-31):
381									  * It's adress needs to be at a 4kb boundary! */
382		ACCW(PR_CTX3_R, 0x00000002); /* unknown (looks like this is rubbish/not needed?) */
383		/* setup set '0' for cmd NV_ROP5_SOLID */
384		ACCW(PR_CTX0_0, 0x01008043); /* NVclass $043, patchcfg ROP_AND, nv10+: little endian */
385		ACCW(PR_CTX1_0, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
386		ACCW(PR_CTX2_0, 0x00000000); /* DMA0 and DMA1 instance invalid */
387		ACCW(PR_CTX3_0, 0x00000000); /* method traps disabled */
388		/* setup set '1' for cmd NV_IMAGE_BLACK_RECTANGLE */
389		ACCW(PR_CTX0_1, 0x01008019); /* NVclass $019, patchcfg ROP_AND, nv10+: little endian */
390		ACCW(PR_CTX1_1, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
391		ACCW(PR_CTX2_1, 0x00000000); /* DMA0 and DMA1 instance invalid */
392		ACCW(PR_CTX3_1, 0x00000000); /* method traps disabled */
393		/* setup set '2' for cmd NV_IMAGE_PATTERN */
394		ACCW(PR_CTX0_2, 0x01008018); /* NVclass $018, patchcfg ROP_AND, nv10+: little endian */
395		ACCW(PR_CTX1_2, 0x00000002); /* colorspace not set, notify instance is $0200 (b16-31) */
396		ACCW(PR_CTX2_2, 0x00000000); /* DMA0 and DMA1 instance invalid */
397		ACCW(PR_CTX3_2, 0x00000000); /* method traps disabled */
398		/* setup set '3' for ... */
399		if(si->ps.card_arch >= NV10A)
400		{
401			/* ... cmd NV10_CONTEXT_SURFACES_2D */
402			ACCW(PR_CTX0_3, 0x01008062); /* NVclass $062, nv10+: little endian */
403		}
404		else
405		{
406			/* ... cmd NV4_SURFACE */
407			ACCW(PR_CTX0_3, 0x01008042); /* NVclass $042, nv10+: little endian */
408		}
409		ACCW(PR_CTX1_3, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
410		ACCW(PR_CTX2_3, 0x11401140); /* DMA0 instance is $1140, DMA1 instance invalid */
411		ACCW(PR_CTX3_3, 0x00000000); /* method trap 0 is $1140, trap 1 disabled */
412		/* setup set '4' for ... */
413		if (si->ps.card_type >= NV11)
414		{
415			/* ... cmd NV12_IMAGE_BLIT */
416			ACCW(PR_CTX0_4, 0x0100809f); /* NVclass $09f, patchcfg ROP_AND, nv10+: little endian */
417		}
418		else
419		{
420			/* ... cmd NV_IMAGE_BLIT */
421			ACCW(PR_CTX0_4, 0x0100805f); /* NVclass $05f, patchcfg ROP_AND, nv10+: little endian */
422		}
423		ACCW(PR_CTX1_4, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
424		ACCW(PR_CTX2_4, 0x11401140); /* DMA0 instance is $1140, DMA1 instance invalid */
425		ACCW(PR_CTX3_4, 0x00000000); /* method trap 0 is $1140, trap 1 disabled */
426		/* setup set '5' for cmd NV4_GDI_RECTANGLE_TEXT */
427		ACCW(PR_CTX0_5, 0x0100804a); /* NVclass $04a, patchcfg ROP_AND, nv10+: little endian */
428		ACCW(PR_CTX1_5, 0x00000002); /* colorspace not set, notify instance is $0200 (b16-31) */
429		ACCW(PR_CTX2_5, 0x00000000); /* DMA0 and DMA1 instance invalid */
430		ACCW(PR_CTX3_5, 0x00000000); /* method traps disabled */
431		/* setup set '6' ... */
432		if (si->ps.card_arch >= NV10A)
433		{
434			/* ... for cmd NV10_CONTEXT_SURFACES_ARGB_ZS */
435			ACCW(PR_CTX0_6, 0x00000093); /* NVclass $093, nv10+: little endian */
436		}
437		else
438		{
439			/* ... for cmd NV4_CONTEXT_SURFACES_ARGB_ZS */
440			ACCW(PR_CTX0_6, 0x00000053); /* NVclass $053, nv10+: little endian */
441		}
442		ACCW(PR_CTX1_6, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
443		ACCW(PR_CTX2_6, 0x11401140); /* DMA0, DMA1 instance = $1140 */
444		ACCW(PR_CTX3_6, 0x00000000); /* method traps disabled */
445		/* setup set '7' ... */
446		if (si->ps.card_arch >= NV10A)
447		{
448			/* ... for cmd NV10_DX5_TEXTURE_TRIANGLE */
449			ACCW(PR_CTX0_7, 0x0300a094); /* NVclass $094, patchcfg ROP_AND, userclip enable,
450										  * context surface0 valid, nv10+: little endian */
451		}
452		else
453		{
454			/* ... for cmd NV4_DX5_TEXTURE_TRIANGLE */
455			ACCW(PR_CTX0_7, 0x0300a054); /* NVclass $054, patchcfg ROP_AND, userclip enable,
456										  * context surface0 valid */
457		}
458		ACCW(PR_CTX1_7, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
459		ACCW(PR_CTX2_7, 0x11401140); /* DMA0, DMA1 instance = $1140 */
460		ACCW(PR_CTX3_7, 0x00000000); /* method traps disabled */
461		/* setup set '8' ... */
462		if (si->ps.card_arch >= NV10A)
463		{
464			/* ... for cmd NV10_DX6_MULTI_TEXTURE_TRIANGLE (not used) */
465			ACCW(PR_CTX0_8, 0x0300a095); /* NVclass $095, patchcfg ROP_AND, userclip enable,
466										  * context surface0 valid, nv10+: little endian */
467		}
468		else
469		{
470			/* ... for cmd NV4_DX6_MULTI_TEXTURE_TRIANGLE (not used) */
471			ACCW(PR_CTX0_8, 0x0300a055); /* NVclass $055, patchcfg ROP_AND, userclip enable,
472										  * context surface0 valid */
473		}
474		ACCW(PR_CTX1_8, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
475		ACCW(PR_CTX2_8, 0x11401140); /* DMA0, DMA1 instance = $1140 */
476		ACCW(PR_CTX3_8, 0x00000000); /* method traps disabled */
477		/* setup set '9' for cmd NV_SCALED_IMAGE_FROM_MEMORY */
478		ACCW(PR_CTX0_9, 0x01018077); /* NVclass $077, patchcfg SRC_COPY,
479									  * context surface0 valid, nv10+: little endian */
480		ACCW(PR_CTX1_9, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
481		ACCW(PR_CTX2_9, 0x11401140); /* DMA0, DMA1 instance = $1140 */
482		ACCW(PR_CTX3_9, 0x00000000); /* method traps disabled */
483		/* setup set 'A' for cmd NV1_RENDER_SOLID_LIN (not used) */
484		ACCW(PR_CTX0_A, 0x0300a01c); /* NVclass $01c, patchcfg ROP_AND, userclip enable,
485									  * context surface0 valid, nv10+: little endian */
486		ACCW(PR_CTX1_A, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
487		ACCW(PR_CTX2_A, 0x11401140); /* DMA0, DMA1 instance = $1140 */
488		ACCW(PR_CTX3_A, 0x00000000); /* method traps disabled */
489		//2007 3D tests..
490		/* setup set 'B' ... */
491		if (si->ps.card_type == NV15)
492		{
493			/* ... for cmd NV11_TCL_PRIMITIVE_3D */
494			ACCW(PR_CTX0_B, 0x0300a096); /* NVclass $096, patchcfg ROP_AND, userclip enable,
495										  * context surface0 valid, nv10+: little endian */
496			ACCW(PR_CTX1_B, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
497			ACCW(PR_CTX2_B, 0x11401140); /* DMA0, DMA1 instance = $1140 */
498			ACCW(PR_CTX3_B, 0x00000000); /* method traps disabled */
499		}
500		/* setup DMA set pointed at by PF_CACH1_DMAI */
501		if (si->engine.agp_mode)
502		{
503			/* DMA page table present and of linear type;
504			 * DMA class is $002 (b0-11);
505			 * DMA target node is AGP */
506			ACCW(PR_CTX0_C, 0x00033002);
507		}
508		else
509		{
510			/* DMA page table present and of linear type;
511			 * DMA class is $002 (b0-11);
512			 * DMA target node is PCI */
513			ACCW(PR_CTX0_C, 0x00023002);
514		}
515		ACCW(PR_CTX1_C, 0x000fffff); /* DMA limit: tablesize is 1M bytes */
516		ACCW(PR_CTX2_C, (((uint32)((uint8 *)(si->dma_buffer_pci))) | 0x00000002));
517									 /* DMA access type is READ_AND_WRITE;
518									  * table is located in main system RAM (b12-31):
519									  * It's adress needs to be at a 4kb boundary! */
520
521		/* set the 3D rendering functions colordepth via BPIXEL's 'depth 2' */
522		/* note:
523		 * setting a depth to 'invalid' (zero) makes the engine report
524		 * ready with drawing 'immediately'. */
525		//fixme: NV30A and above (probably) needs to be corrected...
526		switch(si->dm.space)
527		{
528		case B_CMAP8:
529			if (si->ps.card_arch < NV30A)
530				/* set depth 2: $1 = Y8 */
531				ACCW(BPIXEL, 0x00000100);
532			else
533				/* set depth 0-1: $1 = Y8, $2 = X1R5G5B5_Z1R5G5B5 */
534				ACCW(BPIXEL, 0x00000021);
535			break;
536		case B_RGB15_LITTLE:
537			if (si->ps.card_arch < NV30A)
538				/* set depth 2: $4 = A1R5G5B5 */
539				ACCW(BPIXEL, 0x00000400);
540			else
541				/* set depth 0-1: $2 = X1R5G5B5_Z1R5G5B5, $4 = A1R5G5B5 */
542				ACCW(BPIXEL, 0x00000042);
543			break;
544		case B_RGB16_LITTLE:
545			if (si->ps.card_arch < NV30A)
546				/* set depth 2: $5 = R5G6B5 */
547				ACCW(BPIXEL, 0x00000500);
548			else
549				/* set depth 0-1: $5 = R5G6B5, $a = X1A7R8G8B8_O1A7R8G8B8 */
550				ACCW(BPIXEL, 0x000000a5);
551			break;
552		case B_RGB32_LITTLE:
553		case B_RGBA32_LITTLE:
554			if (si->ps.card_arch < NV30A)
555				/* set depth 2: $c = A8R8G8B8 */
556				ACCW(BPIXEL, 0x00000c00);
557			else
558				/* set depth 0-1: $7 = X8R8G8B8_Z8R8G8B8, $e = V8YB8U8YA8 */
559				ACCW(BPIXEL, 0x000000e7);
560			break;
561		default:
562			LOG(8,("ACC: init, invalid bit depth\n"));
563			return B_ERROR;
564		}
565	}
566
567	if (si->ps.card_arch == NV04A)
568	{
569		/* do a explicit engine reset */
570		ACCW(DEBUG0, 0x000001ff);
571
572		/* init some function blocks */
573		/* DEBUG0, b20 and b21 should be high, this has a big influence on
574		 * 3D rendering speed! (on all cards, confirmed) */
575		ACCW(DEBUG0, 0x1230c000);
576		/* DEBUG1, b19 = 1 increases 3D rendering speed on TNT2 (M64) a bit,
577		 * TNT1 rendering speed stays the same (all cards confirmed) */
578		ACCW(DEBUG1, 0x72191101);
579		ACCW(DEBUG2, 0x11d5f071);
580		ACCW(DEBUG3, 0x0004ff31);
581		/* init OP methods */
582		ACCW(DEBUG3, 0x4004ff31);
583
584		/* disable all acceleration engine INT reguests */
585		ACCW(ACC_INTE, 0x00000000);
586		/* reset all acceration engine INT status bits */
587		ACCW(ACC_INTS, 0xffffffff);
588		/* context control enabled */
589		ACCW(NV04_CTX_CTRL, 0x10010100);
590		/* all acceleration buffers, pitches and colors are valid */
591		ACCW(NV04_ACC_STAT, 0xffffffff);
592		/* enable acceleration engine command FIFO */
593		ACCW(FIFO_EN, 0x00000001);
594
595		/* setup location of active screen in framebuffer */
596		ACCW(OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
597		ACCW(OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
598		/* setup accesible card memory range */
599		ACCW(BLIMIT0, (si->ps.memory_size - 1));
600		ACCW(BLIMIT1, (si->ps.memory_size - 1));
601
602		/* pattern shape value = 8x8, 2 color */
603		//fixme: not needed, unless the engine has a hardware fault (setting via cmd)!
604		//ACCW(PAT_SHP, 0x00000000);
605		/* Pgraph Beta AND value (fraction) b23-30 */
606		ACCW(BETA_AND_VAL, 0xffffffff);
607	}
608	else
609	{
610		/* do a explicit engine reset */
611		ACCW(DEBUG0, 0xffffffff);
612		ACCW(DEBUG0, 0x00000000);
613		/* disable all acceleration engine INT reguests */
614		ACCW(ACC_INTE, 0x00000000);
615		/* reset all acceration engine INT status bits */
616		ACCW(ACC_INTS, 0xffffffff);
617		/* context control enabled */
618		ACCW(NV10_CTX_CTRL, 0x10010100);
619		/* all acceleration buffers, pitches and colors are valid */
620		ACCW(NV10_ACC_STAT, 0xffffffff);
621		/* enable acceleration engine command FIFO */
622		ACCW(FIFO_EN, 0x00000001);
623		/* setup surface type:
624		 * b1-0 = %01 = surface type is non-swizzle;
625		 * this is needed to enable 3D on NV1x (confirmed) and maybe others? */
626		ACCW(NV10_SURF_TYP, ((ACCR(NV10_SURF_TYP)) & 0x0007ff00));
627		ACCW(NV10_SURF_TYP, ((ACCR(NV10_SURF_TYP)) | 0x00020101));
628	}
629
630	if (si->ps.card_arch == NV10A)
631	{
632		/* init some function blocks */
633		ACCW(DEBUG1, 0x00118700);
634		/* DEBUG2 has a big influence on 3D speed for NV11 and NV15
635		 * (confirmed b3 and b18 should both be '1' on both cards!)
636		 * (b16 should also be '1', increases 3D speed on NV11 a bit more) */
637		ACCW(DEBUG2, 0x24fd2ad9);
638		ACCW(DEBUG3, 0x55de0030);
639		/* NV10_DEBUG4 has a big influence on 3D speed for NV11, NV15 and NV18
640		 * (confirmed b14 and b15 should both be '1' on these cards!)
641		 * (confirmed b8 should be '0' on NV18 to prevent complete engine crash!) */
642		ACCW(NV10_DEBUG4, 0x0000c000);
643
644		/* copy tile setup stuff from 'source' to acc engine */
645		for (cnt = 0; cnt < 32; cnt++)
646		{
647			NV_REG32(NVACC_NV10_TIL0AD + (cnt << 2)) =
648				NV_REG32(NVACC_NV10_FBTIL0AD + (cnt << 2));
649		}
650
651		/* setup location of active screen in framebuffer */
652		ACCW(OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
653		ACCW(OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
654		/* setup accesible card memory range */
655		ACCW(BLIMIT0, (si->ps.memory_size - 1));
656		ACCW(BLIMIT1, (si->ps.memory_size - 1));
657
658		/* pattern shape value = 8x8, 2 color */
659		//fixme: not needed, unless the engine has a hardware fault (setting via cmd)!
660		//ACCW(PAT_SHP, 0x00000000);
661		/* Pgraph Beta AND value (fraction) b23-30 */
662		ACCW(BETA_AND_VAL, 0xffffffff);
663	}
664
665	if (si->ps.card_arch >= NV20A)
666	{
667		switch (si->ps.card_arch)
668		{
669		case NV40A:
670			/* init some function blocks */
671			ACCW(DEBUG1, 0x401287c0);
672			ACCW(DEBUG3, 0x60de8051);
673			/* disable specific functions, but enable SETUP_SPARE2 register */
674			ACCW(NV10_DEBUG4, 0x00008000);
675			/* set limit_viol_pix_adress(?): more likely something unknown.. */
676			ACCW(NV25_WHAT0, 0x00be3c5f);
677
678			/* setup some unknown serially accessed registers (?) */
679			tmp = (NV_REG32(NV32_NV4X_WHAT0) & 0x000000ff);
680			for (cnt = 0; (tmp && !(tmp & 0x00000001)); tmp >>= 1, cnt++);
681			{
682				ACCW(NV4X_WHAT2, cnt);
683			}
684
685			/* unknown.. */
686			switch (si->ps.card_type)
687			{
688			case NV40:
689			case NV45:
690			/* and NV48: but these are pgm'd as NV45 currently */
691				ACCW(NV40_WHAT0, 0x83280fff);
692				ACCW(NV40_WHAT1, 0x000000a0);
693				ACCW(NV40_WHAT2, 0x0078e366);
694				ACCW(NV40_WHAT3, 0x0000014c);
695				break;
696			case NV41:
697			/* and ID == 0x012x: but no cards defined yet */
698				ACCW(NV40P_WHAT0, 0x83280eff);
699				ACCW(NV40P_WHAT1, 0x000000a0);
700				ACCW(NV40P_WHAT2, 0x007596ff);
701				ACCW(NV40P_WHAT3, 0x00000108);
702				break;
703			case NV43:
704				ACCW(NV40P_WHAT0, 0x83280eff);
705				ACCW(NV40P_WHAT1, 0x000000a0);
706				ACCW(NV40P_WHAT2, 0x0072cb77);
707				ACCW(NV40P_WHAT3, 0x00000108);
708				break;
709			case NV44:
710			case G72:
711				ACCW(NV40P_WHAT0, 0x83280eff);
712				ACCW(NV40P_WHAT1, 0x000000a0);
713
714				NV_REG32(NV32_NV44_WHAT10) = NV_REG32(NV32_NV10STRAPINFO);
715				NV_REG32(NV32_NV44_WHAT11) = 0x00000000;
716				NV_REG32(NV32_NV44_WHAT12) = 0x00000000;
717				NV_REG32(NV32_NV44_WHAT13) = NV_REG32(NV32_NV10STRAPINFO);
718
719				ACCW(NV44_WHAT2, 0x00000000);
720				ACCW(NV44_WHAT3, 0x00000000);
721				break;
722/*			case NV44 type 2: (cardID 0x022x)
723				//fixme if needed: doesn't seem to need the strapinfo thing..
724				ACCW(NV40P_WHAT0, 0x83280eff);
725				ACCW(NV40P_WHAT1, 0x000000a0);
726
727				ACCW(NV44_WHAT2, 0x00000000);
728				ACCW(NV44_WHAT3, 0x00000000);
729				break;
730*/			case G70:
731			case G71:
732			case G73:
733				ACCW(NV40P_WHAT0, 0x83280eff);
734				ACCW(NV40P_WHAT1, 0x000000a0);
735				ACCW(NV40P_WHAT2, 0x07830610);
736				ACCW(NV40P_WHAT3, 0x0000016a);
737				break;
738			default:
739				ACCW(NV40P_WHAT0, 0x83280eff);
740				ACCW(NV40P_WHAT1, 0x000000a0);
741				break;
742			}
743
744			ACCW(NV10_TIL3PT, 0x2ffff800);
745			ACCW(NV10_TIL3ST, 0x00006000);
746			ACCW(NV4X_WHAT1, 0x01000000);
747			/* engine data source DMA instance = $1140 */
748			ACCW(NV4X_DMA_SRC, 0x00001140);
749			break;
750		case NV30A:
751			/* init some function blocks, but most is unknown.. */
752			ACCW(DEBUG1, 0x40108700);
753			ACCW(NV25_WHAT1, 0x00140000);
754			ACCW(DEBUG3, 0xf00e0431);
755			ACCW(NV10_DEBUG4, 0x00008000);
756			ACCW(NV25_WHAT0, 0xf04b1f36);
757			ACCW(NV20_WHAT3, 0x1002d888);
758			ACCW(NV25_WHAT2, 0x62ff007f);
759			break;
760		case NV20A:
761			/* init some function blocks, but most is unknown.. */
762			ACCW(DEBUG1, 0x00118700);
763			ACCW(DEBUG3, 0xf20e0431);
764			ACCW(NV10_DEBUG4, 0x00000000);
765			ACCW(NV20_WHAT1, 0x00000040);
766			if (si->ps.card_type < NV25)
767			{
768				ACCW(NV20_WHAT2, 0x00080000);
769				ACCW(NV10_DEBUG5, 0x00000005);
770				ACCW(NV20_WHAT3, 0x45caa208);
771				ACCW(NV20_WHAT4, 0x24000000);
772				ACCW(NV20_WHAT5, 0x00000040);
773
774				/* copy some fixed RAM(?) configuration info(?) to some indexed registers: */
775				/* b16-24 is select; b2-13 is adress in 32-bit words */
776				ACCW(RDI_INDEX, 0x00e00038);
777				/* data is 32-bit */
778				ACCW(RDI_DATA, 0x00000030);
779				/* copy some fixed RAM(?) configuration info(?) to some indexed registers: */
780				/* b16-24 is select; b2-13 is adress in 32-bit words */
781				ACCW(RDI_INDEX, 0x00e10038);
782				/* data is 32-bit */
783				ACCW(RDI_DATA, 0x00000030);
784			}
785			else
786			{
787				ACCW(NV25_WHAT1, 0x00080000);
788				ACCW(NV25_WHAT0, 0x304b1fb6);
789				ACCW(NV20_WHAT3, 0x18b82880);
790				ACCW(NV20_WHAT4, 0x44000000);
791				ACCW(NV20_WHAT5, 0x40000080);
792				ACCW(NV25_WHAT2, 0x000000ff);
793			}
794			break;
795		}
796
797		/* NV20A, NV30A and NV40A: */
798		/* copy tile setup stuff from previous setup 'source' to acc engine
799		 * (pattern colorRAM?) */
800		if ((si->ps.card_type <= NV40) || (si->ps.card_type == NV45))
801		{
802			for (cnt = 0; cnt < 32; cnt++)
803			{
804				/* copy NV10_FBTIL0AD upto/including NV10_FBTIL7ST */
805				NV_REG32(NVACC_NV20_WHAT0 + (cnt << 2)) =
806					NV_REG32(NVACC_NV10_FBTIL0AD + (cnt << 2));
807
808				/* copy NV10_FBTIL0AD upto/including NV10_FBTIL7ST */
809				NV_REG32(NVACC_NV20_2_WHAT0 + (cnt << 2)) =
810					NV_REG32(NVACC_NV10_FBTIL0AD + (cnt << 2));
811			}
812		}
813		else
814		{
815			/* NV41, 43, 44, G70 and later */
816			if (si->ps.card_type >= G70)
817			{
818				for (cnt = 0; cnt < 60; cnt++)
819				{
820					/* copy NV41_FBTIL0AD upto/including G70_FBTILEST */
821					NV_REG32(NVACC_NV41_WHAT0 + (cnt << 2)) =
822						NV_REG32(NVACC_NV41_FBTIL0AD + (cnt << 2));
823
824					/* copy NV41_FBTIL0AD upto/including G70_FBTILEST */
825					NV_REG32(NVACC_NV20_2_WHAT0 + (cnt << 2)) =
826						NV_REG32(NVACC_NV41_FBTIL0AD + (cnt << 2));
827				}
828			}
829			else
830			{
831				/* NV41, 43, 44 */
832				for (cnt = 0; cnt < 48; cnt++)
833				{
834					/* copy NV41_FBTIL0AD upto/including NV41_FBTILBST */
835					NV_REG32(NVACC_NV20_WHAT0 + (cnt << 2)) =
836						NV_REG32(NVACC_NV41_FBTIL0AD + (cnt << 2));
837
838					if (si->ps.card_type != NV44)
839					{
840						/* copy NV41_FBTIL0AD upto/including NV41_FBTILBST */
841						NV_REG32(NVACC_NV20_2_WHAT0 + (cnt << 2)) =
842							NV_REG32(NVACC_NV41_FBTIL0AD + (cnt << 2));
843					}
844				}
845			}
846		}
847
848		if (si->ps.card_arch >= NV40A)
849		{
850			if ((si->ps.card_type == NV40) || (si->ps.card_type == NV45))
851			{
852				/* copy some RAM configuration info(?) */
853 				ACCW(NV20_WHAT_T0, NV_REG32(NV32_PFB_CONFIG_0));
854				ACCW(NV20_WHAT_T1, NV_REG32(NV32_PFB_CONFIG_1));
855				ACCW(NV40_WHAT_T2, NV_REG32(NV32_PFB_CONFIG_0));
856				ACCW(NV40_WHAT_T3, NV_REG32(NV32_PFB_CONFIG_1));
857
858				/* setup location of active screen in framebuffer */
859				ACCW(NV20_OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
860				ACCW(NV20_OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
861				/* setup accesible card memory range */
862				ACCW(NV20_BLIMIT6, (si->ps.memory_size - 1));
863				ACCW(NV20_BLIMIT7, (si->ps.memory_size - 1));
864			}
865			else
866			{
867				/* NV41, 43, 44, G70 and later */
868
869				/* copy some RAM configuration info(?) */
870				if (si->ps.card_type >= G70)
871				{
872					ACCW(G70_WHAT_T0, NV_REG32(NV32_PFB_CONFIG_0));
873					ACCW(G70_WHAT_T1, NV_REG32(NV32_PFB_CONFIG_1));
874				}
875				else
876				{
877					/* NV41, 43, 44 */
878					ACCW(NV40P_WHAT_T0, NV_REG32(NV32_PFB_CONFIG_0));
879					ACCW(NV40P_WHAT_T1, NV_REG32(NV32_PFB_CONFIG_1));
880				}
881				ACCW(NV40P_WHAT_T2, NV_REG32(NV32_PFB_CONFIG_0));
882				ACCW(NV40P_WHAT_T3, NV_REG32(NV32_PFB_CONFIG_1));
883
884				/* setup location of active screen in framebuffer */
885				ACCW(NV40P_OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
886				ACCW(NV40P_OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
887				/* setup accesible card memory range */
888				ACCW(NV40P_BLIMIT6, (si->ps.memory_size - 1));
889				ACCW(NV40P_BLIMIT7, (si->ps.memory_size - 1));
890			}
891		}
892		else /* NV20A and NV30A: */
893		{
894			/* copy some RAM configuration info(?) */
895			ACCW(NV20_WHAT_T0, NV_REG32(NV32_PFB_CONFIG_0));
896			ACCW(NV20_WHAT_T1, NV_REG32(NV32_PFB_CONFIG_1));
897			/* copy some RAM configuration info(?) to some indexed registers: */
898			/* b16-24 is select; b2-13 is adress in 32-bit words */
899			ACCW(RDI_INDEX, 0x00ea0000);
900			/* data is 32-bit */
901			ACCW(RDI_DATA, NV_REG32(NV32_PFB_CONFIG_0));
902			/* b16-24 is select; b2-13 is adress in 32-bit words */
903			ACCW(RDI_INDEX, 0x00ea0004);
904			/* data is 32-bit */
905			ACCW(RDI_DATA, NV_REG32(NV32_PFB_CONFIG_1));
906
907			/* setup location of active screen in framebuffer */
908			ACCW(NV20_OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
909			ACCW(NV20_OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
910			/* setup accesible card memory range */
911			ACCW(NV20_BLIMIT6, (si->ps.memory_size - 1));
912			ACCW(NV20_BLIMIT7, (si->ps.memory_size - 1));
913		}
914
915		/* NV20A, NV30A and NV40A: */
916		/* setup some acc engine tile stuff */
917		ACCW(NV10_TIL2AD, 0x00000000);
918		ACCW(NV10_TIL0ED, 0xffffffff);
919	}
920
921	/* all cards: */
922	/* setup clipping: rect size is 32768 x 32768, probably max. setting */
923	/* note:
924	 * can also be done via the NV_IMAGE_BLACK_RECTANGLE engine command. */
925	ACCW(ABS_UCLP_XMIN, 0x00000000);
926	ACCW(ABS_UCLP_YMIN, 0x00000000);
927	ACCW(ABS_UCLP_XMAX, 0x00007fff);
928	ACCW(ABS_UCLP_YMAX, 0x00007fff);
929
930	/* setup sync parameters for NV12_IMAGE_BLIT command for the current mode:
931	 * values given are CRTC vertical counter limit values. The NV12 command will wait
932	 * for the specified's CRTC's vertical counter to be in between the given values */
933	if (si->ps.card_type >= NV11)
934	{
935		ACCW(NV11_CRTC_LO, si->dm.timing.v_display - 1);
936		ACCW(NV11_CRTC_HI, si->dm.timing.v_display + 1);
937	}
938
939	/*** PFIFO ***/
940	/* (setup caches) */
941	/* disable caches reassign */
942	ACCW(PF_CACHES, 0x00000000);
943	/* PFIFO mode: channel 0 is in DMA mode, channels 1 - 32 are in PIO mode */
944	ACCW(PF_MODE, 0x00000001);
945	/* cache1 push0 access disabled */
946	ACCW(PF_CACH1_PSH0, 0x00000000);
947	/* cache1 pull0 access disabled */
948	ACCW(PF_CACH1_PUL0, 0x00000000);
949	/* cache1 push1 mode = DMA */
950	if (si->ps.card_arch >= NV40A)
951		ACCW(PF_CACH1_PSH1, 0x00010000);
952	else
953		ACCW(PF_CACH1_PSH1, 0x00000100);
954	/* cache1 DMA Put offset = 0 (b2-28) */
955	ACCW(PF_CACH1_DMAP, 0x00000000);
956	/* cache1 DMA Get offset = 0 (b2-28) */
957	ACCW(PF_CACH1_DMAG, 0x00000000);
958	/* cache1 DMA instance adress = $114e (b0-15);
959	 * instance being b4-19 with baseadress NV_PRAMIN_CTX_0 (0x00700000). */
960	/* note:
961	 * should point to a DMA definition in CTX register space (which is sort of RAM).
962	 * This define tells the engine where the DMA cmd buffer is and what it's size is.
963	 * Inside that cmd buffer you'll find the actual issued engine commands. */
964	if (si->ps.card_arch >= NV40A)
965		ACCW(PF_CACH1_DMAI, 0x00001150);
966	else
967		//2007 3d test..
968		ACCW(PF_CACH1_DMAI, 0x0000114e);
969	/* cache0 push0 access disabled */
970	ACCW(PF_CACH0_PSH0, 0x00000000);
971	/* cache0 pull0 access disabled */
972	ACCW(PF_CACH0_PUL0, 0x00000000);
973	/* RAM HT (hash table) baseadress = $10000 (b4-8), size = 4k,
974	 * search = 128 (is byte offset between hash 'sets') */
975	/* note:
976	 * so HT base is $00710000, last is $00710fff.
977	 * In this space you define the engine command handles (HT_HANDL_XX), which
978	 * in turn points to the defines in CTX register space (which is sort of RAM) */
979	ACCW(PF_RAMHT, 0x03000100);
980	/* RAM FC baseadress = $11000 (b3-8) (size is fixed to 0.5k(?)) */
981	/* note:
982	 * so FC base is $00711000, last is $007111ff. (not used?) */
983	ACCW(PF_RAMFC, 0x00000110);
984	/* RAM RO baseadress = $11200 (b1-8), size = 0.5k */
985	/* note:
986	 * so RO base is $00711200, last is $007113ff. (not used?) */
987	/* note also:
988	 * This means(?) the PRAMIN CTX registers are accessible from base $00711400. */
989	ACCW(PF_RAMRO, 0x00000112);
990	/* PFIFO size: ch0-15 = 512 bytes, ch16-31 = 124 bytes */
991	ACCW(PF_SIZE, 0x0000ffff);
992	/* cache1 hash instance = $ffff (b0-15) */
993	ACCW(PF_CACH1_HASH, 0x0000ffff);
994	/* disable all PFIFO INTs */
995	ACCW(PF_INTEN, 0x00000000);
996	/* reset all PFIFO INT status bits */
997	ACCW(PF_INTSTAT, 0xffffffff);
998	/* cache0 pull0 engine = acceleration engine (graphics) */
999	ACCW(PF_CACH0_PUL1, 0x00000001);
1000	/* cache1 DMA control: disable some stuff */
1001	ACCW(PF_CACH1_DMAC, 0x00000000);
1002	/* cache1 engine 0 upto/including 7 is software (could also be graphics or DVD) */
1003	ACCW(PF_CACH1_ENG, 0x00000000);
1004	/* cache1 DMA fetch: trigger at 128 bytes, size is 32 bytes, max requests is 15,
1005	 * use little endian */
1006	ACCW(PF_CACH1_DMAF, 0x000f0078);
1007	/* cache1 DMA push: b0 = 1: access is enabled */
1008	ACCW(PF_CACH1_DMAS, 0x00000001);
1009	/* cache1 push0 access enabled */
1010	ACCW(PF_CACH1_PSH0, 0x00000001);
1011	/* cache1 pull0 access enabled */
1012	ACCW(PF_CACH1_PUL0, 0x00000001);
1013	/* cache1 pull1 engine = acceleration engine (graphics) */
1014	ACCW(PF_CACH1_PUL1, 0x00000001);
1015	/* enable PFIFO caches reassign */
1016	ACCW(PF_CACHES, 0x00000001);
1017
1018	/* setup 3D specifics */
1019	nv_init_for_3D_dma();
1020
1021	/*** init acceleration engine command info ***/
1022	/* set object handles */
1023	/* note:
1024	 * probably depending on some other setup, there are 8 or 32 FIFO channels
1025	 * available. Assuming the current setup only has 8 channels because the 'rest'
1026	 * isn't setup here... */
1027	si->engine.fifo.handle[0] = NV_ROP5_SOLID;
1028	si->engine.fifo.handle[1] = NV_IMAGE_BLACK_RECTANGLE;
1029	si->engine.fifo.handle[2] = NV_IMAGE_PATTERN;
1030	si->engine.fifo.handle[3] = NV4_SURFACE; /* NV10_CONTEXT_SURFACES_2D is identical */
1031	si->engine.fifo.handle[4] = NV_IMAGE_BLIT;
1032	si->engine.fifo.handle[5] = NV4_GDI_RECTANGLE_TEXT;
1033	si->engine.fifo.handle[6] = NV4_CONTEXT_SURFACES_ARGB_ZS;//NV1_RENDER_SOLID_LIN;
1034	si->engine.fifo.handle[7] = NV4_DX5_TEXTURE_TRIANGLE;
1035	/* preset no FIFO channels assigned to cmd's */
1036	for (cnt = 0; cnt < 0x20; cnt++)
1037	{
1038		si->engine.fifo.ch_ptr[cnt] = 0;
1039	}
1040	/* set handle's pointers to their assigned FIFO channels */
1041	/* note:
1042	 * b0-1 aren't used as adressbits. Using b0 to indicate a valid pointer. */
1043	for (cnt = 0; cnt < 0x08; cnt++)
1044	{
1045		si->engine.fifo.ch_ptr[(si->engine.fifo.handle[cnt])] =
1046												(0x00000001 + (cnt * 0x00002000));
1047	}
1048
1049	/*** init DMA command buffer info ***/
1050	if (si->ps.card_arch >= NV40A) //main mem DMA buf on pre-NV40
1051	{
1052		si->dma_buffer = (void *)((char *)si->framebuffer +
1053			((si->ps.memory_size - 1) & 0xffff8000));
1054	}
1055	LOG(4,("ACC_DMA: command buffer is at adress $%08x\n",
1056		((uint32)(si->dma_buffer))));
1057	/* we have issued no DMA cmd's to the engine yet */
1058	si->engine.dma.put = 0;
1059	/* the current first free adress in the DMA buffer is at offset 0 */
1060	si->engine.dma.current = 0;
1061	/* the DMA buffer can hold 8k 32-bit words (it's 32kb in size),
1062	 * or 256k 32-bit words (1Mb in size) dependant on architecture (for now) */
1063	/* note:
1064	 * one word is reserved at the end of the DMA buffer to be able to instruct the
1065	 * engine to do a buffer wrap-around!
1066	 * (DMA opcode 'noninc method': issue word $20000000.) */
1067	if (si->ps.card_arch < NV40A)
1068		si->engine.dma.max = ((1 * 1024 * 1024) >> 2) - 1;
1069	else
1070		si->engine.dma.max = 8192 - 1;
1071	/* note the current free space we have left in the DMA buffer */
1072	si->engine.dma.free = si->engine.dma.max - si->engine.dma.current;
1073
1074	/*** init FIFO via DMA command buffer. ***/
1075	/* wait for room in fifo for new FIFO assigment cmds if needed: */
1076	if (si->ps.card_arch >= NV40A)
1077	{
1078		if (nv_acc_fifofree_dma(12) != B_OK) return B_ERROR;
1079	}
1080	else
1081	{
1082		if (nv_acc_fifofree_dma(16) != B_OK) return B_ERROR;
1083	}
1084
1085	/* program new FIFO assignments */
1086	/* Raster OPeration: */
1087	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH0, si->engine.fifo.handle[0]);
1088	/* Clip: */
1089	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH1, si->engine.fifo.handle[1]);
1090	/* Pattern: */
1091	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH2, si->engine.fifo.handle[2]);
1092	/* 2D Surfaces: */
1093	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH3, si->engine.fifo.handle[3]);
1094	/* Blit: */
1095	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH4, si->engine.fifo.handle[4]);
1096	/* Bitmap: */
1097	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH5, si->engine.fifo.handle[5]);
1098	if (si->ps.card_arch < NV40A)
1099	{
1100		/* 3D surfaces: (3D related only) */
1101		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH6, si->engine.fifo.handle[6]);
1102		/* Textured Triangle: (3D only) */
1103		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH7, si->engine.fifo.handle[7]);
1104	}
1105
1106	/*** Set pixel width ***/
1107	switch(si->dm.space)
1108	{
1109	case B_CMAP8:
1110		surf_depth = 0x00000001;
1111		cmd_depth = 0x00000003;
1112		break;
1113	case B_RGB15_LITTLE:
1114	case B_RGB16_LITTLE:
1115		surf_depth = 0x00000004;
1116		cmd_depth = 0x00000001;
1117		break;
1118	case B_RGB32_LITTLE:
1119	case B_RGBA32_LITTLE:
1120		surf_depth = 0x00000006;
1121		cmd_depth = 0x00000003;
1122		break;
1123	default:
1124		LOG(8,("ACC_DMA: init, invalid bit depth\n"));
1125		return B_ERROR;
1126	}
1127
1128	/* wait for room in fifo for surface setup cmd if needed */
1129	if (nv_acc_fifofree_dma(5) != B_OK) return B_ERROR;
1130	/* now setup 2D surface (writing 5 32bit words) */
1131	nv_acc_cmd_dma(NV4_SURFACE, NV4_SURFACE_FORMAT, 4);
1132	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = surf_depth; /* Format */
1133	/* setup screen pitch */
1134	((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1135		((si->fbc.bytes_per_row & 0x0000ffff) | (si->fbc.bytes_per_row << 16)); /* Pitch */
1136	/* setup screen location */
1137	((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1138		((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer); /* OffsetSource */
1139	((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1140		((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer); /* OffsetDest */
1141
1142	/* wait for room in fifo for pattern colordepth setup cmd if needed */
1143	if (nv_acc_fifofree_dma(2) != B_OK) return B_ERROR;
1144	/* set pattern colordepth (writing 2 32bit words) */
1145	nv_acc_cmd_dma(NV_IMAGE_PATTERN, NV_IMAGE_PATTERN_SETCOLORFORMAT, 1);
1146	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
1147
1148	/* wait for room in fifo for bitmap colordepth setup cmd if needed */
1149	if (nv_acc_fifofree_dma(2) != B_OK) return B_ERROR;
1150	/* set bitmap colordepth (writing 2 32bit words) */
1151	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_SETCOLORFORMAT, 1);
1152	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
1153
1154	/* Load our pattern into the engine: */
1155	/* wait for room in fifo for pattern cmd if needed. */
1156	if (nv_acc_fifofree_dma(7) != B_OK) return B_ERROR;
1157	/* now setup pattern (writing 7 32bit words) */
1158	nv_acc_cmd_dma(NV_IMAGE_PATTERN, NV_IMAGE_PATTERN_SETSHAPE, 1);
1159	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000000; /* SetShape: 0 = 8x8, 1 = 64x1, 2 = 1x64 */
1160	nv_acc_cmd_dma(NV_IMAGE_PATTERN, NV_IMAGE_PATTERN_SETCOLOR0, 4);
1161	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetColor0 */
1162	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetColor1 */
1163	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetPattern[0] */
1164	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetPattern[1] */
1165
1166	/* tell the engine to fetch and execute all (new) commands in the DMA buffer */
1167	nv_start_dma();
1168
1169	return B_OK;
1170}
1171
/* Prepare the acceleration engine for 3D use.
 *
 * On NV10A and later architectures this programs a block of (largely unknown)
 * PGRAPH registers, opens up all eight window clipping rectangles to their
 * maximum range, loads the 'pipe' with its initial contents via the
 * NV10_PIPEADR/NV10_PIPEDAT register pair and finally sets the global
 * Z-buffer/DMA-instance state. On older architectures nothing is done.
 *
 * Takes no arguments and returns nothing; all work consists of register
 * writes done through the ACCW() macro. The order of the writes is kept
 * exactly as found: the pipe uploads depend on a PIPEADR write being followed
 * directly by its PIPEDAT writes.
 *
 * NOTE(review): many PIPEDAT values match IEEE-754 single precision encodings
 * (0x3f800000 = 1.0, 0xbf800000 = -1.0, 0x40000000 = 2.0, 0xc5000000 = -2048.0),
 * so they are presumably floating point constants - confirm against hardware
 * documentation before changing any of them. */
static void nv_init_for_3D_dma(void)
{
	/* setup PGRAPH unknown registers and modify (pre-cleared) pipe stuff for 3D use */
	/* (pre-NV10A architectures are skipped entirely) */
	if (si->ps.card_arch >= NV10A)
	{
		/* setup unknown PGRAPH stuff */
		ACCW(PGWHAT_00, 0x00000000);
		ACCW(PGWHAT_01, 0x00000000);
		ACCW(PGWHAT_02, 0x00000000);
		ACCW(PGWHAT_03, 0x00000000);

		ACCW(PGWHAT_04, 0x00001000);
		ACCW(PGWHAT_05, 0x00001000);
		ACCW(PGWHAT_06, 0x4003ff80);

		ACCW(PGWHAT_07, 0x00000000);
		ACCW(PGWHAT_08, 0x00000000);
		ACCW(PGWHAT_09, 0x00000000);
		ACCW(PGWHAT_0A, 0x00000000);
		ACCW(PGWHAT_0B, 0x00000000);

		ACCW(PGWHAT_0C, 0x00080008);
		ACCW(PGWHAT_0D, 0x00080008);

		ACCW(PGWHAT_0E, 0x00000000);
		ACCW(PGWHAT_0F, 0x00000000);
		ACCW(PGWHAT_10, 0x00000000);
		ACCW(PGWHAT_11, 0x00000000);
		ACCW(PGWHAT_12, 0x00000000);
		ACCW(PGWHAT_13, 0x00000000);
		ACCW(PGWHAT_14, 0x00000000);
		ACCW(PGWHAT_15, 0x00000000);
		ACCW(PGWHAT_16, 0x00000000);
		ACCW(PGWHAT_17, 0x00000000);
		ACCW(PGWHAT_18, 0x00000000);

		ACCW(PGWHAT_19, 0x10000000);

		ACCW(PGWHAT_1A, 0x00000000);
		ACCW(PGWHAT_1B, 0x00000000);
		ACCW(PGWHAT_1C, 0x00000000);
		ACCW(PGWHAT_1D, 0x00000000);
		ACCW(PGWHAT_1E, 0x00000000);
		ACCW(PGWHAT_1F, 0x00000000);
		ACCW(PGWHAT_20, 0x00000000);
		ACCW(PGWHAT_21, 0x00000000);

		ACCW(PGWHAT_22, 0x08000000);

		ACCW(PGWHAT_23, 0x00000000);
		ACCW(PGWHAT_24, 0x00000000);
		ACCW(PGWHAT_25, 0x00000000);
		ACCW(PGWHAT_26, 0x00000000);

		ACCW(PGWHAT_27, 0x4b7fffff);

		ACCW(PGWHAT_28, 0x00000000);
		ACCW(PGWHAT_29, 0x00000000);
		ACCW(PGWHAT_2A, 0x00000000);

		/* setup window clipping */
		/* b0-11 = min; b16-27 = max.
		 * note:
		 * probably two's complement values, so setting to max range here:
		 * which would be -2048 upto/including +2047
		 * (min field = $800, max field = $7ff in the value written below). */
		/* horizontal: all eight clip windows get the same full range */
		ACCW(WINCLIP_H_0, 0x07ff0800);
		ACCW(WINCLIP_H_1, 0x07ff0800);
		ACCW(WINCLIP_H_2, 0x07ff0800);
		ACCW(WINCLIP_H_3, 0x07ff0800);
		ACCW(WINCLIP_H_4, 0x07ff0800);
		ACCW(WINCLIP_H_5, 0x07ff0800);
		ACCW(WINCLIP_H_6, 0x07ff0800);
		ACCW(WINCLIP_H_7, 0x07ff0800);
		/* vertical: all eight clip windows get the same full range */
		ACCW(WINCLIP_V_0, 0x07ff0800);
		ACCW(WINCLIP_V_1, 0x07ff0800);
		ACCW(WINCLIP_V_2, 0x07ff0800);
		ACCW(WINCLIP_V_3, 0x07ff0800);
		ACCW(WINCLIP_V_4, 0x07ff0800);
		ACCW(WINCLIP_V_5, 0x07ff0800);
		ACCW(WINCLIP_V_6, 0x07ff0800);
		ACCW(WINCLIP_V_7, 0x07ff0800);

		/* setup (initialize) pipe:
		 * needed to get valid 3D rendering on (at least) NV1x cards. Without this
		 * those cards produce rubbish instead of 3D, although the engine itself keeps
		 * running and 2D stays OK. */

		/* set eyetype to local, lighting etc. is off
		 * (lighting is switched on again once the pipe has been loaded, see the
		 * second NV10_XFMOD0 write near the end of this routine) */
		ACCW(NV10_XFMOD0, 0x10000000);
		/* disable all lights */
		ACCW(NV10_XFMOD1, 0x00000000);

		/* note: upon writing data into the PIPEDAT register, the PIPEADR is
		 * probably auto-incremented! */
		/* (pipe address = b2-16, pipe data = b0-31) */
		/* note: pipe addresses IGRAPH registers! */
		/* each group below selects a pipe start address and then streams one to
		 * four data words to it */
		ACCW(NV10_PIPEADR, 0x00006740);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x3f800000);

		ACCW(NV10_PIPEADR, 0x00006750);
		ACCW(NV10_PIPEDAT, 0x40000000);
		ACCW(NV10_PIPEDAT, 0x40000000);
		ACCW(NV10_PIPEDAT, 0x40000000);
		ACCW(NV10_PIPEDAT, 0x40000000);

		ACCW(NV10_PIPEADR, 0x00006760);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x3f800000);
		ACCW(NV10_PIPEDAT, 0x00000000);

		ACCW(NV10_PIPEADR, 0x00006770);
		ACCW(NV10_PIPEDAT, 0xc5000000);
		ACCW(NV10_PIPEDAT, 0xc5000000);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);

		ACCW(NV10_PIPEADR, 0x00006780);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x3f800000);
		ACCW(NV10_PIPEDAT, 0x00000000);

		ACCW(NV10_PIPEADR, 0x000067a0);
		ACCW(NV10_PIPEDAT, 0x3f800000);
		ACCW(NV10_PIPEDAT, 0x3f800000);
		ACCW(NV10_PIPEDAT, 0x3f800000);
		ACCW(NV10_PIPEDAT, 0x3f800000);

		ACCW(NV10_PIPEADR, 0x00006ab0);
		ACCW(NV10_PIPEDAT, 0x3f800000);
		ACCW(NV10_PIPEDAT, 0x3f800000);
		ACCW(NV10_PIPEDAT, 0x3f800000);

		ACCW(NV10_PIPEADR, 0x00006ac0);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);

		ACCW(NV10_PIPEADR, 0x00006c10);
		ACCW(NV10_PIPEDAT, 0xbf800000);

		/* eight consecutive pipe slots ($7030 - $70a0, $10 apart) all receive
		 * the same single data word */
		ACCW(NV10_PIPEADR, 0x00007030);
		ACCW(NV10_PIPEDAT, 0x7149f2ca);

		ACCW(NV10_PIPEADR, 0x00007040);
		ACCW(NV10_PIPEDAT, 0x7149f2ca);

		ACCW(NV10_PIPEADR, 0x00007050);
		ACCW(NV10_PIPEDAT, 0x7149f2ca);

		ACCW(NV10_PIPEADR, 0x00007060);
		ACCW(NV10_PIPEDAT, 0x7149f2ca);

		ACCW(NV10_PIPEADR, 0x00007070);
		ACCW(NV10_PIPEDAT, 0x7149f2ca);

		ACCW(NV10_PIPEADR, 0x00007080);
		ACCW(NV10_PIPEDAT, 0x7149f2ca);

		ACCW(NV10_PIPEADR, 0x00007090);
		ACCW(NV10_PIPEDAT, 0x7149f2ca);

		ACCW(NV10_PIPEADR, 0x000070a0);
		ACCW(NV10_PIPEDAT, 0x7149f2ca);

		ACCW(NV10_PIPEADR, 0x00006a80);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x3f800000);

		ACCW(NV10_PIPEADR, 0x00006aa0);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);

		/* select primitive type that will be drawn (tri's) */
		ACCW(NV10_PIPEADR, 0x00000040);
		ACCW(NV10_PIPEDAT, 0x00000005);

		ACCW(NV10_PIPEADR, 0x00006400);
		ACCW(NV10_PIPEDAT, 0x3f800000);
		ACCW(NV10_PIPEDAT, 0x3f800000);
		ACCW(NV10_PIPEDAT, 0x4b7fffff);
		ACCW(NV10_PIPEDAT, 0x00000000);

		ACCW(NV10_PIPEADR, 0x00006410);
		ACCW(NV10_PIPEDAT, 0xc5000000);
		ACCW(NV10_PIPEDAT, 0xc5000000);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);

		ACCW(NV10_PIPEADR, 0x00006420);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);

		ACCW(NV10_PIPEADR, 0x00006430);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);

		ACCW(NV10_PIPEADR, 0x000064c0);
		ACCW(NV10_PIPEDAT, 0x3f800000);
		ACCW(NV10_PIPEDAT, 0x3f800000);
		ACCW(NV10_PIPEDAT, 0x477fffff);
		ACCW(NV10_PIPEDAT, 0x3f800000);

		ACCW(NV10_PIPEADR, 0x000064d0);
		ACCW(NV10_PIPEDAT, 0xc5000000);
		ACCW(NV10_PIPEDAT, 0xc5000000);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);

		ACCW(NV10_PIPEADR, 0x000064e0);
		ACCW(NV10_PIPEDAT, 0xc4fff000);
		ACCW(NV10_PIPEDAT, 0xc4fff000);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);

		ACCW(NV10_PIPEADR, 0x000064f0);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);
		ACCW(NV10_PIPEDAT, 0x00000000);

		/* turn lighting on */
		ACCW(NV10_XFMOD0, 0x30000000);
		/* set light 1 to infinite type, other lights remain off */
		ACCW(NV10_XFMOD1, 0x00000004);

		/* Z-buffer state is:
		 * initialized, set to: 'fixed point' (integer?); Z-buffer; 16bits depth */
		/* note:
		 * other options possible are: floating point; 24bits depth; W-buffer */
		ACCW(GLOB_STAT_0, 0x10000000);
		/* set DMA instance 2 and 3 to be invalid */
		ACCW(GLOB_STAT_1, 0x00000000);
	}
}
1419
1420static void nv_start_dma(void)
1421{
1422	uint32 dummy;
1423
1424	if (si->engine.dma.current != si->engine.dma.put)
1425	{
1426		si->engine.dma.put = si->engine.dma.current;
1427		/* flush used caches so we know for sure the DMA cmd buffer received all data. */
1428		if (si->ps.card_arch < NV40A)
1429		{
1430			/* some CPU's support out-of-order processing (WinChip/Cyrix). Flush them. */
1431			__asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
1432			/* read a non-cached adress to flush the cash */
1433			dummy = ACCR(STATUS);
1434		}
1435		else
1436		{
1437			/* dummy read the first adress of the framebuffer to flush MTRR-WC buffers */
1438			dummy = *((volatile uint32 *)(si->framebuffer));
1439		}
1440
1441		/* actually start DMA to execute all commands now in buffer */
1442		/* note:
1443		 * it doesn't matter which FIFO channel's DMA registers we access, they are in
1444		 * fact all the same set. It also doesn't matter if the channel was assigned a
1445		 * command or not. */
1446		/* note also:
1447		 * NV_GENERAL_DMAPUT is a write-only register on some cards (confirmed NV11). */
1448		NV_REG32(NVACC_FIFO + NV_GENERAL_DMAPUT) = (si->engine.dma.put << 2);
1449	}
1450}
1451
1452/* this routine does not check the engine's internal hardware FIFO, but the DMA
1453 * command buffer. You can see this as a FIFO as well, that feeds the hardware FIFO.
1454 * The hardware FIFO state is checked by the DMA hardware automatically. */
1455static status_t nv_acc_fifofree_dma(uint16 cmd_size)
1456{
1457	uint32 dmaget;
1458
1459	/* we'd better check for timeouts on the DMA engine as it's theoretically
1460	 * breakable by malfunctioning software */
1461	uint16 cnt = 0;
1462
1463	/* check if the DMA buffer has enough room for the command.
1464	 * note:
1465	 * engine.dma.free is 'cached' */
1466	while ((si->engine.dma.free < cmd_size) && (cnt < 10000) && (err < 3))
1467	{
1468		/* see where the engine is currently fetching from the buffer */
1469		/* note:
1470		 * read this only once in the code as accessing registers is relatively slow */
1471		/* note also:
1472		 * it doesn't matter which FIFO channel's DMA registers we access, they are in
1473		 * fact all the same set. It also doesn't matter if the channel was assigned a
1474		 * command or not. */
1475		dmaget = ((NV_REG32(NVACC_FIFO + NV_GENERAL_DMAGET)) >> 2);
1476
1477		/* update timeout counter: on NV11 on a Pentium4 2.8Ghz max reached count
1478		 * using BeRoMeter 1.2.6 was about 600; so counting 10000 before generating
1479		 * a timeout should definately do it. Snooze()-ing cannot be done without a
1480		 * serious speed penalty, even if done for only 1 microSecond. */
1481		cnt++;
1482
1483		/* where's the engine fetching viewed from us issuing? */
1484		if (si->engine.dma.put >= dmaget)
1485		{
1486			/* engine is fetching 'behind us', the last piece of the buffer is free */
1487
1488			/* note the 'updated' free space we have in the DMA buffer */
1489			si->engine.dma.free = si->engine.dma.max - si->engine.dma.current;
1490			/* if it's enough after all we exit this routine immediately. Else: */
1491			if (si->engine.dma.free < cmd_size)
1492			{
1493				/* not enough room left, so instruct DMA engine to reset the buffer
1494				 * when it's reaching the end of it */
1495				((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x20000000;
1496				/* reset our buffer pointer, so new commands will be placed at the
1497				 * beginning of the buffer. */
1498				si->engine.dma.current = 0;
1499				/* tell the engine to fetch the remaining command(s) in the DMA buffer
1500				 * that where not executed before. */
1501				nv_start_dma();
1502
1503				/* NOW the engine is fetching 'in front of us', so the first piece
1504				 * of the buffer is free */
1505
1506				/* note the updated current free space we have in the DMA buffer */
1507				si->engine.dma.free = dmaget - si->engine.dma.current;
1508				/* mind this pittfall:
1509				 * Leave some room between where the engine is fetching and where we
1510				 * put new commands. Otherwise the engine will crash on heavy loads.
1511				 * A crash can be forced best in 640x480x32 mode with BeRoMeter 1.2.6.
1512				 * (confirmed on NV11 and NV43 with less than 256 words forced freespace.)
1513				 * Note:
1514				 * The engine is DMA triggered for fetching chunks every 128 bytes,
1515				 * maybe this is the reason for this behaviour.
1516				 * Note also:
1517				 * it looks like the space that needs to be kept free is coupled
1518				 * with the size of the DMA buffer. */
1519				if (si->engine.dma.free < 256)
1520					si->engine.dma.free = 0;
1521				else
1522					si->engine.dma.free -= 256;
1523			}
1524		}
1525		else
1526		{
1527			/* engine is fetching 'in front of us', so the first piece of the buffer
1528			 * is free */
1529
1530			/* note the updated current free space we have in the DMA buffer */
1531			si->engine.dma.free = dmaget - si->engine.dma.current;
1532			/* mind this pittfall:
1533			 * Leave some room between where the engine is fetching and where we
1534			 * put new commands. Otherwise the engine will crash on heavy loads.
1535			 * A crash can be forced best in 640x480x32 mode with BeRoMeter 1.2.6.
1536			 * (confirmed on NV11 and NV43 with less than 256 words forced freespace.)
1537			 * Note:
1538			 * The engine is DMA triggered for fetching chunks every 128 bytes,
1539			 * maybe this is the reason for this behaviour.
1540			 * Note also:
1541			 * it looks like the space that needs to be kept free is coupled
1542			 * with the size of the DMA buffer. */
1543			if (si->engine.dma.free < 256)
1544				si->engine.dma.free = 0;
1545			else
1546				si->engine.dma.free -= 256;
1547		}
1548	}
1549
1550	/* log timeout if we had one */
1551	if (cnt == 10000)
1552	{
1553		if (err < 3) err++;
1554		LOG(4,("ACC_DMA: fifofree; DMA timeout #%d, engine trouble!\n", err));
1555	}
1556
1557	/* we must make the acceleration routines abort or the driver will hang! */
1558	if (err >= 3) return B_ERROR;
1559
1560	return B_OK;
1561}
1562
1563static void nv_acc_cmd_dma(uint32 cmd, uint16 offset, uint16 size)
1564{
1565	/* NV_FIFO_DMA_OPCODE: set number of cmd words (b18 - 28); set FIFO offset for
1566	 * first cmd word (b2 - 15); set DMA opcode = method (b29 - 31).
1567	 * a 'NOP' is the opcode word $00000000. */
1568	/* note:
1569	 * possible DMA opcodes:
1570	 * b'000' is 'method' (execute cmd);
1571	 * b'001' is 'jump';
1572	 * b'002' is 'noninc method' (execute buffer wrap-around);
1573	 * b'003' is 'call': return is executed by opcode word $00020000 (b17 = 1). */
1574	/* note also:
1575	 * this system uses auto-increments for the FIFO offset adresses. Make sure
1576	 * to set a new adress if a gap exists between the previous one and the new one. */
1577	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = ((size << 18) |
1578		((si->engine.fifo.ch_ptr[cmd] + offset) & 0x0000fffc));
1579
1580	/* space left after issuing the current command is the cmd AND it's arguments less */
1581	si->engine.dma.free -= (size + 1);
1582}
1583
1584static void nv_acc_set_ch_dma(uint16 ch, uint32 handle)
1585{
1586	/* issue FIFO channel assign cmd */
1587	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = ((1 << 18) | ch);
1588	/* set new assignment */
1589	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = (0x80000000 | handle);
1590
1591	/* space left after issuing the current command is the cmd AND it's arguments less */
1592	si->engine.dma.free -= 2;
1593}
1594
/* note:
 * switching fifo channel assignments this way has no noticeable slowdown:
 * measured 0.2% with Quake2. */
1598void nv_acc_assert_fifo_dma(void)
1599{
1600	/* does every engine cmd this accelerant needs have a FIFO channel? */
1601	//fixme: can probably be optimized for both speed and channel selection...
1602	if (!si->engine.fifo.ch_ptr[NV_ROP5_SOLID] ||
1603		!si->engine.fifo.ch_ptr[NV_IMAGE_BLACK_RECTANGLE] ||
1604		!si->engine.fifo.ch_ptr[NV_IMAGE_PATTERN] ||
1605		!si->engine.fifo.ch_ptr[NV4_SURFACE] ||
1606		!si->engine.fifo.ch_ptr[NV_IMAGE_BLIT] ||
1607		!si->engine.fifo.ch_ptr[NV4_GDI_RECTANGLE_TEXT] ||
1608		!si->engine.fifo.ch_ptr[NV_SCALED_IMAGE_FROM_MEMORY])
1609	{
1610		uint16 cnt;
1611
1612		/* free the FIFO channels we want from the currently assigned cmd's */
1613		si->engine.fifo.ch_ptr[si->engine.fifo.handle[0]] = 0;
1614		si->engine.fifo.ch_ptr[si->engine.fifo.handle[1]] = 0;
1615		si->engine.fifo.ch_ptr[si->engine.fifo.handle[2]] = 0;
1616		si->engine.fifo.ch_ptr[si->engine.fifo.handle[3]] = 0;
1617		si->engine.fifo.ch_ptr[si->engine.fifo.handle[4]] = 0;
1618		si->engine.fifo.ch_ptr[si->engine.fifo.handle[5]] = 0;
1619		si->engine.fifo.ch_ptr[si->engine.fifo.handle[6]] = 0;
1620
1621		/* set new object handles */
1622		si->engine.fifo.handle[0] = NV_ROP5_SOLID;
1623		si->engine.fifo.handle[1] = NV_IMAGE_BLACK_RECTANGLE;
1624		si->engine.fifo.handle[2] = NV_IMAGE_PATTERN;
1625		si->engine.fifo.handle[3] = NV4_SURFACE;
1626		si->engine.fifo.handle[4] = NV_IMAGE_BLIT;
1627		si->engine.fifo.handle[5] = NV4_GDI_RECTANGLE_TEXT;
1628		si->engine.fifo.handle[6] = NV_SCALED_IMAGE_FROM_MEMORY;
1629
1630		/* set handle's pointers to their assigned FIFO channels */
1631		/* note:
1632		 * b0-1 aren't used as adressbits. Using b0 to indicate a valid pointer. */
1633		for (cnt = 0; cnt < 0x08; cnt++)
1634		{
1635			si->engine.fifo.ch_ptr[(si->engine.fifo.handle[cnt])] =
1636				(0x00000001 + (cnt * 0x00002000));
1637		}
1638
1639		/* wait for room in fifo for new FIFO assigment cmds if needed. */
1640		if (nv_acc_fifofree_dma(14) != B_OK) return;
1641
1642		/* program new FIFO assignments */
1643		/* Raster OPeration: */
1644		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH0, si->engine.fifo.handle[0]);
1645		/* Clip: */
1646		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH1, si->engine.fifo.handle[1]);
1647		/* Pattern: */
1648		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH2, si->engine.fifo.handle[2]);
1649		/* 2D Surface: */
1650		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH3, si->engine.fifo.handle[3]);
1651		/* Blit: */
1652		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH4, si->engine.fifo.handle[4]);
1653		/* Bitmap: */
1654		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH5, si->engine.fifo.handle[5]);
1655		/* Scaled and fitered Blit: */
1656		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH6, si->engine.fifo.handle[6]);
1657
1658		/* tell the engine to fetch and execute all (new) commands in the DMA buffer */
1659		nv_start_dma();
1660	}
1661}
1662
/*
	note:
	moved acceleration 'top-level' routines to be integrated in the engine:
	it is costly to call the engine for every single function within a loop!
	(measured with BeRoMeter 1.2.6: up to 15% speed increase on all CPUs.)

	note also:
	splitting up each command list into sublists (see routines below) prevents
	a lot more nested calls, further increasing the speed by up to 70%.

	finally:
	sending the sublist to just one single engine command increases the speed
	by up to another 10%. This can't be done for blits though, as that engine
	command's hardware does not support multiple objects.
*/
1678
1679/* screen to screen blit - i.e. move windows around and scroll within them. */
1680void SCREEN_TO_SCREEN_BLIT_DMA(engine_token *et, blit_params *list, uint32 count)
1681{
1682	uint32 i = 0;
1683	uint16 subcnt;
1684
1685	/*** init acc engine for blit function ***/
1686	/* ROP registers (Raster OPeration):
1687	 * wait for room in fifo for ROP cmd if needed. */
1688	if (nv_acc_fifofree_dma(2) != B_OK) return;
1689	/* now setup ROP (writing 2 32bit words) for GXcopy */
1690	nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
1691	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xcc; /* SetRop5 */
1692
1693	/*** do each blit ***/
1694	/* Note:
1695	 * blit-copy direction is determined inside nvidia hardware: no setup needed */
1696	while (count)
1697	{
1698		/* break up the list in sublists to minimize calls, while making sure long
1699		 * lists still get executed without trouble */
1700		subcnt = 32;
1701		if (count < 32) subcnt = count;
1702		count -= subcnt;
1703
1704		/* wait for room in fifo for blit cmd if needed. */
1705		if (nv_acc_fifofree_dma(4 * subcnt) != B_OK) return;
1706
1707		while (subcnt--)
1708		{
1709			/* now setup blit (writing 4 32bit words) */
1710			nv_acc_cmd_dma(NV_IMAGE_BLIT, NV_IMAGE_BLIT_SOURCEORG, 3);
1711			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1712				(((list[i].src_top) << 16) | (list[i].src_left)); /* SourceOrg */
1713			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1714				(((list[i].dest_top) << 16) | (list[i].dest_left)); /* DestOrg */
1715			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1716				((((list[i].height) + 1) << 16) | ((list[i].width) + 1)); /* HeightWidth */
1717
1718			i++;
1719		}
1720
1721		/* tell the engine to fetch the commands in the DMA buffer that where not
1722		 * executed before. */
1723		nv_start_dma();
1724	}
1725
1726	/* tell 3D add-ons that they should reload their rendering states and surfaces */
1727	si->engine.threeD.reload = 0xffffffff;
1728}
1729
1730/* scaled and filtered screen to screen blit - i.e. video playback without overlay */
1731/* note: source and destination may not overlap. */
1732//fixme? checkout NV5 and NV10 version of cmd: faster?? (or is 0x77 a 'autoselect' version?)
1733void SCREEN_TO_SCREEN_SCALED_FILTERED_BLIT_DMA(engine_token *et, scaled_blit_params *list, uint32 count)
1734{
1735	uint32 i = 0;
1736	uint16 subcnt;
1737	uint32 cmd_depth;
1738	uint8 bpp;
1739
1740	/*** init acc engine for scaled filtered blit function ***/
1741	/* Set pixel width */
1742	switch(si->dm.space)
1743	{
1744	case B_RGB15_LITTLE:
1745		cmd_depth = 0x00000002;
1746		bpp = 2;
1747		break;
1748	case B_RGB16_LITTLE:
1749		cmd_depth = 0x00000007;
1750		bpp = 2;
1751		break;
1752	case B_RGB32_LITTLE:
1753	case B_RGBA32_LITTLE:
1754		cmd_depth = 0x00000004;
1755		bpp = 4;
1756		break;
1757	/* fixme sometime:
1758	 * we could do the spaces below if this function would be modified to be able
1759	 * to use a source outside of the desktop, i.e. using offscreen bitmaps... */
1760	case B_YCbCr422:
1761		cmd_depth = 0x00000005;
1762		bpp = 2;
1763		break;
1764	case B_YUV422:
1765		cmd_depth = 0x00000006;
1766		bpp = 2;
1767		break;
1768	default:
1769		/* note: this function does not support src or dest in the B_CMAP8 space! */
1770		//fixme: the NV10 version of this cmd supports B_CMAP8 src though... (checkout)
1771		LOG(8,("ACC_DMA: scaled_filtered_blit, invalid bit depth\n"));
1772		return;
1773	}
1774
1775	/* modify surface depth settings for 15-bit colorspace so command works as intended */
1776	if (si->dm.space == B_RGB15_LITTLE)
1777	{
1778		/* wait for room in fifo for surface setup cmd if needed */
1779		if (nv_acc_fifofree_dma(2) != B_OK) return;
1780		/* now setup 2D surface (writing 1 32bit word) */
1781		nv_acc_cmd_dma(NV4_SURFACE, NV4_SURFACE_FORMAT, 1);
1782		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000002; /* Format */
1783	}
1784
1785	/* TNT1 has fixed operation mode 'SRCcopy' while the rest can be programmed: */
1786	if (si->ps.card_type != NV04)
1787	{
1788		/* wait for room in fifo for cmds if needed. */
1789		if (nv_acc_fifofree_dma(5) != B_OK) return;
1790		/* now setup source bitmap colorspace */
1791		nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SETCOLORFORMAT, 2);
1792		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
1793		/* now setup operation mode to SRCcopy */
1794		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000003; /* SetOperation */
1795	}
1796	else
1797	{
1798		/* wait for room in fifo for cmd if needed. */
1799		if (nv_acc_fifofree_dma(4) != B_OK) return;
1800		/* now setup source bitmap colorspace */
1801		nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SETCOLORFORMAT, 1);
1802		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
1803		/* TNT1 has fixed operation mode SRCcopy */
1804	}
1805	/* now setup fill color (writing 2 32bit words) */
1806	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
1807	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000000; /* Color1A */
1808
1809	/*** do each blit ***/
1810	while (count)
1811	{
1812		/* break up the list in sublists to minimize calls, while making sure long
1813		 * lists still get executed without trouble */
1814		subcnt = 16;
1815		if (count < 16) subcnt = count;
1816		count -= subcnt;
1817
1818		/* wait for room in fifo for blit cmd if needed. */
1819		if (nv_acc_fifofree_dma(12 * subcnt) != B_OK) return;
1820
1821		while (subcnt--)
1822		{
1823			/* now setup blit (writing 12 32bit words) */
1824			nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SOURCEORG, 6);
1825			/* setup dest clipping ref for blit (not used) (b0-15 = left, b16-31 = top) */
1826			((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0; /* SourceOrg */
1827			/* setup dest clipping size for blit */
1828			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1829				(((list[i].dest_height + 1) << 16) | (list[i].dest_width + 1)); /* SourceHeightWidth */
1830			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1831			/* setup destination location and size for blit */
1832				(((list[i].dest_top) << 16) | (list[i].dest_left)); /* DestOrg */
1833			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1834				(((list[i].dest_height + 1) << 16) | (list[i].dest_width + 1)); /* DestHeightWidth */
1835			//fixme: findout scaling limits... (although the current cmd interface doesn't support them.)
1836			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1837				(((list[i].src_width + 1) << 20) / (list[i].dest_width + 1)); /* HorInvScale (in 12.20 format) */
1838			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1839				(((list[i].src_height + 1) << 20) / (list[i].dest_height + 1)); /* VerInvScale (in 12.20 format) */
1840
1841			nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SOURCESIZE, 4);
1842			/* setup horizontal and vertical source (fetching) ends.
1843			 * note:
1844			 * horizontal granularity is 2 pixels, vertical granularity is 1 pixel.
1845			 * look at Matrox or Neomagic bes engines code for usage example. */
1846			//fixme: tested 15, 16 and 32-bit RGB depth, verify other depths...
1847			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1848				(((list[i].src_height + 1) << 16) |
1849				 (((list[i].src_width + 1) + 0x0001) & ~0x0001)); /* SourceHeightWidth */
1850			/* setup source pitch (b0-15). Set 'format origin center' (b16-17) and
1851			 * select 'format interpolator foh (bilinear filtering)' (b24). */
1852			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1853				(si->fbc.bytes_per_row | (1 << 16) | (1 << 24)); /* SourcePitch */
1854			/* setup source surface location */
1855			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1856				((uint32)((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer)) +
1857				(list[i].src_top * si->fbc.bytes_per_row) +	(list[i].src_left * bpp); /* Offset */
1858			/* setup source start: first (sub)pixel contributing to output picture */
1859			/* note:
1860			 * clipping is not asked for.
1861			 * look at nVidia NV10+ bes engine code for useage example. */
1862			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1863				0; /* SourceRef (b0-15 = hor, b16-31 = ver: both in 12.4 format) */
1864
1865			i++;
1866		}
1867
1868		/* tell the engine to fetch the commands in the DMA buffer that where not
1869		 * executed before. */
1870		nv_start_dma();
1871	}
1872
1873	/* reset surface depth settings so the other engine commands works as intended */
1874	if (si->dm.space == B_RGB15_LITTLE)
1875	{
1876		/* wait for room in fifo for surface setup cmd if needed */
1877		if (nv_acc_fifofree_dma(2) != B_OK) return;
1878		/* now setup 2D surface (writing 1 32bit word) */
1879		nv_acc_cmd_dma(NV4_SURFACE, NV4_SURFACE_FORMAT, 1);
1880		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000004; /* Format */
1881
1882		/* tell the engine to fetch the commands in the DMA buffer that where not
1883		 * executed before. */
1884		nv_start_dma();
1885	}
1886
1887	/* tell 3D add-ons that they should reload their rendering states and surfaces */
1888	si->engine.threeD.reload = 0xffffffff;
1889}
1890
/* scaled and filtered offscreen to screen blit - i.e. video playback without
 * overlay, with the source bitmap in an offscreen buffer (see 'config'). */
/* note: source and destination may not overlap. */
1893//fixme? checkout NV5 and NV10 version of cmd: faster?? (or is 0x77 a 'autoselect' version?)
1894void OFFSCREEN_TO_SCREEN_SCALED_FILTERED_BLIT_DMA(
1895	engine_token *et, offscreen_buffer_config *config, clipped_scaled_blit_params *list, uint32 count)
1896{
1897	uint32 i = 0;
1898	uint32 cmd_depth;
1899	uint8 bpp;
1900
1901	LOG(4,("ACC_DMA: offscreen src buffer location $%08x\n", (uint32)((uint8*)(config->buffer))));
1902
1903	/*** init acc engine for scaled filtered blit function ***/
1904	/* Set pixel width */
1905	switch(config->space)
1906	{
1907	case B_RGB15_LITTLE:
1908		cmd_depth = 0x00000002;
1909		bpp = 2;
1910		break;
1911	case B_RGB16_LITTLE:
1912		cmd_depth = 0x00000007;
1913		bpp = 2;
1914		break;
1915	case B_RGB32_LITTLE:
1916	case B_RGBA32_LITTLE:
1917		cmd_depth = 0x00000004;
1918		bpp = 4;
1919		break;
1920	/* fixme sometime:
1921	 * we could do the spaces below if this function would be modified to be able
1922	 * to use a source outside of the desktop, i.e. using offscreen bitmaps... */
1923	case B_YCbCr422:
1924		cmd_depth = 0x00000005;
1925		bpp = 2;
1926		break;
1927	case B_YUV422:
1928		cmd_depth = 0x00000006;
1929		bpp = 2;
1930		break;
1931	default:
1932		/* note: this function does not support src or dest in the B_CMAP8 space! */
1933		//fixme: the NV10 version of this cmd supports B_CMAP8 src though... (checkout)
1934		LOG(8,("ACC_DMA: scaled_filtered_blit, invalid bit depth\n"));
1935		return;
1936	}
1937
1938	/* modify surface depth settings for 15-bit colorspace so command works as intended */
1939	if (si->dm.space == B_RGB15_LITTLE)
1940	{
1941		/* wait for room in fifo for surface setup cmd if needed */
1942		if (nv_acc_fifofree_dma(2) != B_OK) return;
1943		/* now setup 2D surface (writing 1 32bit word) */
1944		nv_acc_cmd_dma(NV4_SURFACE, NV4_SURFACE_FORMAT, 1);
1945		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000002; /* Format */
1946	}
1947
1948	/* TNT1 has fixed operation mode 'SRCcopy' while the rest can be programmed: */
1949	if (si->ps.card_type != NV04)
1950	{
1951		/* wait for room in fifo for cmds if needed. */
1952		if (nv_acc_fifofree_dma(5) != B_OK) return;
1953		/* now setup source bitmap colorspace */
1954		nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SETCOLORFORMAT, 2);
1955		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
1956		/* now setup operation mode to SRCcopy */
1957		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000003; /* SetOperation */
1958	}
1959	else
1960	{
1961		/* wait for room in fifo for cmd if needed. */
1962		if (nv_acc_fifofree_dma(4) != B_OK) return;
1963		/* now setup source bitmap colorspace */
1964		nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SETCOLORFORMAT, 1);
1965		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
1966		/* TNT1 has fixed operation mode SRCcopy */
1967	}
1968	/* now setup fill color (writing 2 32bit words) */
1969	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
1970	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000000; /* Color1A */
1971
1972	/*** do each blit ***/
1973	while (count--)
1974	{
1975		uint32 j = 0;
1976		uint16 clipcnt = list[i].dest_clipcount;
1977
1978		LOG(4,("ACC_DMA: offscreen src left %d, top %d\n", list[i].src_left, list[i].src_top));
1979		LOG(4,("ACC_DMA: offscreen src width %d, height %d\n", list[i].src_width + 1, list[i].src_height + 1));
1980		LOG(4,("ACC_DMA: offscreen dest left %d, top %d\n", list[i].dest_left, list[i].dest_top));
1981		LOG(4,("ACC_DMA: offscreen dest width %d, height %d\n", list[i].dest_width + 1, list[i].dest_height + 1));
1982
1983		/* wait for room in fifo for blit cmd if needed. */
1984		if (nv_acc_fifofree_dma(9 + (5 * clipcnt)) != B_OK) return;
1985
1986		/* now setup blit (writing 12 32bit words) */
1987		nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SOURCEORG + 8, 4);
1988		/* setup destination location and size for blit */
1989		((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1990			((list[i].dest_top << 16) | list[i].dest_left); /* DestTopLeftOutputRect */
1991		((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1992			(((list[i].dest_height + 1) << 16) | (list[i].dest_width + 1)); /* DestHeightWidthOutputRect */
1993		/* setup scaling */
1994		//fixme: findout scaling limits... (although the current cmd interface doesn't support them.)
1995		((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1996			(((list[i].src_width + 1) << 20) / (list[i].dest_width + 1)); /* HorInvScale (in 12.20 format) */
1997		((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1998			(((list[i].src_height + 1) << 20) / (list[i].dest_height + 1)); /* VerInvScale (in 12.20 format) */
1999
2000		nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SOURCESIZE, 3);
2001		/* setup horizontal and vertical source (fetching) ends.
2002		 * note:
2003		 * horizontal granularity is 2 pixels, vertical granularity is 1 pixel.
2004		 * look at Matrox or Neomagic bes engines code for usage example. */
2005		//fixme: tested 15, 16 and 32-bit RGB depth, verify other depths...
2006		((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2007			(((list[i].src_height + 1) << 16) |
2008			 (((list[i].src_width + 1) + 0x0001) & ~0x0001)); /* SourceHeightWidth */
2009		/* setup source pitch (b0-15). Set 'format origin center' (b16-17) and
2010		 * select 'format interpolator foh (bilinear filtering)' (b24). */
2011		((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2012			(config->bytes_per_row | (1 << 16) | (1 << 24)); /* SourcePitch */
2013
2014		/* setup source surface location */
2015		((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2016			(uint32)((uint8*)config->buffer - (uint8*)si->framebuffer +
2017			(list[i].src_top * config->bytes_per_row) +	(list[i].src_left * bpp)); /* Offset */
2018
2019		while (clipcnt--)
2020		{
2021			LOG(4,("ACC_DMA: offscreen clip left %d, top %d\n",
2022				list[i].dest_cliplist[j].left, list[i].dest_cliplist[j].top));
2023			LOG(4,("ACC_DMA: offscreen clip width %d, height %d\n",
2024				list[i].dest_cliplist[j].width + 1, list[i].dest_cliplist[j].height + 1));
2025
2026			/* now setup blit (writing 12 32bit words) */
2027			nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SOURCEORG, 2);
2028			/* setup dest clipping rect for blit (b0-15 = left, b16-31 = top) */
2029			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2030					(list[i].dest_cliplist[j].top << 16) | list[i].dest_cliplist[j].left; /* DestTopLeftClipRect */
2031			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2032					((list[i].dest_cliplist[j].height + 1) << 16) | (list[i].dest_cliplist[j].width + 1); /* DestHeightWidthClipRect */
2033
2034			nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SOURCESIZE + 12, 1);
2035			/* setup source start: first (sub)pixel contributing to output picture */
2036			/* note:
2037			 * clipping is not asked for.
2038			 * look at nVidia NV10+ bes engine code for useage example. */
2039			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2040				0; /* SourceRef (b0-15 = hor, b16-31 = ver: both in 12.4 format) */
2041
2042			j++;
2043		}
2044
2045		i++;
2046	}
2047
2048	/* tell the engine to fetch the commands in the DMA buffer that where not
2049	 * executed before. */
2050	nv_start_dma();
2051
2052	/* reset surface depth settings so the other engine commands works as intended */
2053	if (si->dm.space == B_RGB15_LITTLE)
2054	{
2055		/* wait for room in fifo for surface setup cmd if needed */
2056		if (nv_acc_fifofree_dma(2) != B_OK) return;
2057		/* now setup 2D surface (writing 1 32bit word) */
2058		nv_acc_cmd_dma(NV4_SURFACE, NV4_SURFACE_FORMAT, 1);
2059		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000004; /* Format */
2060
2061		/* tell the engine to fetch the commands in the DMA buffer that where not
2062		 * executed before. */
2063		nv_start_dma();
2064	}
2065
2066	/* tell 3D add-ons that they should reload their rendering states and surfaces */
2067	si->engine.threeD.reload = 0xffffffff;
2068}
2069
2070/* rectangle fill - i.e. workspace and window background color */
2071void FILL_RECTANGLE_DMA(engine_token *et, uint32 colorIndex, fill_rect_params *list, uint32 count)
2072{
2073	uint32 i = 0;
2074	uint16 subcnt;
2075
2076	/*** init acc engine for fill function ***/
2077	/* ROP registers (Raster OPeration):
2078	 * wait for room in fifo for ROP and bitmap cmd if needed. */
2079	if (nv_acc_fifofree_dma(4) != B_OK) return;
2080	/* now setup ROP (writing 2 32bit words) for GXcopy */
2081	nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
2082	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xcc; /* SetRop5 */
2083	/* now setup fill color (writing 2 32bit words) */
2084	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
2085	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = colorIndex; /* Color1A */
2086
2087	/*** draw each rectangle ***/
2088	while (count)
2089	{
2090		/* break up the list in sublists to minimize calls, while making sure long
2091		 * lists still get executed without trouble */
2092		subcnt = 32;
2093		if (count < 32) subcnt = count;
2094		count -= subcnt;
2095
2096		/* wait for room in fifo for bitmap cmd if needed. */
2097		if (nv_acc_fifofree_dma(1 + (2 * subcnt)) != B_OK) return;
2098
2099		/* issue fill command once... */
2100		nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, (2 * subcnt));
2101		/* ... and send multiple rects (engine cmd supports 32 max) */
2102		while (subcnt--)
2103		{
2104			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2105				(((list[i].left) << 16) | ((list[i].top) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */
2106			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2107				(((((list[i].right)+1) - (list[i].left)) << 16) |
2108				(((list[i].bottom-list[i].top)+1) & 0x0000ffff)); /* Unclipped Rect 0 WidthHeight */
2109
2110			i++;
2111		}
2112
2113		/* tell the engine to fetch the commands in the DMA buffer that where not
2114		 * executed before. */
2115		nv_start_dma();
2116	}
2117
2118	/* tell 3D add-ons that they should reload their rendering states and surfaces */
2119	si->engine.threeD.reload = 0xffffffff;
2120}
2121
2122/* span fill - i.e. (selected) menuitem background color (Dano) */
2123void FILL_SPAN_DMA(engine_token *et, uint32 colorIndex, uint16 *list, uint32 count)
2124{
2125	uint32 i = 0;
2126	uint16 subcnt;
2127
2128	/*** init acc engine for fill function ***/
2129	/* ROP registers (Raster OPeration):
2130	 * wait for room in fifo for ROP and bitmap cmd if needed. */
2131	if (nv_acc_fifofree_dma(4) != B_OK) return;
2132	/* now setup ROP (writing 2 32bit words) for GXcopy */
2133	nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
2134	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xcc; /* SetRop5 */
2135	/* now setup fill color (writing 2 32bit words) */
2136	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
2137	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = colorIndex; /* Color1A */
2138
2139	/*** draw each span ***/
2140	while (count)
2141	{
2142		/* break up the list in sublists to minimize calls, while making sure long
2143		 * lists still get executed without trouble */
2144		subcnt = 32;
2145		if (count < 32) subcnt = count;
2146		count -= subcnt;
2147
2148		/* wait for room in fifo for bitmap cmd if needed. */
2149		if (nv_acc_fifofree_dma(1 + (2 * subcnt)) != B_OK) return;
2150
2151		/* issue fill command once... */
2152		nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, (2 * subcnt));
2153		/* ... and send multiple rects (spans) (engine cmd supports 32 max) */
2154		while (subcnt--)
2155		{
2156			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2157				(((list[i+1]) << 16) | ((list[i]) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */
2158			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2159				((((list[i+2]+1) - (list[i+1])) << 16) | 0x00000001); /* Unclipped Rect 0 WidthHeight */
2160
2161			i+=3;
2162		}
2163
2164		/* tell the engine to fetch the commands in the DMA buffer that where not
2165		 * executed before. */
2166		nv_start_dma();
2167	}
2168
2169	/* tell 3D add-ons that they should reload their rendering states and surfaces */
2170	si->engine.threeD.reload = 0xffffffff;
2171}
2172
2173/* rectangle invert - i.e. text cursor and text selection */
2174void INVERT_RECTANGLE_DMA(engine_token *et, fill_rect_params *list, uint32 count)
2175{
2176	uint32 i = 0;
2177	uint16 subcnt;
2178
2179	/*** init acc engine for invert function ***/
2180	/* ROP registers (Raster OPeration):
2181	 * wait for room in fifo for ROP and bitmap cmd if needed. */
2182	if (nv_acc_fifofree_dma(4) != B_OK) return;
2183	/* now setup ROP (writing 2 32bit words) for GXinvert */
2184	nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
2185	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x55; /* SetRop5 */
2186	/* now reset fill color (writing 2 32bit words) */
2187	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
2188	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000000; /* Color1A */
2189
2190	/*** invert each rectangle ***/
2191	while (count)
2192	{
2193		/* break up the list in sublists to minimize calls, while making sure long
2194		 * lists still get executed without trouble */
2195		subcnt = 32;
2196		if (count < 32) subcnt = count;
2197		count -= subcnt;
2198
2199		/* wait for room in fifo for bitmap cmd if needed. */
2200		if (nv_acc_fifofree_dma(1 + (2 * subcnt)) != B_OK) return;
2201
2202		/* issue fill command once... */
2203		nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, (2 * subcnt));
2204		/* ... and send multiple rects (engine cmd supports 32 max) */
2205		while (subcnt--)
2206		{
2207			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2208				(((list[i].left) << 16) | ((list[i].top) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */
2209			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2210				(((((list[i].right)+1) - (list[i].left)) << 16) |
2211				(((list[i].bottom-list[i].top)+1) & 0x0000ffff)); /* Unclipped Rect 0 WidthHeight */
2212
2213			i++;
2214		}
2215
2216		/* tell the engine to fetch the commands in the DMA buffer that where not
2217		 * executed before. */
2218		nv_start_dma();
2219	}
2220
2221	/* tell 3D add-ons that they should reload their rendering states and surfaces */
2222	si->engine.threeD.reload = 0xffffffff;
2223}
2224