1/* MGA Acceleration functions */
2/* Authors:
3   Mark Watson 2/2000,
4   Rudolf Cornelissen 10/2002-1/2006.
5*/
6
7#define MODULE_BIT 0x00080000
8
9#include "mga_std.h"
10
11/*acceleration notes*/
12
13/*functions Be's app_server uses:
14fill span (horizontal only)
15fill rectangle (these 2 are very similar)
16invert rectangle
17blit
18*/
19
/* Program the Y start and the number of lines of a drawing operation.
 * Needed by MIL 1/2 because of address linearisation constraints:
 * - without software linearisation (si->engine.y_lin == 0) the combined
 *   YDSTLEN register gets the line number (upper 16 bits) and line count;
 * - with software linearisation the start line is converted to a pixel
 *   address (line * pitch in pixels), divided by 32 and written to YDST,
 *   while the line count goes to LEN separately.
 * Both arguments are evaluated exactly once. */
#define ACCW_YDSTLEN(dst, len) do { \
	if (!si->engine.y_lin) { \
		ACCW(YDSTLEN, ((dst) << 16) | (len)); \
	} else { \
		ACCW(YDST, ((dst) * (si->fbc.bytes_per_row / (si->engine.depth >> 3))) >> 5); \
		ACCW(LEN, (len)); \
	} \
} while (0)
27
28status_t gx00_acc_wait_idle()
29{
30	/* wait until engine completely idle */
31	while (ACCR(STATUS) & 0x00010000)
32	{
33		/* snooze a bit so I do not hammer the bus */
34		snooze (100);
35	}
36
37	return B_OK;
38}
39
40/* AFAIK this must be done for every new screenmode.
41 * Engine required init. */
42status_t gx00_acc_init()
43{
44	/* used for convenience: MACCESS is a write only register! */
45	uint32 maccess = 0x00000000;
46	/* if we were unable to read PINS, we have to assume something (keeping bit6 zero) */
47	if ((si->ps.card_type >= G450) && (si->ps.pins_status == B_OK))
48	{
49		/* b7 v5_mem_type = done by Mark Watson. fixme: still confirm! (unknown bits) */
50		maccess |= ((((uint32)si->ps.v5_mem_type) & 0x80) >> 1);
51	}
52
53	/* preset using hardware adress linearisation */
54	si->engine.y_lin = 0x00;
55	/* reset depth */
56	si->engine.depth = 0;
57
58	/* cleanup bitblt */
59	ACCW(OPMODE,0);
60
61	/* Set the Z origin to the start of FB (otherwise lockup on blits) */
62	ACCW(ZORG,0);
63
64	/* Set pixel width */
65	switch(si->dm.space)
66	{
67	case B_CMAP8:
68		ACCW(MACCESS, ((maccess & 0xfffffffc) | 0x00));
69		si->engine.depth = 8;
70		break;
71	case B_RGB15_LITTLE:case B_RGB16_LITTLE:
72		ACCW(MACCESS, ((maccess & 0xfffffffc) | 0x01));
73		si->engine.depth = 16;
74		break;
75	case B_RGB32_LITTLE:case B_RGBA32_LITTLE:
76		ACCW(MACCESS, ((maccess & 0xfffffffc) | 0x02));
77		si->engine.depth = 32;
78		break;
79	default:
80		LOG(8,("ACC: init, invalid bit depth\n"));
81		return B_ERROR;
82	}
83
84	/* setup PITCH: very cardtype specific! */
85	switch (si->ps.card_type)
86	{
87	case MIL1:
88		switch (si->fbc.bytes_per_row / (si->engine.depth >> 3))
89		{
90			case 640:
91			case 768:
92			case 800:
93			case 960:
94			case 1024:
95			case 1152:
96			case 1280:
97			case 1600:
98			case 1920:
99			case 2048:
100				/* we are using hardware adress linearisation */
101				break;
102			default:
103				/* we are using software adress linearisation */
104				si->engine.y_lin = 0x01;
105				LOG(8,("ACC: using software adress linearisation\n"));
106				break;
107		}
108		ACCW(PITCH, (si->engine.y_lin << 15) |
109					((si->fbc.bytes_per_row / (si->engine.depth >> 3)) & 0x0FFF));
110		break;
111	case MIL2:
112		switch (si->fbc.bytes_per_row / (si->engine.depth >> 3))
113		{
114			case 512:
115			case 640:
116			case 768:
117			case 800:
118			case 832:
119			case 960:
120			case 1024:
121			case 1152:
122			case 1280:
123			case 1600:
124			case 1664:
125			case 1920:
126			case 2048:
127				/* we are using hardware adress linearisation */
128				break;
129			default:
130				/* we are using software adress linearisation */
131				si->engine.y_lin = 0x01;
132				LOG(8,("ACC: using software adress linearisation\n"));
133				break;
134		}
135		ACCW(PITCH, (si->engine.y_lin << 15) |
136					((si->fbc.bytes_per_row / (si->engine.depth >> 3)) & 0x0FFF));
137		break;
138	case G100:
139		/* always using hardware adress linearisation, because 2D/3D
140		 * engine works on every pitch multiple of 32 */
141		ACCW(PITCH, ((si->fbc.bytes_per_row / (si->engine.depth >> 3)) & 0x0FFF));
142		break;
143	default:
144		/* G200 and up are equal.. */
145		/* always using hardware adress linearisation, because 2D/3D
146		 * engine works on every pitch multiple of 32 */
147		ACCW(PITCH, ((si->fbc.bytes_per_row / (si->engine.depth >> 3)) & 0x1FFF));
148		break;
149	}
150
151	/* disable plane write mask (needed for SDRAM): actual change needed to get it sent to RAM */
152	ACCW(PLNWT,0x00000000);
153	ACCW(PLNWT,0xffffffff);
154
155	if (si->ps.card_type >= G200) {
156		/*DSTORG - location of active screen in framebuffer*/
157		ACCW(DSTORG,((uint8*)si->fbc.frame_buffer) - ((uint8*)si->framebuffer));
158
159		/*SRCORG - init source address - same as dest*/
160		ACCW(SRCORG,((uint8*)si->fbc.frame_buffer) - ((uint8*)si->framebuffer));
161	}
162
163	/* init YDSTORG - apsed, if not inited, BitBlts may fails on <= G200 */
164	si->engine.src_dst = 0;
165	ACCW(YDSTORG, si->engine.src_dst);
166
167	/* <= G100 uses this register as SRCORG/DSTORG replacement, but
168	 * MIL 1/2 does not need framebuffer space for the hardcursor! */
169	if ((si->ps.card_type == G100) && (si->settings.hardcursor))
170	{
171		switch (si->dm.space)
172		{
173			case B_CMAP8:
174				si->engine.src_dst = 1024 / 1;
175				break;
176			case B_RGB15_LITTLE:
177			case B_RGB16_LITTLE:
178				si->engine.src_dst = 1024 / 2;
179				break;
180			case B_RGB32_LITTLE:
181				si->engine.src_dst =  1024 / 4;
182				break;
183			default:
184				LOG(8,("ACC: G100 hardcursor not supported for current colorspace\n"));
185				return B_ERROR;
186		}
187	}
188	ACCW(YDSTORG, si->engine.src_dst);
189
190	/* clipping */
191	/* i.e. highest and lowest X pixel adresses */
192	ACCW(CXBNDRY,(((si->fbc.bytes_per_row / (si->engine.depth >> 3)) - 1) << 16) | (0));
193
194	/* Y pixel addresses must be linear */
195	/* lowest adress */
196	ACCW(YTOP, 0 + si->engine.src_dst);
197	/* highest adress */
198	ACCW(YBOT,((si->dm.virtual_height - 1) *
199		(si->fbc.bytes_per_row / (si->engine.depth >> 3))) + si->engine.src_dst);
200
201	return B_OK;
202}
203
204
/*
	note:
	moved acceleration 'top-level' routines to be integrated in the engine:
	it is costly to call the engine for every single function within a loop!
	(measured with BeRoMeter 1.2.6: up to 15% speed increase on all CPUs.)
*/
211
212/* screen to screen blit - i.e. move windows around.
213 * Engine function bitblit, paragraph 4.5.7.2 */
214void SCREEN_TO_SCREEN_BLIT(engine_token *et, blit_params *list, uint32 count)
215{
216	uint32 t_start,t_end,offset;
217	uint32 b_start,b_end;
218	int i = 0;
219
220	/* calc offset 'per line' */
221	offset = (si->fbc.bytes_per_row / (si->engine.depth >> 3));
222
223	while (count--)
224	{
225		/* find where the top and bottom are */
226		t_end = t_start =
227			list[i].src_left + (offset * list[i].src_top) + si->engine.src_dst;
228		t_end += list[i].width;
229
230		b_end = b_start =
231			list[i].src_left + (offset * (list[i].src_top + list[i].height)) + si->engine.src_dst;
232		b_end += list[i].width;
233
234		/* sgnzero bit _must_ be '0' before accessing SGN! */
235		ACCW(DWGCTL, 0x00000000);
236
237		/*find which quadrant */
238		switch((list[i].dest_top > list[i].src_top) | ((list[i].dest_left > list[i].src_left) << 1))
239		{
240		case 0: /*L->R,down*/
241			ACCW(SGN, 0);
242			ACCW(AR3, t_start);
243			ACCW(AR0, t_end);
244			ACCW(AR5, offset);
245			ACCW_YDSTLEN(list[i].dest_top, list[i].height + 1);
246			break;
247		case 1: /*L->R,up*/
248			ACCW(SGN, 4);
249			ACCW(AR3, b_start);
250			ACCW(AR0, b_end);
251			ACCW(AR5, -offset);
252			ACCW_YDSTLEN(list[i].dest_top + list[i].height, list[i].height + 1);
253			break;
254		case 2: /*R->L,down*/
255			ACCW(SGN, 1);
256			ACCW(AR3, t_end);
257			ACCW(AR0, t_start);
258			ACCW(AR5, offset);
259			ACCW_YDSTLEN(list[i].dest_top, list[i].height + 1);
260			break;
261		case 3: /*R->L,up*/
262			ACCW(SGN, 5);
263			ACCW(AR3, b_end);
264			ACCW(AR0, b_start);
265			ACCW(AR5, -offset);
266			ACCW_YDSTLEN(list[i].dest_top + list[i].height, list[i].height + 1);
267			break;
268		}
269		ACCW(FXBNDRY,((list[i].dest_left + list[i].width) << 16) | list[i].dest_left);
270
271		/* start the blit */
272		ACCGO(DWGCTL, 0x040c4018); // atype RSTR
273		i++;
274	}
275}
276
277/* screen to screen tranparent blit - not sure what uses this.
278 * Engine function bitblit, paragraph 4.5.7.2 */
279//WARNING:
280//yet untested function!!
281void SCREEN_TO_SCREEN_TRANSPARENT_BLIT(engine_token *et, uint32 transparent_colour, blit_params *list, uint32 count)
282{
283	uint32 t_start,t_end,offset;
284	uint32 b_start,b_end;
285	int i = 0;
286
287	/* calc offset 'per line' */
288	offset = (si->fbc.bytes_per_row / (si->engine.depth >> 3));
289
290	while (count--)
291	{
292		/* find where the top and bottom are */
293		t_end = t_start =
294			list[i].src_left + (offset * list[i].src_top) + si->engine.src_dst;
295		t_end += list[i].width;
296
297		b_end = b_start =
298			list[i].src_left + (offset * (list[i].src_top + list[i].height)) + si->engine.src_dst;
299		b_end += list[i].width;
300
301		/* sgnzero bit _must_ be '0' before accessing SGN! */
302		ACCW(DWGCTL, 0x00000000);
303
304		/*find which quadrant */
305		switch((list[i].dest_top > list[i].src_top) | ((list[i].dest_left > list[i].src_left) << 1))
306		{
307		case 0: /*L->R,down*/
308			ACCW(SGN, 0);
309			ACCW(AR3, t_start);
310			ACCW(AR0, t_end);
311			ACCW(AR5, offset);
312			ACCW_YDSTLEN(list[i].dest_top, list[i].height + 1);
313			break;
314		case 1: /*L->R,up*/
315			ACCW(SGN, 4);
316			ACCW(AR3, b_start);
317			ACCW(AR0, b_end);
318			ACCW(AR5, -offset);
319			ACCW_YDSTLEN(list[i].dest_top + list[i].height, list[i].height + 1);
320			break;
321		case 2: /*R->L,down*/
322			ACCW(SGN, 1);
323			ACCW(AR3, t_end);
324			ACCW(AR0, t_start);
325			ACCW(AR5, offset);
326			ACCW_YDSTLEN(list[i].dest_top, list[i].height + 1);
327			break;
328		case 3: /*R->L,up*/
329			ACCW(SGN, 5);
330			ACCW(AR3, b_end);
331			ACCW(AR0, b_start);
332			ACCW(AR5, -offset);
333			ACCW_YDSTLEN(list[i].dest_top + list[i].height, list[i].height + 1);
334			break;
335		}
336		ACCW(FXBNDRY,((list[i].dest_left + list[i].width) << 16) | list[i].dest_left);
337
338		/* start the blit */
339		ACCW(FCOL, transparent_colour);
340		ACCW(BCOL, 0xffffffff);
341		ACCGO(DWGCTL, 0x440c4018); // atype RSTR
342		i++;
343	}
344}
345
346/* screen to screen scaled filtered blit - i.e. scale video in memory.
347 * Engine function texture mapping for video, paragraphs 4.5.5.5 - 4.5.5.9 */
348//fixme: implement...
349void SCREEN_TO_SCREEN_SCALED_FILTERED_BLIT(engine_token *et, scaled_blit_params *list, uint32 count)
350{
351	int i = 0;
352
353	while (count--)
354	{
355/*
356			list[i].src_left,
357			list[i].src_top,
358			list[i].src_width,
359			list[i].src_height,
360			list[i].dest_left,
361			list[i].dest_top,
362			list[i].dest_width,
363			list[i].dest_height
364*/
365		i++;
366	}
367}
368
369/* rectangle fill.
370 * Engine function rectangle_fill: paragraph 4.5.5.2 */
371void FILL_RECTANGLE(engine_token *et, uint32 colorIndex, fill_rect_params *list, uint32 count)
372{
373/*
374	FXBNDRY - left and right coordinates    a
375	YDSTLEN - y start and no of lines       a
376	(or YDST and LEN)
377	DWGCTL - atype must be RSTR or BLK      a
378	FCOL - foreground colour                a
379*/
380	int i = 0;
381
382	while (count--)
383	{
384		ACCW(FXBNDRY, (((list[i].right + 1) << 16) | list[i].left));
385		ACCW_YDSTLEN(list[i].top, ((list[i].bottom - list[i].top) + 1));
386		ACCW(FCOL, colorIndex);
387
388		/* start the fill */
389//acc fixme: checkout blockmode constraints for G100+ (mil: nc?): also add blockmode
390//	         for other functions, and use fastblt on MIL1/2 if possible...
391//or is CMAP8 contraint a non-blockmode contraint? (linearisation problem maybe?)
392		if ((si->dm.space == B_CMAP8) || si->ps.sdram)
393		{
394			ACCGO(DWGCTL, 0x400c7814); // atype RSTR
395		}
396		else
397		{
398			ACCGO(DWGCTL, 0x400c7844); // atype BLK
399		}
400		i++;
401	}
402}
403
404/* horizontal span fill.
405 * Engine function rectangle_fill: paragraph 4.5.5.2 */
406//(uint32 xs,uint32 xe,uint32 ys,uint32 yl,uint32 col)
407void FILL_SPAN(engine_token *et, uint32 colorIndex, uint16 *list, uint32 count)
408{
409/*
410	FXBNDRY - left and right coordinates    a
411	YDSTLEN - y start and no of lines       a
412	(or YDST and LEN)
413	DWGCTL - atype must be RSTR or BLK      a
414	FCOL - foreground colour                a
415*/
416	int i = 0;
417
418	while (count--)
419	{
420		ACCW(FXBNDRY, ((list[i + 2] + 1) << 16)| list[i + 1]);
421		ACCW_YDSTLEN(list[i], 1);
422		ACCW(FCOL, colorIndex);
423
424		/* start the fill */
425//acc fixme: checkout blockmode constraints for G100+ (mil: nc?): also add blockmode
426//	         for other functions, and use fastblt on MIL1/2 if possible...
427//or is CMAP8 contraint a non-blockmode contraint? (linearisation problem maybe?)
428		if ((si->dm.space == B_CMAP8) || si->ps.sdram)
429		{
430			ACCGO(DWGCTL, 0x400c7814); // atype RSTR
431		}
432		else
433		{
434			ACCGO(DWGCTL, 0x400c7844); // atype BLK
435		}
436		i += 3;
437	}
438}
439
440/* rectangle invert.
441 * Engine function rectangle_fill: paragraph 4.5.5.2 */
442void INVERT_RECTANGLE(engine_token *et, fill_rect_params *list, uint32 count)
443{
444/*
445	FXBNDRY - left and right coordinates    a
446	YDSTLEN - y start and no of lines       a
447	(or YDST and LEN)
448	DWGCTL - atype must be RSTR or BLK      a
449	FCOL - foreground colour                a
450*/
451	int i = 0;
452//	uint32 * dma;
453//	uint32 pci;
454
455	while (count--)
456	{
457		ACCW(FXBNDRY, (((list[i].right) + 1) << 16) | list[i].left);
458		ACCW_YDSTLEN(list[i].top, ((list[i].bottom - list[i].top) + 1));
459		ACCW(FCOL, 0); /* color */
460
461		/* start the invert (top nibble is c is clipping enabled) */
462		ACCGO(DWGCTL, 0x40057814); // atype RSTR
463
464		/* pseudo_dma version! */
465//		MGAACC_DWGCTL      =0x1c00,
466//		MGAACC_FCOL        =0x1c24,
467//		MGAACC_FXBNDRY     =0x1c84,
468//		MGAACC_YDSTLEN     =0x1c88,
469//
470//		40,09,21,22 (ordered as registers)
471
472//		dma = (uint32 *)si->pseudo_dma;
473//		*dma++= 0x40092221;
474//		*dma++= (((list[i].right) + 1) << 16) | list[i].left;
475//		*dma++= (list[i].top << 16) | ((list[i].bottom - list[i].top) + 1);
476//		*dma++= 0; /* color */
477//		*dma++= 0x40057814;
478
479		/* real dma version! */
480//		dma = (vuint32 *)si->dma_buffer;
481//		*dma++= 0x40092221; /* indices */
482//		*dma++= (((list[i].right) + 1) << 16) | list[i].left;
483//		*dma++= (list[i].top << 16) | ((list[i].bottom - list[i].top) + 1);
484//		*dma++= 0; /* color */
485//		*dma++= 0x40057814;
486
487//		pci = si->dma_buffer_pci;
488//		ACCW(PRIMADDRESS, (pci));
489//		ACCW(PRIMEND, (20 + pci));
490
491//		delay(100);
492
493		i++;
494	}
495}
496