1/*
2	Copyright (c) 2002, Thomas Kurschel
3
4	Part of Radeon kernel driver
5
6	PCI GART.
7
8	Currently, we use PCI DMA. Changing to AGP would
9	only affect this file, but AGP-GART is specific to
10	the chipset of the motherboard, and as DMA is really
11	overkill for 2D, I cannot bother writing a dozen
12	of AGP drivers just to gain little extra speedup.
13*/
14
15
16#include "radeon_driver.h"
17#include "mmio.h"
18#include "buscntrl_regs.h"
19#include "memcntrl_regs.h"
20#include "cp_regs.h"
21
22#include <image.h>
23
24#include <stdlib.h>
25#include <string.h>
26
27
28#if 1
29//! create actual GART buffer
30static status_t
31createGARTBuffer(GART_info *gart, size_t size)
32{
33	SHOW_FLOW0( 3, "" );
34
35	gart->buffer.size = size = (size + B_PAGE_SIZE - 1) & ~(B_PAGE_SIZE - 1);
36
37	// if this buffer is used for PCI BM, cache snooping
38	// takes care of syncing memory accesses; if used for AGP,
39	// we'll have to access via AGP aperture (and mark aperture
40	// as write-combined) as cache consistency doesn't need to
41	// be guaranteed
42
43	// the specs say that some chipsets do kind of lazy flushing
44	// so the graphics card may read obsolete data; up to now
45	// we use PCI only where this shouldn't happen by design;
46	// if we change to AGP we may tweak the pre-charge time of
47	// the write buffer pointer
48
49	// as some variables in accelerant point directly into
50	// the DMA buffer, we have to grant access for all apps
51	gart->buffer.area = create_area("Radeon PCI GART buffer",
52		&gart->buffer.ptr, B_ANY_KERNEL_ADDRESS,
53		size, B_FULL_LOCK,
54		// TODO: really user read/write?
55		B_READ_AREA | B_WRITE_AREA | B_CLONEABLE_AREA);
56	if (gart->buffer.area < 0) {
57		SHOW_ERROR(1, "cannot create PCI GART buffer (%s)",
58			strerror(gart->buffer.area));
59		return gart->buffer.area;
60	}
61
62	gart->buffer.unaligned_area = -1;
63
64	memset( gart->buffer.ptr, 0, size );
65
66	return B_OK;
67}
68
69#else
70
71static status_t createGARTBuffer( GART_info *gart, size_t size )
72{
73	physical_entry map[1];
74	void *unaligned_addr, *aligned_phys;
75
76	SHOW_FLOW0( 3, "" );
77
78	gart->buffer.size = size = (size + B_PAGE_SIZE - 1) & ~(B_PAGE_SIZE - 1);
79
80	// we allocate an contiguous area having twice the size
81	// to be able to find an aligned, contiguous range within it;
82	// the graphics card doesn't care, but the CPU cannot
83	// make an arbitrary area WC'ed, at least elder ones
84	// question: is this necessary for a PCI GART because of bus snooping?
85	gart->buffer.unaligned_area = create_area( "Radeon PCI GART buffer",
86		&unaligned_addr, B_ANY_KERNEL_ADDRESS,
87		2 * size, B_CONTIGUOUS/*B_FULL_LOCK*/, B_READ_AREA | B_WRITE_AREA | B_CLONEABLE_AREA );
88		// TODO: Physical aligning can be done without waste using the
89		// private create_area_etc().
90	if (gart->buffer.unaligned_area < 0) {
91		SHOW_ERROR( 1, "cannot create PCI GART buffer (%s)",
92			strerror( gart->buffer.unaligned_area ));
93		return gart->buffer.unaligned_area;
94	}
95
96	get_memory_map( unaligned_addr, B_PAGE_SIZE, map, 1 );
97
98	aligned_phys =
99		(void **)((map[0].address + size - 1) & ~(size - 1));
100
101	SHOW_FLOW( 3, "aligned_phys=%p", aligned_phys );
102
103	gart->buffer.area = map_physical_memory( "Radeon aligned PCI GART buffer",
104		(addr_t)aligned_phys,
105		size, B_ANY_KERNEL_BLOCK_ADDRESS | B_MTR_WC,
106		B_READ_AREA | B_WRITE_AREA, &gart->buffer.ptr );
107
108	if( gart->buffer.area < 0 ) {
109		SHOW_ERROR0( 3, "cannot map buffer with WC" );
110		gart->buffer.area = map_physical_memory( "Radeon aligned PCI GART buffer",
111			(addr_t)aligned_phys,
112			size, B_ANY_KERNEL_BLOCK_ADDRESS,
113			B_READ_AREA | B_WRITE_AREA, &gart->buffer.ptr );
114	}
115
116	if( gart->buffer.area < 0 ) {
117		SHOW_ERROR0( 1, "cannot map GART buffer" );
118		delete_area( gart->buffer.unaligned_area );
119		gart->buffer.unaligned_area = -1;
120		return gart->buffer.area;
121	}
122
123	memset( gart->buffer.ptr, 0, size );
124
125	return B_OK;
126}
127
128#endif
129
130// init GATT (could be used for both PCI and AGP)
131static status_t initGATT( GART_info *gart )
132{
133	area_id map_area;
134	uint32 map_area_size;
135	physical_entry *map;
136	physical_entry PTB_map[1];
137	size_t map_count;
138	uint32 i;
139	uint32 *gatt_entry;
140	size_t num_pages;
141
142	SHOW_FLOW0( 3, "" );
143
144	num_pages = (gart->buffer.size + B_PAGE_SIZE - 1) & ~(B_PAGE_SIZE - 1);
145
146	// GART must be contiguous
147	gart->GATT.area = create_area("Radeon GATT", (void **)&gart->GATT.ptr,
148		B_ANY_KERNEL_ADDRESS,
149		(num_pages * sizeof( uint32 ) + B_PAGE_SIZE - 1) & ~(B_PAGE_SIZE - 1),
150		B_32_BIT_CONTIGUOUS,
151			// TODO: Physical address is cast to 32 bit below! Use B_CONTIGUOUS,
152			// when that is (/can be) fixed!
153		// TODO: really user read/write?
154		B_READ_AREA | B_WRITE_AREA | B_CLONEABLE_AREA);
155
156	if (gart->GATT.area < 0) {
157		SHOW_ERROR(1, "cannot create GATT table (%s)",
158			strerror(gart->GATT.area));
159		return gart->GATT.area;
160	}
161
162	get_memory_map(gart->GATT.ptr, B_PAGE_SIZE, PTB_map, 1);
163	gart->GATT.phys = PTB_map[0].address;
164
165	SHOW_INFO(3, "GATT_ptr=%p, GATT_phys=%p", gart->GATT.ptr,
166		(void *)gart->GATT.phys);
167
168	// get address mapping
169	memset(gart->GATT.ptr, 0, num_pages * sizeof(uint32));
170
171	map_count = num_pages + 1;
172
173	// align size to B_PAGE_SIZE
174	map_area_size = map_count * sizeof(physical_entry);
175	if ((map_area_size / B_PAGE_SIZE) * B_PAGE_SIZE != map_area_size)
176		map_area_size = ((map_area_size / B_PAGE_SIZE) + 1) * B_PAGE_SIZE;
177
178	// temporary area where we fill in the memory map (deleted below)
179	map_area = create_area("pci_gart_map_area", (void **)&map, B_ANY_ADDRESS,
180		map_area_size, B_FULL_LOCK, B_READ_AREA | B_WRITE_AREA);
181		// TODO: We actually have a working malloc() in the kernel. Why create
182		// an area?
183	dprintf("pci_gart_map_area: %" B_PRId32 "\n", map_area);
184
185	get_memory_map( gart->buffer.ptr, gart->buffer.size, map, map_count );
186
187	// the following looks a bit strange as the kernel
188	// combines successive entries
189	gatt_entry = gart->GATT.ptr;
190
191	for( i = 0; i < map_count; ++i ) {
192		phys_addr_t addr = map[i].address;
193		size_t size = map[i].size;
194
195		if( size == 0 )
196			break;
197
198		while( size > 0 ) {
199			*gatt_entry++ = addr;
200			//SHOW_FLOW( 3, "%lx", *(gart_entry-1) );
201			addr += ATI_PCIGART_PAGE_SIZE;
202			size -= ATI_PCIGART_PAGE_SIZE;
203		}
204	}
205
206	delete_area(map_area);
207
208	if( i == map_count ) {
209		// this case should never happen
210		SHOW_ERROR0( 0, "memory map of GART buffer too large!" );
211		delete_area( gart->GATT.area );
212		gart->GATT.area = -1;
213		return B_ERROR;
214	}
215
216	// this might be a bit more than needed, as
217	// 1. Intel CPUs have "processor order", i.e. writes appear to external
218	//    devices in program order, so a simple final write should be sufficient
219	// 2. if it is a PCI GART, bus snooping should provide cache coherence
220	// 3. this function is a no-op :(
221	clear_caches( gart->GATT.ptr, num_pages * sizeof( uint32 ),
222		B_FLUSH_DCACHE );
223
224	// back to real live - some chipsets have write buffers that
225	// proove all previous assumptions wrong
226	// (don't know whether this really helps though)
227	#if defined(__i386__)
228	asm volatile ( "wbinvd" ::: "memory" );
229	#elif defined(__POWERPC__)
230	// TODO : icbi on PowerPC to flush instruction cache?
231	#endif
232	return B_OK;
233}
234
235// destroy GART buffer
236static void destroyGARTBuffer( GART_info *gart )
237{
238	if( gart->buffer.area > 0 )
239		delete_area( gart->buffer.area );
240
241	if( gart->buffer.unaligned_area > 0 )
242		delete_area( gart->buffer.unaligned_area );
243
244	gart->buffer.area = gart->buffer.unaligned_area = -1;
245}
246
247
248// destroy GATT
249static void destroyGATT( GART_info *gart )
250{
251	if( gart->GATT.area > 0 )
252		delete_area( gart->GATT.area );
253
254	gart->GATT.area = -1;
255}
256
257
258// init PCI GART
259status_t Radeon_InitPCIGART( device_info *di )
260{
261	status_t result;
262
263	result = createGARTBuffer( &di->pci_gart, PCI_GART_SIZE );
264	if( result < 0 )
265		goto err1;
266
267	result = initGATT( &di->pci_gart );
268	if( result < 0 )
269		goto err2;
270
271	return B_OK;
272
273err2:
274	destroyGARTBuffer( &di->pci_gart );
275
276err1:
277	return result;
278}
279
280
281// cleanup PCI GART
282void Radeon_CleanupPCIGART( device_info *di )
283{
284	vuint8 *regs = di->regs;
285
286	SHOW_FLOW0( 3, "" );
287
288	// perhaps we should wait for FIFO space before messing around with registers, but
289	// 1. I don't want to add all the sync stuff to the kernel driver
290	// 2. I doubt that these regs are buffered by FIFO
291	// but still: in worst case CP has written some commands to register FIFO,
292	// which can do any kind of nasty things
293
294	// disable CP BM
295	OUTREG( regs, RADEON_CP_CSQ_CNTL, RADEON_CSQ_PRIDIS_INDDIS );
296	// read-back for flushing
297	INREG( regs, RADEON_CP_CSQ_CNTL );
298
299	// disable bus mastering
300	OUTREGP( regs, RADEON_BUS_CNTL, RADEON_BUS_MASTER_DIS, ~RADEON_BUS_MASTER_DIS );
301	// disable PCI GART
302	OUTREGP( regs, RADEON_AIC_CNTL, 0, ~RADEON_PCIGART_TRANSLATE_EN );
303
304	destroyGATT( &di->pci_gart );
305	destroyGARTBuffer( &di->pci_gart );
306}
307