/*
	Copyright (c) 2002, Thomas Kurschel

	Part of Radeon kernel driver

	PCI GART.

	Currently, we use PCI DMA. Changing to AGP would only affect
	this file, but AGP-GART is specific to the motherboard's chipset,
	and as DMA is really overkill for 2D, I cannot be bothered to
	write a dozen AGP drivers just to gain a little extra speedup.
*/


#include "radeon_driver.h"
#include "mmio.h"
#include "buscntrl_regs.h"
#include "memcntrl_regs.h"
#include "cp_regs.h"

#include <image.h>

#include <stdlib.h>
#include <string.h>


#if 1
//! create actual GART buffer
static status_t
createGARTBuffer(GART_info *gart, size_t size)
{
	SHOW_FLOW0( 3, "" );

	gart->buffer.size = size = (size + B_PAGE_SIZE - 1) & ~(B_PAGE_SIZE - 1);
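		// (rounds the requested size up to a whole number of pages, e.g. with
		// a 4 KB B_PAGE_SIZE a request for 5000 bytes becomes 8192 bytes)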

	// if this buffer is used for PCI bus mastering, cache snooping
	// takes care of syncing memory accesses; if it is used for AGP,
	// we'd have to access it via the AGP aperture (and mark the
	// aperture as write-combined), as cache coherency isn't
	// guaranteed in that case

	// the specs say that some chipsets do a kind of lazy flushing,
	// so the graphics card may read obsolete data; up to now we use
	// PCI only, where this shouldn't happen by design; if we switch
	// to AGP we may have to tweak the pre-charge time of the write
	// buffer pointer

	// as some variables in the accelerant point directly into
	// the DMA buffer, we have to grant access to all apps
	gart->buffer.area = create_area("Radeon PCI GART buffer",
		&gart->buffer.ptr, B_ANY_KERNEL_ADDRESS,
		size, B_FULL_LOCK,
#ifdef HAIKU_TARGET_PLATFORM_HAIKU
		// TODO: really user read/write?
		B_READ_AREA | B_WRITE_AREA | B_USER_CLONEABLE_AREA
#else
		0
#endif
		);
	if (gart->buffer.area < 0) {
		SHOW_ERROR(1, "cannot create PCI GART buffer (%s)",
			strerror(gart->buffer.area));
		return gart->buffer.area;
	}

	gart->buffer.unaligned_area = -1;

	memset( gart->buffer.ptr, 0, size );

	return B_OK;
}

#else

static status_t createGARTBuffer( GART_info *gart, size_t size )
{
	physical_entry map[1];
	void *unaligned_addr, *aligned_phys;

	SHOW_FLOW0( 3, "" );

	gart->buffer.size = size = (size + B_PAGE_SIZE - 1) & ~(B_PAGE_SIZE - 1);

	// we allocate a contiguous area of twice the requested size
	// to be able to find an aligned, contiguous range within it;
	// the graphics card doesn't care, but the CPU cannot mark an
	// arbitrarily aligned range as write-combined, at least older ones can't
	// question: is this necessary for a PCI GART because of bus snooping?
	gart->buffer.unaligned_area = create_area( "Radeon PCI GART buffer",
		&unaligned_addr, B_ANY_KERNEL_ADDRESS,
		2 * size, B_CONTIGUOUS/*B_FULL_LOCK*/, B_READ_AREA | B_WRITE_AREA | B_USER_CLONEABLE_AREA );
		// TODO: Physical aligning can be done without waste using the
		// private create_area_etc().
	if (gart->buffer.unaligned_area < 0) {
		SHOW_ERROR( 1, "cannot create PCI GART buffer (%s)",
			strerror( gart->buffer.unaligned_area ));
		return gart->buffer.unaligned_area;
	}

	get_memory_map( unaligned_addr, B_PAGE_SIZE, map, 1 );

	aligned_phys =
		(void *)((map[0].address + size - 1) & ~(size - 1));
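		// worked example: for size == 8 MB (0x800000), a physical start of
		// 0x012a0000 is rounded up to the next 8 MB boundary, 0x01800000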

	SHOW_FLOW( 3, "aligned_phys=%p", aligned_phys );

	gart->buffer.area = map_physical_memory( "Radeon aligned PCI GART buffer",
		(addr_t)aligned_phys,
		size, B_ANY_KERNEL_BLOCK_ADDRESS | B_MTR_WC,
		B_READ_AREA | B_WRITE_AREA, &gart->buffer.ptr );

	if( gart->buffer.area < 0 ) {
		SHOW_ERROR0( 3, "cannot map buffer with WC" );
		gart->buffer.area = map_physical_memory( "Radeon aligned PCI GART buffer",
			(addr_t)aligned_phys,
			size, B_ANY_KERNEL_BLOCK_ADDRESS,
			B_READ_AREA | B_WRITE_AREA, &gart->buffer.ptr );
	}

	if( gart->buffer.area < 0 ) {
		SHOW_ERROR0( 1, "cannot map GART buffer" );
		delete_area( gart->buffer.unaligned_area );
		gart->buffer.unaligned_area = -1;
		return gart->buffer.area;
	}

	memset( gart->buffer.ptr, 0, size );

	return B_OK;
}

#endif

// init GATT (could be used for both PCI and AGP)
static status_t initGATT( GART_info *gart )
{
	area_id map_area;
	uint32 map_area_size;
	physical_entry *map;
	physical_entry PTB_map[1];
	size_t map_count;
	uint32 i;
	uint32 *gatt_entry;
	size_t num_pages;

	SHOW_FLOW0( 3, "" );

	num_pages = (gart->buffer.size + B_PAGE_SIZE - 1) / B_PAGE_SIZE;
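		// (one 32-bit GATT entry maps one page: e.g. an 8 MB buffer with
		// 4 KB pages needs 2048 entries, i.e. 8 KB of table)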

	// the GATT must be physically contiguous
	gart->GATT.area = create_area("Radeon GATT", (void **)&gart->GATT.ptr,
		B_ANY_KERNEL_ADDRESS,
		(num_pages * sizeof( uint32 ) + B_PAGE_SIZE - 1) & ~(B_PAGE_SIZE - 1),
		B_32_BIT_CONTIGUOUS,
			// TODO: Physical address is cast to 32 bit below! Use B_CONTIGUOUS,
			// when that is (/can be) fixed!
#ifdef HAIKU_TARGET_PLATFORM_HAIKU
		// TODO: really user read/write?
		B_READ_AREA | B_WRITE_AREA | B_USER_CLONEABLE_AREA
#else
		0
#endif
		);

	if (gart->GATT.area < 0) {
		SHOW_ERROR(1, "cannot create GATT table (%s)",
			strerror(gart->GATT.area));
		return gart->GATT.area;
	}

	get_memory_map(gart->GATT.ptr, B_PAGE_SIZE, PTB_map, 1);
	gart->GATT.phys = PTB_map[0].address;

	SHOW_INFO(3, "GATT_ptr=%p, GATT_phys=%p", gart->GATT.ptr,
		(void *)gart->GATT.phys);

	// get address mapping
	memset(gart->GATT.ptr, 0, num_pages * sizeof(uint32));

	map_count = num_pages + 1;

	// align size to B_PAGE_SIZE
	map_area_size = map_count * sizeof(physical_entry);
	if ((map_area_size / B_PAGE_SIZE) * B_PAGE_SIZE != map_area_size)
		map_area_size = ((map_area_size / B_PAGE_SIZE) + 1) * B_PAGE_SIZE;

	// temporary area where we fill in the memory map (deleted below)
	map_area = create_area("pci_gart_map_area", (void **)&map, B_ANY_ADDRESS,
		map_area_size, B_FULL_LOCK, B_READ_AREA | B_WRITE_AREA);
		// TODO: We actually have a working malloc() in the kernel. Why create
		// an area?
	dprintf("pci_gart_map_area: %ld\n", map_area);

	get_memory_map( gart->buffer.ptr, gart->buffer.size, map, map_count );

	// the following looks a bit strange because the kernel combines
	// physically contiguous pages into a single entry
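	// e.g. a combined entry { address = 0x10000000, size = 0x3000 } is
	// expanded below into the three per-page GATT entries 0x10000000,
	// 0x10001000 and 0x10002000 (assuming the usual 4 KB ATI_PCIGART_PAGE_SIZE)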
	gatt_entry = gart->GATT.ptr;

	for( i = 0; i < map_count; ++i ) {
		phys_addr_t addr = map[i].address;
		size_t size = map[i].size;

		if( size == 0 )
			break;

		while( size > 0 ) {
			*gatt_entry++ = (uint32)addr;
			//SHOW_FLOW( 3, "%lx", *(gatt_entry-1) );
			addr += ATI_PCIGART_PAGE_SIZE;
			size -= ATI_PCIGART_PAGE_SIZE;
		}
	}

	delete_area(map_area);

	if( i == map_count ) {
		// this case should never happen
		SHOW_ERROR0( 0, "memory map of GART buffer too large!" );
		delete_area( gart->GATT.area );
		gart->GATT.area = -1;
		return B_ERROR;
	}

	// this might be a bit more than needed, as
	// 1. Intel CPUs have "processor order", i.e. writes appear to external
	//    devices in program order, so a simple final write should be sufficient
	// 2. if it is a PCI GART, bus snooping should provide cache coherence
	// 3. this function is a no-op :(
	clear_caches( gart->GATT.ptr, num_pages * sizeof( uint32 ),
		B_FLUSH_DCACHE );

	// back to real life - some chipsets have write buffers that
	// prove all previous assumptions wrong
	// (don't know whether this really helps though)
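	// (wbinvd below writes back and invalidates the entire CPU cache;
	// heavy-handed, but it makes sure the GATT entries have reached RAM)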
	#if defined(__INTEL__)
	asm volatile ( "wbinvd" ::: "memory" );
	#elif defined(__POWERPC__)
	// TODO : icbi on PowerPC to flush instruction cache?
	#endif
	return B_OK;
}

// destroy GART buffer
static void destroyGARTBuffer( GART_info *gart )
{
	if( gart->buffer.area > 0 )
		delete_area( gart->buffer.area );

	if( gart->buffer.unaligned_area > 0 )
		delete_area( gart->buffer.unaligned_area );

	gart->buffer.area = gart->buffer.unaligned_area = -1;
}


// destroy GATT
static void destroyGATT( GART_info *gart )
{
	if( gart->GATT.area > 0 )
		delete_area( gart->GATT.area );

	gart->GATT.area = -1;
}


// init PCI GART
status_t Radeon_InitPCIGART( device_info *di )
{
	status_t result;

	result = createGARTBuffer( &di->pci_gart, PCI_GART_SIZE );
	if( result < 0 )
		goto err1;

	result = initGATT( &di->pci_gart );
	if( result < 0 )
		goto err2;

	return B_OK;

err2:
	destroyGARTBuffer( &di->pci_gart );

err1:
	return result;
}


// cleanup PCI GART
void Radeon_CleanupPCIGART( device_info *di )
{
	vuint8 *regs = di->regs;

	SHOW_FLOW0( 3, "" );

	// perhaps we should wait for FIFO space before messing around with these
	// registers, but
	// 1. I don't want to add all the sync stuff to the kernel driver
	// 2. I doubt that these registers are buffered by the FIFO
	// still: in the worst case the CP has already written some commands to the
	// register FIFO, which could do all kinds of nasty things

	// disable CP BM
	OUTREG( regs, RADEON_CP_CSQ_CNTL, RADEON_CSQ_PRIDIS_INDDIS );
	// read-back for flushing
	INREG( regs, RADEON_CP_CSQ_CNTL );

	// disable bus mastering
	OUTREGP( regs, RADEON_BUS_CNTL, RADEON_BUS_MASTER_DIS, ~RADEON_BUS_MASTER_DIS );
	// disable PCI GART
	OUTREGP( regs, RADEON_AIC_CNTL, 0, ~RADEON_PCIGART_TRANSLATE_EN );
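		// note: OUTREGP() is a read-modify-write (see mmio.h): bits set in the
		// mask are preserved, so only BUS_MASTER_DIS respectively
		// PCIGART_TRANSLATE_EN are changed by the two writes above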

	destroyGATT( &di->pci_gart );
	destroyGARTBuffer( &di->pci_gart );
}