/*	$NetBSD: cache_mipsNN.c,v 1.10 2005/12/24 20:07:19 perry Exp $	*/

/*
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright 2001 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Jason R. Thorpe and Simon Burge for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed for the NetBSD Project by
 *	Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/mips/mips/cache_mipsNN.c 330897 2018-03-14 03:19:51Z eadler $");

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/param.h>

#include <machine/cache.h>
#include <machine/cache_r4k.h>
#include <machine/cpuinfo.h>

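/*
 * Round a region up/down to cache-line boundaries for the line sizes
 * supported below (16, 32, 64 and 128 bytes).
 */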
#define	round_line16(x)		(((x) + 15) & ~15)
#define	trunc_line16(x)		((x) & ~15)

#define	round_line32(x)		(((x) + 31) & ~31)
#define	trunc_line32(x)		((x) & ~31)

#define	round_line64(x)		(((x) + 63) & ~63)
#define	trunc_line64(x)		((x) & ~63)

#define	round_line128(x)	(((x) + 127) & ~127)
#define	trunc_line128(x)	((x) & ~127)

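/*
 * Barrier for Netlogic XLP (CPU_NLM): the .word below is the hand-encoded
 * "jr.hb $8" named in the adjacent comment, a jump with an instruction
 * hazard barrier, presumably emitted as .word so that assemblers without
 * jr.hb support can still build this file.
 */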
#if defined(CPU_NLM)
static __inline void
xlp_sync(void)
{
        __asm __volatile (
	    ".set push              \n"
	    ".set noreorder         \n"
	    ".set mips64            \n"
	    "dla    $8, 1f          \n"
	    "/* jr.hb $8 */         \n"
	    ".word 0x1000408        \n"
	    "nop                    \n"
	 "1: nop                    \n"
	    ".set pop               \n"
	    : : : "$8");
}
#endif

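/*
 * SYNC is the barrier issued after the cache loops below; SYNCI is the
 * extra step used only by the cnMIPS icache_sync_all path.  SB1250 pass-1
 * parts issue "sync" twice, XLP substitutes the jr.hb barrier above, and
 * cnMIPS maps SYNCI to mips_sync_icache().
 */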
#if defined(SB1250_PASS1)
#define	SYNC	__asm volatile("sync; sync")
#elif defined(CPU_NLM)
#define SYNC	xlp_sync()
#else
#define	SYNC	__asm volatile("sync")
#endif

#if defined(CPU_CNMIPS)
#define SYNCI  mips_sync_icache();
#elif defined(CPU_NLM)
#define SYNCI	xlp_sync()
#else
#define SYNCI
#endif

/*
 * Exported variables for consumers like bus_dma code
 */
int mips_picache_linesize;
int mips_pdcache_linesize;

static int picache_size;
static int picache_stride;
static int picache_loopcount;
static int picache_way_mask;
static int pdcache_size;
static int pdcache_stride;
static int pdcache_loopcount;
static int pdcache_way_mask;
static int sdcache_size;
static int sdcache_stride;
static int sdcache_loopcount;
static int sdcache_way_mask;

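/*
 * Precompute the stride/loopcount pairs used by the index-type operations
 * below.  The index ops touch "loopcount" copies of each line, spaced
 * "stride" bytes apart, which is enough to cover every way of the cache
 * (and every page-sized slice of a way when a way is larger than a page).
 */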
void
mipsNN_cache_init(struct mips_cpuinfo * cpuinfo)
{
	int flush_multiple_lines_per_way;

	flush_multiple_lines_per_way = cpuinfo->l1.ic_nsets * cpuinfo->l1.ic_linesize * cpuinfo->l1.ic_linesize > PAGE_SIZE;
	if (cpuinfo->icache_virtual) {
		/*
		 * With a virtual Icache we don't need to flush
		 * multiples of the page size with index ops; we just
		 * need to flush one page's worth.
		 */
		flush_multiple_lines_per_way = 0;
	}

	if (flush_multiple_lines_per_way) {
		picache_stride = PAGE_SIZE;
		picache_loopcount = (cpuinfo->l1.ic_nsets * cpuinfo->l1.ic_linesize / PAGE_SIZE) *
		    cpuinfo->l1.ic_nways;
	} else {
		picache_stride = cpuinfo->l1.ic_nsets * cpuinfo->l1.ic_linesize;
		picache_loopcount = cpuinfo->l1.ic_nways;
	}

	if (cpuinfo->l1.dc_nsets * cpuinfo->l1.dc_linesize < PAGE_SIZE) {
		pdcache_stride = cpuinfo->l1.dc_nsets * cpuinfo->l1.dc_linesize;
		pdcache_loopcount = cpuinfo->l1.dc_nways;
	} else {
		pdcache_stride = PAGE_SIZE;
		pdcache_loopcount = (cpuinfo->l1.dc_nsets * cpuinfo->l1.dc_linesize / PAGE_SIZE) *
		    cpuinfo->l1.dc_nways;
	}

	mips_picache_linesize = cpuinfo->l1.ic_linesize;
	mips_pdcache_linesize = cpuinfo->l1.dc_linesize;

	picache_size = cpuinfo->l1.ic_size;
	picache_way_mask = cpuinfo->l1.ic_nways - 1;
	pdcache_size = cpuinfo->l1.dc_size;
	pdcache_way_mask = cpuinfo->l1.dc_nways - 1;

	sdcache_stride = cpuinfo->l2.dc_nsets * cpuinfo->l2.dc_linesize;
	sdcache_loopcount = cpuinfo->l2.dc_nways;
	sdcache_size = cpuinfo->l2.dc_size;
	sdcache_way_mask = cpuinfo->l2.dc_nways - 1;

#define CACHE_DEBUG
#ifdef CACHE_DEBUG
	printf("Cache info:\n");
	if (cpuinfo->icache_virtual)
		printf("  icache is virtual\n");
	printf("  picache_stride    = %d\n", picache_stride);
	printf("  picache_loopcount = %d\n", picache_loopcount);
	printf("  pdcache_stride    = %d\n", pdcache_stride);
	printf("  pdcache_loopcount = %d\n", pdcache_loopcount);
#endif
}

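/*
 * Primary cache operations.  The numeric suffix is the cache line size in
 * bytes; the _all variants index through the whole cache via KSEG0, the
 * _range variants use hit-type ops on the given virtual range, and the
 * _range_index variants fall back to index ops when the supplied address
 * may no longer be mapped.
 */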
void
mipsNN_icache_sync_all_16(void)
{
	vm_offset_t va, eva;

	va = MIPS_PHYS_TO_KSEG0(0);
	eva = va + picache_size;

	/*
	 * Since we're hitting the whole thing, we don't have to
	 * worry about the N different "ways".
	 */

	mips_intern_dcache_wbinv_all();

	while (va < eva) {
		cache_r4k_op_32lines_16(va, CACHE_R4K_I|CACHEOP_R4K_INDEX_INV);
		va += (32 * 16);
	}

	SYNC;
}

void
mipsNN_icache_sync_all_32(void)
{
	vm_offset_t va, eva;

	va = MIPS_PHYS_TO_KSEG0(0);
	eva = va + picache_size;

	/*
	 * Since we're hitting the whole thing, we don't have to
	 * worry about the N different "ways".
	 */

	mips_intern_dcache_wbinv_all();

	while (va < eva) {
		cache_r4k_op_32lines_32(va, CACHE_R4K_I|CACHEOP_R4K_INDEX_INV);
		va += (32 * 32);
	}

	SYNC;
}

void
mipsNN_icache_sync_all_64(void)
{
	vm_offset_t va, eva;

	va = MIPS_PHYS_TO_KSEG0(0);
	eva = va + picache_size;

	/*
	 * Since we're hitting the whole thing, we don't have to
	 * worry about the N different "ways".
	 */

	mips_intern_dcache_wbinv_all();

	while (va < eva) {
		cache_r4k_op_32lines_64(va, CACHE_R4K_I|CACHEOP_R4K_INDEX_INV);
		va += (32 * 64);
	}

	SYNC;
}

void
mipsNN_icache_sync_range_16(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	eva = round_line16(va + size);
	va = trunc_line16(va);

	mips_intern_dcache_wb_range(va, (eva - va));

	while ((eva - va) >= (32 * 16)) {
		cache_r4k_op_32lines_16(va, CACHE_R4K_I|CACHEOP_R4K_HIT_INV);
		va += (32 * 16);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_I|CACHEOP_R4K_HIT_INV);
		va += 16;
	}

	SYNC;
}

void
mipsNN_icache_sync_range_32(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	eva = round_line32(va + size);
	va = trunc_line32(va);

	mips_intern_dcache_wb_range(va, (eva - va));

	while ((eva - va) >= (32 * 32)) {
		cache_r4k_op_32lines_32(va, CACHE_R4K_I|CACHEOP_R4K_HIT_INV);
		va += (32 * 32);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_I|CACHEOP_R4K_HIT_INV);
		va += 32;
	}

	SYNC;
}

void
mipsNN_icache_sync_range_64(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	eva = round_line64(va + size);
	va = trunc_line64(va);

	mips_intern_dcache_wb_range(va, (eva - va));

	while ((eva - va) >= (32 * 64)) {
		cache_r4k_op_32lines_64(va, CACHE_R4K_I|CACHEOP_R4K_HIT_INV);
		va += (32 * 64);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_I|CACHEOP_R4K_HIT_INV);
		va += 64;
	}

	SYNC;
}

void
mipsNN_icache_sync_range_index_16(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva, tmpva;
	int i, stride, loopcount;

	/*
	 * Since we're doing Index ops, we expect to not be able
	 * to access the address we've been given.  So, get the
	 * bits that determine the cache index, and make a KSEG0
	 * address out of them.
	 */
	va = MIPS_PHYS_TO_KSEG0(va & picache_way_mask);

	eva = round_line16(va + size);
	va = trunc_line16(va);

	/*
	 * GCC generates better code in the loops if we reference local
	 * copies of these global variables.
	 */
	stride = picache_stride;
	loopcount = picache_loopcount;

	mips_intern_dcache_wbinv_range_index(va, (eva - va));

	while ((eva - va) >= (8 * 16)) {
		tmpva = va;
		for (i = 0; i < loopcount; i++, tmpva += stride)
			cache_r4k_op_8lines_16(tmpva,
			    CACHE_R4K_I|CACHEOP_R4K_INDEX_INV);
		va += 8 * 16;
	}

	while (va < eva) {
		tmpva = va;
		for (i = 0; i < loopcount; i++, tmpva += stride)
			cache_op_r4k_line(tmpva,
			    CACHE_R4K_I|CACHEOP_R4K_INDEX_INV);
		va += 16;
	}
}

void
mipsNN_icache_sync_range_index_32(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva, tmpva;
	int i, stride, loopcount;

	/*
	 * Since we're doing Index ops, we expect to not be able
	 * to access the address we've been given.  So, get the
	 * bits that determine the cache index, and make a KSEG0
	 * address out of them.
	 */
	va = MIPS_PHYS_TO_KSEG0(va & picache_way_mask);

	eva = round_line32(va + size);
	va = trunc_line32(va);

	/*
	 * GCC generates better code in the loops if we reference local
	 * copies of these global variables.
	 */
	stride = picache_stride;
	loopcount = picache_loopcount;

	mips_intern_dcache_wbinv_range_index(va, (eva - va));

	while ((eva - va) >= (8 * 32)) {
		tmpva = va;
		for (i = 0; i < loopcount; i++, tmpva += stride)
			cache_r4k_op_8lines_32(tmpva,
			    CACHE_R4K_I|CACHEOP_R4K_INDEX_INV);
		va += 8 * 32;
	}

	while (va < eva) {
		tmpva = va;
		for (i = 0; i < loopcount; i++, tmpva += stride)
			cache_op_r4k_line(tmpva,
			    CACHE_R4K_I|CACHEOP_R4K_INDEX_INV);
		va += 32;
	}
}

void
mipsNN_icache_sync_range_index_64(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva, tmpva;
	int i, stride, loopcount;

	/*
	 * Since we're doing Index ops, we expect to not be able
	 * to access the address we've been given.  So, get the
	 * bits that determine the cache index, and make a KSEG0
	 * address out of them.
	 */
	va = MIPS_PHYS_TO_KSEG0(va & picache_way_mask);

	eva = round_line64(va + size);
	va = trunc_line64(va);

	/*
	 * GCC generates better code in the loops if we reference local
	 * copies of these global variables.
	 */
	stride = picache_stride;
	loopcount = picache_loopcount;

	mips_intern_dcache_wbinv_range_index(va, (eva - va));

	while ((eva - va) >= (8 * 64)) {
		tmpva = va;
		for (i = 0; i < loopcount; i++, tmpva += stride)
			cache_r4k_op_8lines_64(tmpva,
			    CACHE_R4K_I|CACHEOP_R4K_INDEX_INV);
		va += 8 * 64;
	}

	while (va < eva) {
		tmpva = va;
		for (i = 0; i < loopcount; i++, tmpva += stride)
			cache_op_r4k_line(tmpva,
			    CACHE_R4K_I|CACHEOP_R4K_INDEX_INV);
		va += 64;
	}
}

void
mipsNN_pdcache_wbinv_all_16(void)
{
	vm_offset_t va, eva;

	va = MIPS_PHYS_TO_KSEG0(0);
	eva = va + pdcache_size;

	/*
	 * Since we're hitting the whole thing, we don't have to
	 * worry about the N different "ways".
	 */

	while (va < eva) {
		cache_r4k_op_32lines_16(va,
		    CACHE_R4K_D|CACHEOP_R4K_INDEX_WB_INV);
		va += (32 * 16);
	}

	SYNC;
}

void
mipsNN_pdcache_wbinv_all_32(void)
{
	vm_offset_t va, eva;

	va = MIPS_PHYS_TO_KSEG0(0);
	eva = va + pdcache_size;

	/*
	 * Since we're hitting the whole thing, we don't have to
	 * worry about the N different "ways".
	 */

	while (va < eva) {
		cache_r4k_op_32lines_32(va,
		    CACHE_R4K_D|CACHEOP_R4K_INDEX_WB_INV);
		va += (32 * 32);
	}

	SYNC;
}

void
mipsNN_pdcache_wbinv_all_64(void)
{
	vm_offset_t va, eva;

	va = MIPS_PHYS_TO_KSEG0(0);
	eva = va + pdcache_size;

	/*
	 * Since we're hitting the whole thing, we don't have to
	 * worry about the N different "ways".
	 */

	while (va < eva) {
		cache_r4k_op_32lines_64(va,
		    CACHE_R4K_D|CACHEOP_R4K_INDEX_WB_INV);
		va += (32 * 64);
	}

	SYNC;
}

void
mipsNN_pdcache_wbinv_range_16(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	eva = round_line16(va + size);
	va = trunc_line16(va);

	while ((eva - va) >= (32 * 16)) {
		cache_r4k_op_32lines_16(va,
		    CACHE_R4K_D|CACHEOP_R4K_HIT_WB_INV);
		va += (32 * 16);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_D|CACHEOP_R4K_HIT_WB_INV);
		va += 16;
	}

	SYNC;
}

void
mipsNN_pdcache_wbinv_range_32(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	eva = round_line32(va + size);
	va = trunc_line32(va);

	while ((eva - va) >= (32 * 32)) {
		cache_r4k_op_32lines_32(va,
		    CACHE_R4K_D|CACHEOP_R4K_HIT_WB_INV);
		va += (32 * 32);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_D|CACHEOP_R4K_HIT_WB_INV);
		va += 32;
	}

	SYNC;
}

void
mipsNN_pdcache_wbinv_range_64(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	eva = round_line64(va + size);
	va = trunc_line64(va);

	while ((eva - va) >= (32 * 64)) {
		cache_r4k_op_32lines_64(va,
		    CACHE_R4K_D|CACHEOP_R4K_HIT_WB_INV);
		va += (32 * 64);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_D|CACHEOP_R4K_HIT_WB_INV);
		va += 64;
	}

	SYNC;
}

void
mipsNN_pdcache_wbinv_range_index_16(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva, tmpva;
	int i, stride, loopcount;

	/*
	 * Since we're doing Index ops, we expect to not be able
	 * to access the address we've been given.  So, get the
	 * bits that determine the cache index, and make a KSEG0
	 * address out of them.
	 */
	va = MIPS_PHYS_TO_KSEG0(va & pdcache_way_mask);

	eva = round_line16(va + size);
	va = trunc_line16(va);

	/*
	 * GCC generates better code in the loops if we reference local
	 * copies of these global variables.
	 */
	stride = pdcache_stride;
	loopcount = pdcache_loopcount;

	while ((eva - va) >= (8 * 16)) {
		tmpva = va;
		for (i = 0; i < loopcount; i++, tmpva += stride)
			cache_r4k_op_8lines_16(tmpva,
			    CACHE_R4K_D|CACHEOP_R4K_INDEX_WB_INV);
		va += 8 * 16;
	}

	while (va < eva) {
		tmpva = va;
		for (i = 0; i < loopcount; i++, tmpva += stride)
			cache_op_r4k_line(tmpva,
			    CACHE_R4K_D|CACHEOP_R4K_INDEX_WB_INV);
		va += 16;
	}
}

void
mipsNN_pdcache_wbinv_range_index_32(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva, tmpva;
	int i, stride, loopcount;

	/*
	 * Since we're doing Index ops, we expect to not be able
	 * to access the address we've been given.  So, get the
	 * bits that determine the cache index, and make a KSEG0
	 * address out of them.
	 */
	va = MIPS_PHYS_TO_KSEG0(va & pdcache_way_mask);

	eva = round_line32(va + size);
	va = trunc_line32(va);

	/*
	 * GCC generates better code in the loops if we reference local
	 * copies of these global variables.
	 */
	stride = pdcache_stride;
	loopcount = pdcache_loopcount;

	while ((eva - va) >= (8 * 32)) {
		tmpva = va;
		for (i = 0; i < loopcount; i++, tmpva += stride)
			cache_r4k_op_8lines_32(tmpva,
			    CACHE_R4K_D|CACHEOP_R4K_INDEX_WB_INV);
		va += 8 * 32;
	}

	while (va < eva) {
		tmpva = va;
		for (i = 0; i < loopcount; i++, tmpva += stride)
			cache_op_r4k_line(tmpva,
			    CACHE_R4K_D|CACHEOP_R4K_INDEX_WB_INV);
		va += 32;
	}
}

void
mipsNN_pdcache_wbinv_range_index_64(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva, tmpva;
	int i, stride, loopcount;

	/*
	 * Since we're doing Index ops, we expect to not be able
	 * to access the address we've been given.  So, get the
	 * bits that determine the cache index, and make a KSEG0
	 * address out of them.
	 */
	va = MIPS_PHYS_TO_KSEG0(va & pdcache_way_mask);

	eva = round_line64(va + size);
	va = trunc_line64(va);

	/*
	 * GCC generates better code in the loops if we reference local
	 * copies of these global variables.
	 */
	stride = pdcache_stride;
	loopcount = pdcache_loopcount;

	while ((eva - va) >= (8 * 64)) {
		tmpva = va;
		for (i = 0; i < loopcount; i++, tmpva += stride)
			cache_r4k_op_8lines_64(tmpva,
			    CACHE_R4K_D|CACHEOP_R4K_INDEX_WB_INV);
		va += 8 * 64;
	}

	while (va < eva) {
		tmpva = va;
		for (i = 0; i < loopcount; i++, tmpva += stride)
			cache_op_r4k_line(tmpva,
			    CACHE_R4K_D|CACHEOP_R4K_INDEX_WB_INV);
		va += 64;
	}
}

void
mipsNN_pdcache_inv_range_16(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	eva = round_line16(va + size);
	va = trunc_line16(va);

	while ((eva - va) >= (32 * 16)) {
		cache_r4k_op_32lines_16(va, CACHE_R4K_D|CACHEOP_R4K_HIT_INV);
		va += (32 * 16);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_D|CACHEOP_R4K_HIT_INV);
		va += 16;
	}

	SYNC;
}

void
mipsNN_pdcache_inv_range_32(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	eva = round_line32(va + size);
	va = trunc_line32(va);

	while ((eva - va) >= (32 * 32)) {
		cache_r4k_op_32lines_32(va, CACHE_R4K_D|CACHEOP_R4K_HIT_INV);
		va += (32 * 32);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_D|CACHEOP_R4K_HIT_INV);
		va += 32;
	}

	SYNC;
}

void
mipsNN_pdcache_inv_range_64(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	eva = round_line64(va + size);
	va = trunc_line64(va);

	while ((eva - va) >= (32 * 64)) {
		cache_r4k_op_32lines_64(va, CACHE_R4K_D|CACHEOP_R4K_HIT_INV);
		va += (32 * 64);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_D|CACHEOP_R4K_HIT_INV);
		va += 64;
	}

	SYNC;
}

void
mipsNN_pdcache_wb_range_16(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	eva = round_line16(va + size);
	va = trunc_line16(va);

	while ((eva - va) >= (32 * 16)) {
		cache_r4k_op_32lines_16(va, CACHE_R4K_D|CACHEOP_R4K_HIT_WB);
		va += (32 * 16);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_D|CACHEOP_R4K_HIT_WB);
		va += 16;
	}

	SYNC;
}

void
mipsNN_pdcache_wb_range_32(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	eva = round_line32(va + size);
	va = trunc_line32(va);

	while ((eva - va) >= (32 * 32)) {
		cache_r4k_op_32lines_32(va, CACHE_R4K_D|CACHEOP_R4K_HIT_WB);
		va += (32 * 32);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_D|CACHEOP_R4K_HIT_WB);
		va += 32;
	}

	SYNC;
}

void
mipsNN_pdcache_wb_range_64(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	eva = round_line64(va + size);
	va = trunc_line64(va);

	while ((eva - va) >= (32 * 64)) {
		cache_r4k_op_32lines_64(va, CACHE_R4K_D|CACHEOP_R4K_HIT_WB);
		va += (32 * 64);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_D|CACHEOP_R4K_HIT_WB);
		va += 64;
	}

	SYNC;
}

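/*
 * 128-byte line variants.  On cnMIPS (Octeon) the D-cache routines are
 * stubs (presumably because the hardware keeps the primary D-cache
 * coherent on those parts) and I-cache sync reduces to SYNCI/SYNC; every
 * other CPU gets the generic index/hit loops below.
 */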
#ifdef CPU_CNMIPS

void
mipsNN_icache_sync_all_128(void)
{
        SYNCI
}

void
mipsNN_icache_sync_range_128(vm_offset_t va, vm_size_t size)
{
	SYNC;
}

void
mipsNN_icache_sync_range_index_128(vm_offset_t va, vm_size_t size)
{
}


void
mipsNN_pdcache_wbinv_all_128(void)
{
}


void
mipsNN_pdcache_wbinv_range_128(vm_offset_t va, vm_size_t size)
{
	SYNC;
}

void
mipsNN_pdcache_wbinv_range_index_128(vm_offset_t va, vm_size_t size)
{
}

void
mipsNN_pdcache_inv_range_128(vm_offset_t va, vm_size_t size)
{
}

void
mipsNN_pdcache_wb_range_128(vm_offset_t va, vm_size_t size)
{
	SYNC;
}

#else

void
mipsNN_icache_sync_all_128(void)
{
	vm_offset_t va, eva;

	va = MIPS_PHYS_TO_KSEG0(0);
	eva = va + picache_size;

	/*
	 * Since we're hitting the whole thing, we don't have to
	 * worry about the N different "ways".
	 */

	mips_intern_dcache_wbinv_all();

	while (va < eva) {
		cache_r4k_op_32lines_128(va, CACHE_R4K_I|CACHEOP_R4K_INDEX_INV);
		va += (32 * 128);
	}

	SYNC;
}

void
mipsNN_icache_sync_range_128(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	eva = round_line128(va + size);
	va = trunc_line128(va);

	mips_intern_dcache_wb_range(va, (eva - va));

	while ((eva - va) >= (32 * 128)) {
		cache_r4k_op_32lines_128(va, CACHE_R4K_I|CACHEOP_R4K_HIT_INV);
		va += (32 * 128);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_I|CACHEOP_R4K_HIT_INV);
		va += 128;
	}

	SYNC;
}

void
mipsNN_icache_sync_range_index_128(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva, tmpva;
	int i, stride, loopcount;

	/*
	 * Since we're doing Index ops, we expect to not be able
	 * to access the address we've been given.  So, get the
	 * bits that determine the cache index, and make a KSEG0
	 * address out of them.
	 */
	va = MIPS_PHYS_TO_KSEG0(va & picache_way_mask);

	eva = round_line128(va + size);
	va = trunc_line128(va);

	/*
	 * GCC generates better code in the loops if we reference local
	 * copies of these global variables.
	 */
	stride = picache_stride;
	loopcount = picache_loopcount;

	mips_intern_dcache_wbinv_range_index(va, (eva - va));

	while ((eva - va) >= (32 * 128)) {
		tmpva = va;
		for (i = 0; i < loopcount; i++, tmpva += stride)
			cache_r4k_op_32lines_128(tmpva,
			    CACHE_R4K_I|CACHEOP_R4K_INDEX_INV);
		va += 32 * 128;
	}

	while (va < eva) {
		tmpva = va;
		for (i = 0; i < loopcount; i++, tmpva += stride)
			cache_op_r4k_line(tmpva,
			    CACHE_R4K_I|CACHEOP_R4K_INDEX_INV);
		va += 128;
	}
}

void
mipsNN_pdcache_wbinv_all_128(void)
{
	vm_offset_t va, eva;

	va = MIPS_PHYS_TO_KSEG0(0);
	eva = va + pdcache_size;

	/*
	 * Since we're hitting the whole thing, we don't have to
	 * worry about the N different "ways".
	 */

	while (va < eva) {
		cache_r4k_op_32lines_128(va,
		    CACHE_R4K_D|CACHEOP_R4K_INDEX_WB_INV);
		va += (32 * 128);
	}

	SYNC;
}


void
mipsNN_pdcache_wbinv_range_128(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	eva = round_line128(va + size);
	va = trunc_line128(va);

	while ((eva - va) >= (32 * 128)) {
		cache_r4k_op_32lines_128(va,
		    CACHE_R4K_D|CACHEOP_R4K_HIT_WB_INV);
		va += (32 * 128);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_D|CACHEOP_R4K_HIT_WB_INV);
		va += 128;
	}

	SYNC;
}

void
mipsNN_pdcache_wbinv_range_index_128(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva, tmpva;
	int i, stride, loopcount;

	/*
	 * Since we're doing Index ops, we expect to not be able
	 * to access the address we've been given.  So, get the
	 * bits that determine the cache index, and make a KSEG0
	 * address out of them.
	 */
	va = MIPS_PHYS_TO_KSEG0(va & pdcache_way_mask);

	eva = round_line128(va + size);
	va = trunc_line128(va);

	/*
	 * GCC generates better code in the loops if we reference local
	 * copies of these global variables.
	 */
	stride = pdcache_stride;
	loopcount = pdcache_loopcount;

	while ((eva - va) >= (32 * 128)) {
		tmpva = va;
		for (i = 0; i < loopcount; i++, tmpva += stride)
			cache_r4k_op_32lines_128(tmpva,
			    CACHE_R4K_D|CACHEOP_R4K_INDEX_WB_INV);
		va += 32 * 128;
	}

	while (va < eva) {
		tmpva = va;
		for (i = 0; i < loopcount; i++, tmpva += stride)
			cache_op_r4k_line(tmpva,
			    CACHE_R4K_D|CACHEOP_R4K_INDEX_WB_INV);
		va += 128;
	}
}

void
mipsNN_pdcache_inv_range_128(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	eva = round_line128(va + size);
	va = trunc_line128(va);

	while ((eva - va) >= (32 * 128)) {
		cache_r4k_op_32lines_128(va, CACHE_R4K_D|CACHEOP_R4K_HIT_INV);
		va += (32 * 128);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_D|CACHEOP_R4K_HIT_INV);
		va += 128;
	}

	SYNC;
}

void
mipsNN_pdcache_wb_range_128(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	eva = round_line128(va + size);
	va = trunc_line128(va);

	while ((eva - va) >= (32 * 128)) {
		cache_r4k_op_32lines_128(va, CACHE_R4K_D|CACHEOP_R4K_HIT_WB);
		va += (32 * 128);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_D|CACHEOP_R4K_HIT_WB);
		va += 128;
	}

	SYNC;
}

#endif

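/*
 * Secondary (L2) cache operations.  These mirror the primary-cache loops
 * but use CACHE_R4K_SD ops; the index variants simply mask the address
 * with (sdcache_size - 1) rather than iterating per way.
 */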
void
mipsNN_sdcache_wbinv_all_32(void)
{
	vm_offset_t va = MIPS_PHYS_TO_KSEG0(0);
	vm_offset_t eva = va + sdcache_size;

	while (va < eva) {
		cache_r4k_op_32lines_32(va,
		    CACHE_R4K_SD|CACHEOP_R4K_INDEX_WB_INV);
		va += (32 * 32);
	}
}

void
mipsNN_sdcache_wbinv_all_64(void)
{
	vm_offset_t va = MIPS_PHYS_TO_KSEG0(0);
	vm_offset_t eva = va + sdcache_size;

	while (va < eva) {
		cache_r4k_op_32lines_64(va,
		    CACHE_R4K_SD|CACHEOP_R4K_INDEX_WB_INV);
		va += (32 * 64);
	}
}

void
mipsNN_sdcache_wbinv_range_32(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva = round_line32(va + size);

	va = trunc_line32(va);

	while ((eva - va) >= (32 * 32)) {
		cache_r4k_op_32lines_32(va,
		    CACHE_R4K_SD|CACHEOP_R4K_HIT_WB_INV);
		va += (32 * 32);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_SD|CACHEOP_R4K_HIT_WB_INV);
		va += 32;
	}
}

void
mipsNN_sdcache_wbinv_range_64(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva = round_line64(va + size);

	va = trunc_line64(va);

	while ((eva - va) >= (32 * 64)) {
		cache_r4k_op_32lines_64(va,
		    CACHE_R4K_SD|CACHEOP_R4K_HIT_WB_INV);
		va += (32 * 64);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_SD|CACHEOP_R4K_HIT_WB_INV);
		va += 64;
	}
}

void
mipsNN_sdcache_wbinv_range_index_32(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	/*
	 * Since we're doing Index ops, we expect to not be able
	 * to access the address we've been given.  So, get the
	 * bits that determine the cache index, and make a KSEG0
	 * address out of them.
	 */
	va = MIPS_PHYS_TO_KSEG0(va & (sdcache_size - 1));

	eva = round_line32(va + size);
	va = trunc_line32(va);

	while ((eva - va) >= (32 * 32)) {
		cache_r4k_op_32lines_32(va,
		    CACHE_R4K_SD|CACHEOP_R4K_INDEX_WB_INV);
		va += (32 * 32);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_SD|CACHEOP_R4K_INDEX_WB_INV);
		va += 32;
	}
}

void
mipsNN_sdcache_wbinv_range_index_64(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	/*
	 * Since we're doing Index ops, we expect to not be able
	 * to access the address we've been given.  So, get the
	 * bits that determine the cache index, and make a KSEG0
	 * address out of them.
	 */
	va = MIPS_PHYS_TO_KSEG0(va & (sdcache_size - 1));

	eva = round_line64(va + size);
	va = trunc_line64(va);

	while ((eva - va) >= (32 * 64)) {
		cache_r4k_op_32lines_64(va,
		    CACHE_R4K_SD|CACHEOP_R4K_INDEX_WB_INV);
		va += (32 * 64);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_SD|CACHEOP_R4K_INDEX_WB_INV);
		va += 64;
	}
}

void
mipsNN_sdcache_inv_range_32(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva = round_line32(va + size);

	va = trunc_line32(va);

	while ((eva - va) >= (32 * 32)) {
		cache_r4k_op_32lines_32(va, CACHE_R4K_SD|CACHEOP_R4K_HIT_INV);
		va += (32 * 32);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_SD|CACHEOP_R4K_HIT_INV);
		va += 32;
	}
}

void
mipsNN_sdcache_inv_range_64(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva = round_line64(va + size);

	va = trunc_line64(va);

	while ((eva - va) >= (32 * 64)) {
		cache_r4k_op_32lines_64(va, CACHE_R4K_SD|CACHEOP_R4K_HIT_INV);
		va += (32 * 64);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_SD|CACHEOP_R4K_HIT_INV);
		va += 64;
	}
}

void
mipsNN_sdcache_wb_range_32(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva = round_line32(va + size);

	va = trunc_line32(va);

	while ((eva - va) >= (32 * 32)) {
		cache_r4k_op_32lines_32(va, CACHE_R4K_SD|CACHEOP_R4K_HIT_WB);
		va += (32 * 32);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_SD|CACHEOP_R4K_HIT_WB);
		va += 32;
	}
}

void
mipsNN_sdcache_wb_range_64(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva = round_line64(va + size);

	va = trunc_line64(va);

	while ((eva - va) >= (32 * 64)) {
		cache_r4k_op_32lines_64(va, CACHE_R4K_SD|CACHEOP_R4K_HIT_WB);
		va += (32 * 64);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_SD|CACHEOP_R4K_HIT_WB);
		va += 64;
	}
}

void
mipsNN_sdcache_wbinv_all_128(void)
{
	vm_offset_t va = MIPS_PHYS_TO_KSEG0(0);
	vm_offset_t eva = va + sdcache_size;

	while (va < eva) {
		cache_r4k_op_32lines_128(va,
		    CACHE_R4K_SD|CACHEOP_R4K_INDEX_WB_INV);
		va += (32 * 128);
	}
}

void
mipsNN_sdcache_wbinv_range_128(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva = round_line128(va + size);

	va = trunc_line128(va);

	while ((eva - va) >= (32 * 128)) {
		cache_r4k_op_32lines_128(va,
		    CACHE_R4K_SD|CACHEOP_R4K_HIT_WB_INV);
		va += (32 * 128);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_SD|CACHEOP_R4K_HIT_WB_INV);
		va += 128;
	}
}

void
mipsNN_sdcache_wbinv_range_index_128(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva;

	/*
	 * Since we're doing Index ops, we expect to not be able
	 * to access the address we've been given.  So, get the
	 * bits that determine the cache index, and make a KSEG0
	 * address out of them.
	 */
	va = MIPS_PHYS_TO_KSEG0(va & (sdcache_size - 1));

	eva = round_line128(va + size);
	va = trunc_line128(va);

	while ((eva - va) >= (32 * 128)) {
		cache_r4k_op_32lines_128(va,
		    CACHE_R4K_SD|CACHEOP_R4K_INDEX_WB_INV);
		va += (32 * 128);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_SD|CACHEOP_R4K_INDEX_WB_INV);
		va += 128;
	}
}

void
mipsNN_sdcache_inv_range_128(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva = round_line128(va + size);

	va = trunc_line128(va);

	while ((eva - va) >= (32 * 128)) {
		cache_r4k_op_32lines_128(va, CACHE_R4K_SD|CACHEOP_R4K_HIT_INV);
		va += (32 * 128);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_SD|CACHEOP_R4K_HIT_INV);
		va += 128;
	}
}

void
mipsNN_sdcache_wb_range_128(vm_offset_t va, vm_size_t size)
{
	vm_offset_t eva = round_line128(va + size);

	va = trunc_line128(va);

	while ((eva - va) >= (32 * 128)) {
		cache_r4k_op_32lines_128(va, CACHE_R4K_SD|CACHEOP_R4K_HIT_WB);
		va += (32 * 128);
	}

	while (va < eva) {
		cache_op_r4k_line(va, CACHE_R4K_SD|CACHEOP_R4K_HIT_WB);
		va += 128;
	}
}