/*-
 * Copyright (c) 1993 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/i386/i386/support.s 169895 2007-05-23 08:33:06Z kib $
 */

#include "opt_npx.h"

#include <machine/asmacros.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <machine/pmap.h>
#include <machine/specialreg.h>

#include "assym.s"

#define IDXSHIFT	10

	.data
	.globl	bcopy_vector
bcopy_vector:
	.long	generic_bcopy
	.globl	bzero_vector
bzero_vector:
	.long	generic_bzero
	.globl	copyin_vector
copyin_vector:
	.long	generic_copyin
	.globl	copyout_vector
copyout_vector:
	.long	generic_copyout
#if defined(I586_CPU) && defined(DEV_NPX)
kernel_fpu_lock:
	.byte	0xfe
	.space	3
#endif
	ALIGN_DATA
	.globl	intrcnt, eintrcnt
intrcnt:
	.space	INTRCNT_COUNT * 4
eintrcnt:

	.globl	intrnames, eintrnames
intrnames:
	.space	INTRCNT_COUNT * (MAXCOMLEN + 1)
eintrnames:

	.text

/*
 * bcopy family
 * void bzero(void *buf, u_int len)
 */

ENTRY(bzero)
	MEXITCOUNT
	jmp	*bzero_vector

ENTRY(generic_bzero)
	pushl	%edi
	movl	8(%esp),%edi
	movl	12(%esp),%ecx
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx
	rep
	stosb
	popl	%edi
	ret

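/*
 * A rough C equivalent of generic_bzero above (a sketch for reference,
 * not the assembled code; generic_bzero_c is a hypothetical name): a
 * word-wise pass (`rep stosl') followed by a byte-wise pass for the
 * 0-3 byte remainder (`rep stosb').
 *
 *	void
 *	generic_bzero_c(void *buf, u_int len)
 *	{
 *		u_int *wp = buf;
 *		u_char *bp;
 *		u_int i;
 *
 *		for (i = 0; i < len >> 2; i++)	// rep stosl
 *			*wp++ = 0;
 *		bp = (u_char *)wp;
 *		for (i = 0; i < (len & 3); i++)	// rep stosb
 *			*bp++ = 0;
 *	}
 */
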
#ifdef I486_CPU
ENTRY(i486_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx
	xorl	%eax,%eax
/*
 * do 64 byte chunks first
 *
 * XXX this is probably over-unrolled at least for DX2's
 */
2:
	cmpl	$64,%ecx
	jb	3f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	movl	%eax,16(%edx)
	movl	%eax,20(%edx)
	movl	%eax,24(%edx)
	movl	%eax,28(%edx)
	movl	%eax,32(%edx)
	movl	%eax,36(%edx)
	movl	%eax,40(%edx)
	movl	%eax,44(%edx)
	movl	%eax,48(%edx)
	movl	%eax,52(%edx)
	movl	%eax,56(%edx)
	movl	%eax,60(%edx)
	addl	$64,%edx
	subl	$64,%ecx
	jnz	2b
	ret

/*
 * do 16 byte chunks
 */
	SUPERALIGN_TEXT
3:
	cmpl	$16,%ecx
	jb	4f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	addl	$16,%edx
	subl	$16,%ecx
	jnz	3b
	ret

/*
 * do 4 byte chunks
 */
	SUPERALIGN_TEXT
4:
	cmpl	$4,%ecx
	jb	5f
	movl	%eax,(%edx)
	addl	$4,%edx
	subl	$4,%ecx
	jnz	4b
	ret

/*
 * do 1 byte chunks
 * a jump table seems to be faster than a loop or more range reductions
 *
 * XXX need a const section for non-text
 */
	.data
jtab:
	.long	do0
	.long	do1
	.long	do2
	.long	do3

	.text
	SUPERALIGN_TEXT
5:
	jmp	*jtab(,%ecx,4)

	SUPERALIGN_TEXT
do3:
	movw	%ax,(%edx)
	movb	%al,2(%edx)
	ret

	SUPERALIGN_TEXT
do2:
	movw	%ax,(%edx)
	ret

	SUPERALIGN_TEXT
do1:
	movb	%al,(%edx)
	ret

	SUPERALIGN_TEXT
do0:
	ret
#endif

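/*
 * The jump table above stands in for a C switch on the remaining count
 * (%ecx is 0-3 by the time label 5 is reached); a sketch, with `p' a
 * hypothetical stand-in for the cursor held in %edx:
 *
 *	switch (len) {			// jmp *jtab(,%ecx,4)
 *	case 3: *(u_short *)p = 0; *((u_char *)p + 2) = 0; break;
 *	case 2: *(u_short *)p = 0; break;
 *	case 1: *(u_char *)p = 0; break;
 *	case 0: break;
 *	}
 */
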
#if defined(I586_CPU) && defined(DEV_NPX)
ENTRY(i586_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx

	/*
	 * The FPU register method is twice as fast as the integer register
	 * method unless the target is in the L1 cache and we pre-allocate a
	 * cache line for it (then the integer register method is 4-5 times
	 * faster).  However, we never pre-allocate cache lines, since that
	 * would make the integer method 25% or more slower for the common
	 * case when the target isn't in either the L1 cache or the L2 cache.
	 * Thus we normally use the FPU register method unless the overhead
	 * would be too large.
	 */
	cmpl	$256,%ecx	/* empirical; clts, fninit, smsw cost a lot */
	jb	intreg_i586_bzero

	/*
	 * The FPU registers may belong to an application or to fastmove()
	 * or to another invocation of bcopy() or ourself in a higher level
	 * interrupt or trap handler.  Preserving the registers is
	 * complicated since we avoid it if possible at all levels.  We
	 * want to localize the complications even when that increases them.
	 * Here the extra work involves preserving CR0_TS in TS.
	 * `fpcurthread != NULL' is supposed to be the condition that all the
	 * FPU resources belong to an application, but fpcurthread and CR0_TS
	 * aren't set atomically enough for this condition to work in
	 * interrupt handlers.
	 *
	 * Case 1: FPU registers belong to the application: we must preserve
	 * the registers if we use them, so we only use the FPU register
	 * method if the target size is large enough to amortize the extra
	 * overhead for preserving them.  CR0_TS must be preserved although
	 * it is very likely to end up as set.
	 *
	 * Case 2: FPU registers belong to fastmove(): fastmove() currently
	 * makes the registers look like they belong to an application so
	 * that cpu_switch() and savectx() don't have to know about it, so
	 * this case reduces to case 1.
	 *
	 * Case 3: FPU registers belong to the kernel: don't use the FPU
	 * register method.  This case is unlikely, and supporting it would
	 * be more complicated and might take too much stack.
	 *
	 * Case 4: FPU registers don't belong to anyone: the FPU registers
	 * don't need to be preserved, so we always use the FPU register
	 * method.  CR0_TS must be preserved although it is very likely to
	 * always end up as clear.
	 */
	cmpl	$0,PCPU(FPCURTHREAD)
	je	i586_bz1

	/*
	 * XXX don't use the FPU for cases 1 and 2, since preemptive
	 * scheduling of ithreads broke these cases.  Note that we can
	 * no longer get here from an interrupt handler, since the
	 * context switch to the interrupt handler will have saved the
	 * FPU state.
	 */
	jmp	intreg_i586_bzero

	cmpl	$256+184,%ecx		/* empirical; not quite 2*108 more */
	jb	intreg_i586_bzero
	sarb	$1,kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	subl	$108,%esp
	fnsave	0(%esp)
	jmp	i586_bz2

i586_bz1:
	sarb	$1,kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	fninit				/* XXX should avoid needing this */
i586_bz2:
	fldz

	/*
	 * Align to an 8 byte boundary (misalignment in the main loop would
	 * cost a factor of >= 2).  Avoid jumps (at little cost if it is
	 * already aligned) by always zeroing 8 bytes and using the part up
	 * to the _next_ alignment position.
	 */
	fstl	0(%edx)
	addl	%edx,%ecx		/* part of %ecx -= new_%edx - %edx */
	addl	$8,%edx
	andl	$~7,%edx
	subl	%edx,%ecx

	/*
	 * Similarly align `len' to a multiple of 8.
	 */
	fstl	-8(%edx,%ecx)
	decl	%ecx
	andl	$~7,%ecx

	/*
	 * This wouldn't be any faster if it were unrolled, since the loop
	 * control instructions are much faster than the fstl and/or done
	 * in parallel with it so their overhead is insignificant.
	 */
fpureg_i586_bzero_loop:
	fstl	0(%edx)
	addl	$8,%edx
	subl	$8,%ecx
	cmpl	$8,%ecx
	jae	fpureg_i586_bzero_loop

	cmpl	$0,PCPU(FPCURTHREAD)
	je	i586_bz3

	/* XXX check that the condition for cases 1-2 stayed false. */
i586_bzero_oops:
	int	$3
	jmp	i586_bzero_oops

	frstor	0(%esp)
	addl	$108,%esp
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock
	ret

i586_bz3:
	fstp	%st(0)
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock
	ret

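/*
 * The 8-byte alignment trick used above, in C terms (a sketch; entry
 * guarantees len >= 256 so both fstl stores are in bounds):
 *
 *	// fstl wrote 8 zero bytes at dst, so rounding up skips no data:
 *	new_dst = (dst + 8) & ~7;	// next 8-aligned address
 *	len -= new_dst - dst;		// 1..8 bytes already zeroed
 *	// the second fstl wrote the last 8 bytes, so len may round down:
 *	len = (len - 1) & ~7;		// multiple of 8 for the loop
 */
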
intreg_i586_bzero:
	/*
	 * `rep stos' seems to be the best method in practice for small
	 * counts.  Fancy methods usually take too long to start up due
	 * to cache and BTB misses.
	 */
	pushl	%edi
	movl	%edx,%edi
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx
	jne	1f
	popl	%edi
	ret

1:
	rep
	stosb
	popl	%edi
	ret
#endif /* I586_CPU && defined(DEV_NPX) */

ENTRY(sse2_pagezero)
	pushl	%ebx
	movl	8(%esp),%ecx
	movl	%ecx,%eax
	addl	$4096,%eax
	xor	%ebx,%ebx
1:
	movnti	%ebx,(%ecx)
	addl	$4,%ecx
	cmpl	%ecx,%eax
	jne	1b
	sfence
	popl	%ebx
	ret

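/*
 * sse2_pagezero sketched with compiler intrinsics (an illustration
 * only; assumes <emmintrin.h> and the 4096-byte page size hard-coded
 * above, and sse2_pagezero_c is a hypothetical name):
 *
 *	#include <emmintrin.h>
 *
 *	void
 *	sse2_pagezero_c(void *addr)
 *	{
 *		int *p, *end = (int *)((char *)addr + 4096);
 *
 *		for (p = addr; p != end; p++)
 *			_mm_stream_si32(p, 0);	// movnti: bypass the caches
 *		_mm_sfence();			// order the NT stores (sfence)
 *	}
 */
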
ENTRY(i686_pagezero)
	pushl	%edi
	pushl	%ebx

	movl	12(%esp),%edi
	movl	$1024,%ecx
	cld

	ALIGN_TEXT
1:
	xorl	%eax,%eax
	repe
	scasl
	jnz	2f

	popl	%ebx
	popl	%edi
	ret

	ALIGN_TEXT

2:
	incl	%ecx
	subl	$4,%edi

	movl	%ecx,%edx
	cmpl	$16,%ecx

	jge	3f

	movl	%edi,%ebx
	andl	$0x3f,%ebx
	shrl	%ebx
	shrl	%ebx
	movl	$16,%ecx
	subl	%ebx,%ecx

3:
	subl	%ecx,%edx
	rep
	stosl

	movl	%edx,%ecx
	testl	%edx,%edx
	jnz	1b

	popl	%ebx
	popl	%edi
	ret

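/*
 * The idea behind i686_pagezero, as a C sketch (illustration only;
 * i686_pagezero_c is a hypothetical name): scan with `repe scasl' and
 * write only where the page is not already zero, so a mostly-zero page
 * does not get every cache line dirtied.
 *
 *	void
 *	i686_pagezero_c(void *addr)
 *	{
 *		u_int *p = addr, *end = p + 1024;
 *
 *		while (p < end) {
 *			if (*p == 0) {		// repe scasl skips these
 *				p++;
 *				continue;
 *			}
 *			*p++ = 0;		// rep stosl (batched above)
 *		}
 *	}
 *
 * The real code zeroes runs of up to 16 longs at a time, trimmed so a
 * run ends on a 64-byte line boundary, rather than one long per pass.
 */
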
/* fillw(pat, base, cnt) */
ENTRY(fillw)
	pushl	%edi
	movl	8(%esp),%eax
	movl	12(%esp),%edi
	movl	16(%esp),%ecx
	cld
	rep
	stosw
	popl	%edi
	ret

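/*
 * fillw in C, for reference (a sketch; fillw_c is a hypothetical name):
 *
 *	void
 *	fillw_c(int pat, void *base, size_t cnt)
 *	{
 *		u_short *p = base;
 *
 *		while (cnt-- != 0)		// rep stosw
 *			*p++ = (u_short)pat;
 *	}
 */
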
ENTRY(bcopyb)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx
	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f
	cld					/* nope, copy forwards */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards. */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	std
	rep
	movsb
	popl	%edi
	popl	%esi
	cld
	ret

ENTRY(bcopy)
	MEXITCOUNT
	jmp	*bcopy_vector

/*
 * generic_bcopy(src, dst, cnt)
 *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
 */
ENTRY(generic_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* nope, copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx				/* any fractional bytes? */
	std
	rep
	movsb
	movl	20(%esp),%ecx			/* copy remainder by 32-bit words */
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret

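/*
 * The overlap test in bcopyb and generic_bcopy relies on unsigned
 * arithmetic: for bcopy(src, dst, len), (u_int)(dst - src) >= len
 * holds exactly when a forward copy is safe; the only case needing a
 * backward copy is an overlap with src < dst.  A sketch of the
 * dispatch:
 *
 *	if ((u_int)((char *)dst - (char *)src) >= len) {
 *		// forward: longs first, then the 0-3 tail bytes
 *	} else {
 *		// backward from the top with the direction flag set
 *		// (std): tail bytes first, then longs
 *	}
 */
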
#if defined(I586_CPU) && defined(DEV_NPX)
ENTRY(i586_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	cmpl	$1024,%ecx
	jb	small_i586_bcopy

	sarb	$1,kernel_fpu_lock
	jc	small_i586_bcopy
	cmpl	$0,PCPU(FPCURTHREAD)
	je	i586_bc1

	/* XXX turn off handling of cases 1-2, as above. */
	movb	$0xfe,kernel_fpu_lock
	jmp	small_i586_bcopy

	smsw	%dx
	clts
	subl	$108,%esp
	fnsave	0(%esp)
	jmp	4f

i586_bc1:
	smsw	%dx
	clts
	fninit				/* XXX should avoid needing this */

	ALIGN_TEXT
4:
	pushl	%ecx
#define	DCACHE_SIZE	8192
	cmpl	$(DCACHE_SIZE-512)/2,%ecx
	jbe	2f
	movl	$(DCACHE_SIZE-512)/2,%ecx
2:
	subl	%ecx,0(%esp)
	cmpl	$256,%ecx
	jb	5f			/* XXX should prefetch if %ecx >= 32 */
	pushl	%esi
	pushl	%ecx
	ALIGN_TEXT
3:
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	popl	%ecx
	popl	%esi
5:
	ALIGN_TEXT
large_i586_bcopy_loop:
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$64,%esi
	addl	$64,%edi
	subl	$64,%ecx
	cmpl	$64,%ecx
	jae	large_i586_bcopy_loop
	popl	%eax
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b

	cmpl	$0,PCPU(FPCURTHREAD)
	je	i586_bc2

	/* XXX check that the condition for cases 1-2 stayed false. */
i586_bcopy_oops:
	int	$3
	jmp	i586_bcopy_oops

	frstor	0(%esp)
	addl	$108,%esp
i586_bc2:
	lmsw	%dx
	movb	$0xfe,kernel_fpu_lock

/*
 * This is a duplicate of the main part of generic_bcopy.  See the comments
 * there.  Jumping into generic_bcopy would cost a whole 0-1 cycles and
 * would mess up high resolution profiling.
 */
	ALIGN_TEXT
small_i586_bcopy:
	shrl	$2,%ecx
	cld
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx
	std
	rep
	movsb
	movl	20(%esp),%ecx
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret
#endif /* I586_CPU && defined(DEV_NPX) */

/*
 * Note: memcpy does not support overlapping copies
 */
ENTRY(memcpy)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi
	movl	16(%esp),%esi
	movl	20(%esp),%ecx
	movl	%edi,%eax
	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%esi
	popl	%edi
	ret


/*****************************************************************************/
/* copyout and fubyte family                                                 */
/*****************************************************************************/
/*
 * Access user memory from inside the kernel. These routines and possibly
 * the math- and DOS emulators should be the only places that do this.
 *
 * We have to access the memory with user's permissions, so use a segment
 * selector with RPL 3. For writes to user space we have to additionally
 * check the PTE for write permission, because the 386 does not check
 * write permissions when we are executing with EPL 0. The 486 does check
 * this if the WP bit is set in CR0, so we can use a simpler version here.
 *
 * These routines set curpcb->onfault for the time they execute. When a
 * protection violation occurs inside the functions, the trap handler
 * returns to *curpcb->onfault instead of the function.
 */

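/*
 * The pcb_onfault protocol described above, in C-like pseudocode (the
 * assembly below is the real implementation):
 *
 *	curpcb->pcb_onfault = copyout_fault;	// arm the recovery point
 *	...copy to/from user memory...		// may page-fault
 *	curpcb->pcb_onfault = NULL;		// disarm
 *	return (0);
 *
 *	copyout_fault:				// trap handler jumps here
 *		curpcb->pcb_onfault = NULL;	// on an unhandled fault
 *		return (EFAULT);
 */
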
/*
 * copyout(from_kernel, to_user, len)  - MP SAFE
 */
ENTRY(copyout)
	MEXITCOUNT
	jmp	*copyout_vector

ENTRY(generic_copyout)
	movl	PCPU(CURPCB),%eax
	movl	$copyout_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ebx
	testl	%ebx,%ebx			/* anything to do? */
	jz	done_copyout

	/*
	 * Check explicitly for non-user addresses.  If 486 write protection
	 * is being used, this check is essential because we are in kernel
	 * mode so the h/w does not provide any protection against writing
	 * kernel addresses.
	 */

	/*
	 * First, prevent address wrapping.
	 */
	movl	%edi,%eax
	addl	%ebx,%eax
	jc	copyout_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
	cmpl	$VM_MAXUSER_ADDRESS,%eax
	ja	copyout_fault

	/* bcopy(%esi, %edi, %ebx) */
	movl	%ebx,%ecx

#if defined(I586_CPU) && defined(DEV_NPX)
	ALIGN_TEXT
slow_copyout:
#endif
	shrl	$2,%ecx
	cld
	rep
	movsl
	movb	%bl,%cl
	andb	$3,%cl
	rep
	movsb

done_copyout:
	popl	%ebx
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%edx
	movl	%eax,PCB_ONFAULT(%edx)
	ret

	ALIGN_TEXT
copyout_fault:
	popl	%ebx
	popl	%edi
	popl	%esi
	movl	PCPU(CURPCB),%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret

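/*
 * The two validity checks performed in generic_copyout above, in C
 * terms (a sketch; `to' and `len' name the user pointer and count):
 *
 *	if ((u_int)to + len < (u_int)to)	// addl; jc: wrapped around
 *		return (EFAULT);
 *	if ((u_int)to + len > VM_MAXUSER_ADDRESS) // past end of user space
 *		return (EFAULT);
 */
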
#if defined(I586_CPU) && defined(DEV_NPX)
ENTRY(i586_copyout)
	/*
	 * Duplicated from generic_copyout.  Could be done a bit better.
	 */
	movl	PCPU(CURPCB),%eax
	movl	$copyout_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ebx
	testl	%ebx,%ebx			/* anything to do? */
	jz	done_copyout

	/*
	 * Check explicitly for non-user addresses.  If 486 write protection
	 * is being used, this check is essential because we are in kernel
	 * mode so the h/w does not provide any protection against writing
	 * kernel addresses.
	 */

	/*
	 * First, prevent address wrapping.
	 */
	movl	%edi,%eax
	addl	%ebx,%eax
	jc	copyout_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
	cmpl	$VM_MAXUSER_ADDRESS,%eax
	ja	copyout_fault

	/* bcopy(%esi, %edi, %ebx) */
3:
	movl	%ebx,%ecx
	/*
	 * End of duplicated code.
	 */

	cmpl	$1024,%ecx
	jb	slow_copyout

	pushl	%ecx
	call	fastmove
	addl	$4,%esp
	jmp	done_copyout
#endif /* I586_CPU && defined(DEV_NPX) */

/*
 * copyin(from_user, to_kernel, len) - MP SAFE
 */
ENTRY(copyin)
	MEXITCOUNT
	jmp	*copyin_vector

ENTRY(generic_copyin)
	movl	PCPU(CURPCB),%eax
	movl	$copyin_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi			/* caddr_t from */
	movl	16(%esp),%edi			/* caddr_t to */
	movl	20(%esp),%ecx			/* size_t  len */

	/*
	 * make sure address is valid
	 */
	movl	%esi,%edx
	addl	%ecx,%edx
	jc	copyin_fault
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	copyin_fault

#if defined(I586_CPU) && defined(DEV_NPX)
	ALIGN_TEXT
slow_copyin:
#endif
	movb	%cl,%al
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

#if defined(I586_CPU) && defined(DEV_NPX)
	ALIGN_TEXT
done_copyin:
#endif
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%edx
	movl	%eax,PCB_ONFAULT(%edx)
	ret

	ALIGN_TEXT
copyin_fault:
	popl	%edi
	popl	%esi
	movl	PCPU(CURPCB),%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret

#if defined(I586_CPU) && defined(DEV_NPX)
ENTRY(i586_copyin)
	/*
	 * Duplicated from generic_copyin.  Could be done a bit better.
	 */
	movl	PCPU(CURPCB),%eax
	movl	$copyin_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi			/* caddr_t from */
	movl	16(%esp),%edi			/* caddr_t to */
	movl	20(%esp),%ecx			/* size_t  len */

	/*
	 * make sure address is valid
	 */
	movl	%esi,%edx
	addl	%ecx,%edx
	jc	copyin_fault
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	copyin_fault
	/*
	 * End of duplicated code.
	 */

	cmpl	$1024,%ecx
	jb	slow_copyin

	pushl	%ebx			/* XXX prepare for fastmove_fault */
	pushl	%ecx
	call	fastmove
	addl	$8,%esp
	jmp	done_copyin
#endif /* I586_CPU && defined(DEV_NPX) */

#if defined(I586_CPU) && defined(DEV_NPX)
/* fastmove(src, dst, len)
	src in %esi
	dst in %edi
	len in %ecx		XXX changed to on stack for profiling
	uses %eax and %edx for tmp. storage
 */
/* XXX use ENTRY() to get profiling.  fastmove() is actually a non-entry. */
ENTRY(fastmove)
	pushl	%ebp
	movl	%esp,%ebp
	subl	$PCB_SAVEFPU_SIZE+3*4,%esp

	movl	8(%ebp),%ecx
	cmpl	$63,%ecx
	jbe	fastmove_tail

	testl	$7,%esi	/* check if src addr is multiple of 8 */
	jnz	fastmove_tail

	testl	$7,%edi	/* check if dst addr is multiple of 8 */
	jnz	fastmove_tail

	/* XXX grab FPU context atomically. */
	cli

/* if (fpcurthread != NULL) { */
	cmpl	$0,PCPU(FPCURTHREAD)
	je	6f
/*    fnsave(&curpcb->pcb_savefpu); */
	movl	PCPU(CURPCB),%eax
	fnsave	PCB_SAVEFPU(%eax)
/*   FPCURTHREAD = NULL; */
	movl	$0,PCPU(FPCURTHREAD)
/* } */
6:
/* now we own the FPU. */

/*
 * The process' FP state is saved in the pcb, but if we get
 * switched, the cpu_switch() will store our FP state in the
 * pcb.  It should be possible to avoid all the copying for
 * this, e.g., by setting a flag to tell cpu_switch() to
 * save the state somewhere else.
 */
/* tmp = curpcb->pcb_savefpu; */
	movl	%ecx,-12(%ebp)
	movl	%esi,-8(%ebp)
	movl	%edi,-4(%ebp)
	movl	%esp,%edi
	movl	PCPU(CURPCB),%esi
	addl	$PCB_SAVEFPU,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	movl	-12(%ebp),%ecx
	movl	-8(%ebp),%esi
	movl	-4(%ebp),%edi
/* stop_emulating(); */
	clts
/* fpcurthread = curthread; */
	movl	PCPU(CURTHREAD),%eax
	movl	%eax,PCPU(FPCURTHREAD)
	movl	PCPU(CURPCB),%eax

	/* XXX end of atomic FPU context grab. */
	sti

	movl	$fastmove_fault,PCB_ONFAULT(%eax)
4:
	movl	%ecx,-12(%ebp)
	cmpl	$1792,%ecx
	jbe	2f
	movl	$1792,%ecx
2:
	subl	%ecx,-12(%ebp)
	cmpl	$256,%ecx
	jb	5f
	movl	%ecx,-8(%ebp)
	movl	%esi,-4(%ebp)
	ALIGN_TEXT
3:
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	movl	-8(%ebp),%ecx
	movl	-4(%ebp),%esi
5:
	ALIGN_TEXT
fastmove_loop:
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$-64,%ecx
	addl	$64,%esi
	addl	$64,%edi
	cmpl	$63,%ecx
	ja	fastmove_loop
	movl	-12(%ebp),%eax
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b

	/* XXX ungrab FPU context atomically. */
	cli

/* curpcb->pcb_savefpu = tmp; */
	movl	%ecx,-12(%ebp)
	movl	%esi,-8(%ebp)
	movl	%edi,-4(%ebp)
	movl	PCPU(CURPCB),%edi
	addl	$PCB_SAVEFPU,%edi
	movl	%esp,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	movl	-12(%ebp),%ecx
	movl	-8(%ebp),%esi
	movl	-4(%ebp),%edi

/* start_emulating(); */
	smsw	%ax
	orb	$CR0_TS,%al
	lmsw	%ax
/* fpcurthread = NULL; */
	movl	$0,PCPU(FPCURTHREAD)

	/* XXX end of atomic FPU context ungrab. */
	sti

	ALIGN_TEXT
fastmove_tail:
	movl	PCPU(CURPCB),%eax
	movl	$fastmove_tail_fault,PCB_ONFAULT(%eax)

	movb	%cl,%al
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

	movl	%ebp,%esp
	popl	%ebp
	ret

	ALIGN_TEXT
fastmove_fault:
	/* XXX ungrab FPU context atomically. */
	cli

	movl	PCPU(CURPCB),%edi
	addl	$PCB_SAVEFPU,%edi
	movl	%esp,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl

	smsw	%ax
	orb	$CR0_TS,%al
	lmsw	%ax
	movl	$0,PCPU(FPCURTHREAD)

	/* XXX end of atomic FPU context ungrab. */
	sti

fastmove_tail_fault:
	movl	%ebp,%esp
	popl	%ebp
	addl	$8,%esp
	popl	%ebx
	popl	%edi
	popl	%esi
	movl	PCPU(CURPCB),%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret
#endif /* I586_CPU && defined(DEV_NPX) */

/*
 * casuword.  Compare and set user word.  Returns -1 or the current value.
 */

ALTENTRY(casuword32)
ENTRY(casuword)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx			/* dst */
	movl	8(%esp),%eax			/* old */
	movl	12(%esp),%ecx			/* new */

	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address is valid */
	ja	fusufault

#ifdef SMP
	lock
#endif
	cmpxchgl %ecx,(%edx)			/* Compare and set. */

	/*
	 * The old value is in %eax.  If the store succeeded it will be the
	 * value we expected (old) from before the store, otherwise it will
	 * be the current value.
	 */

	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	$0,PCB_ONFAULT(%ecx)
	ret

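/*
 * casuword in C terms (a sketch only; __sync_val_compare_and_swap
 * stands in for the `lock cmpxchgl' above, and the fusufault path that
 * returns -1 on a page fault is elided; casuword_c is a hypothetical
 * name):
 *
 *	u_long
 *	casuword_c(volatile u_long *p, u_long old, u_long new)
 *	{
 *		if ((u_int)p > VM_MAXUSER_ADDRESS - 4)
 *			return (-1);
 *		// returns the value found at *p: `old' on success,
 *		// the conflicting current value otherwise
 *		return (__sync_val_compare_and_swap(p, old, new));
 *	}
 */
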
/*
 * Fetch (load) a 32-bit word, a 16-bit word, or an 8-bit byte from user
 * memory.  All these functions are MPSAFE.
 */

ALTENTRY(fuword32)
ENTRY(fuword)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx			/* from */

	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address is valid */
	ja	fusufault

	movl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

/*
 * fuswintr() and suswintr() are specialized variants of fuword16() and
 * suword16(), respectively.  They are called from the profiling code,
 * potentially at interrupt time.  If they fail, that's okay; good things
 * will happen later.  They always fail for now, until the trap code is
 * able to deal with this.
 */
ALTENTRY(suswintr)
ENTRY(fuswintr)
	movl	$-1,%eax
	ret

ENTRY(fuword16)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-2,%edx
	ja	fusufault

	movzwl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

ENTRY(fubyte)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-1,%edx
	ja	fusufault

	movzbl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

	ALIGN_TEXT
fusufault:
	movl	PCPU(CURPCB),%ecx
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)
	decl	%eax
	ret

/*
 * Store a 32-bit word, a 16-bit word, or an 8-bit byte to user memory.
 * All these functions are MPSAFE.
 */

ALTENTRY(suword32)
ENTRY(suword)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address validity */
	ja	fusufault

	movl	8(%esp),%eax
	movl	%eax,(%edx)
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%ecx
	movl	%eax,PCB_ONFAULT(%ecx)
	ret

ENTRY(suword16)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-2,%edx	/* verify address validity */
	ja	fusufault

	movw	8(%esp),%ax
	movw	%ax,(%edx)
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%ecx		/* restore trashed register */
	movl	%eax,PCB_ONFAULT(%ecx)
	ret

ENTRY(subyte)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-1,%edx	/* verify address validity */
	ja	fusufault

	movb	8(%esp),%al
	movb	%al,(%edx)
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%ecx		/* restore trashed register */
	movl	%eax,PCB_ONFAULT(%ecx)
	ret

/*
 * copyinstr(from, to, maxlen, int *lencopied) - MP SAFE
 *
 *	copy a string from from to to, stop when a 0 character is reached.
 *	return ENAMETOOLONG if string is longer than maxlen, and
 *	EFAULT on protection violations. If lencopied is non-zero,
 *	return the actual length in *lencopied.
 */
ENTRY(copyinstr)
	pushl	%esi
	pushl	%edi
	movl	PCPU(CURPCB),%ecx
	movl	$cpystrflt,PCB_ONFAULT(%ecx)

	movl	12(%esp),%esi			/* %esi = from */
	movl	16(%esp),%edi			/* %edi = to */
	movl	20(%esp),%edx			/* %edx = maxlen */

	movl	$VM_MAXUSER_ADDRESS,%eax

	/* make sure 'from' is within bounds */
	subl	%esi,%eax
	jbe	cpystrflt

	/* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */
	cmpl	%edx,%eax
	jae	1f
	movl	%eax,%edx
	movl	%eax,20(%esp)
1:
	incl	%edx
	cld

2:
	decl	%edx
	jz	3f

	lodsb
	stosb
	orb	%al,%al
	jnz	2b

	/* Success -- 0 byte reached */
	decl	%edx
	xorl	%eax,%eax
	jmp	cpystrflt_x
3:
	/* edx is zero - return ENAMETOOLONG or EFAULT */
	cmpl	$VM_MAXUSER_ADDRESS,%esi
	jae	cpystrflt
4:
	movl	$ENAMETOOLONG,%eax
	jmp	cpystrflt_x

cpystrflt:
	movl	$EFAULT,%eax

cpystrflt_x:
	/* set *lencopied and return %eax */
	movl	PCPU(CURPCB),%ecx
	movl	$0,PCB_ONFAULT(%ecx)
	movl	20(%esp),%ecx
	subl	%edx,%ecx
	movl	24(%esp),%edx
	testl	%edx,%edx
	jz	1f
	movl	%ecx,(%edx)
1:
	popl	%edi
	popl	%esi
	ret

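/*
 * copyinstr semantics in C (a sketch; the user-address clamping and
 * the pcb_onfault-based EFAULT recovery are elided; copyinstr_c is a
 * hypothetical name):
 *
 *	int
 *	copyinstr_c(const char *from, char *to, size_t maxlen,
 *	    size_t *lencopied)
 *	{
 *		size_t i = 0;
 *		int error = ENAMETOOLONG;
 *
 *		while (i < maxlen) {
 *			i++;
 *			if ((*to++ = *from++) == '\0') {
 *				error = 0;	// NUL reached: success
 *				break;
 *			}
 *		}
 *		if (lencopied != NULL)
 *			*lencopied = i;		// count includes the NUL
 *		return (error);
 *	}
 */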

/*
 * copystr(from, to, maxlen, int *lencopied) - MP SAFE
 */
ENTRY(copystr)
	pushl	%esi
	pushl	%edi

	movl	12(%esp),%esi			/* %esi = from */
	movl	16(%esp),%edi			/* %edi = to */
	movl	20(%esp),%edx			/* %edx = maxlen */
	incl	%edx
	cld
1:
	decl	%edx
	jz	4f
	lodsb
	stosb
	orb	%al,%al
	jnz	1b

	/* Success -- 0 byte reached */
	decl	%edx
	xorl	%eax,%eax
	jmp	6f
4:
	/* edx is zero -- return ENAMETOOLONG */
	movl	$ENAMETOOLONG,%eax

6:
	/* set *lencopied and return %eax */
	movl	20(%esp),%ecx
	subl	%edx,%ecx
	movl	24(%esp),%edx
	testl	%edx,%edx
	jz	7f
	movl	%ecx,(%edx)
7:
	popl	%edi
	popl	%esi
	ret

ENTRY(bcmp)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi
	movl	16(%esp),%esi
	movl	20(%esp),%edx

	movl	%edx,%ecx
	shrl	$2,%ecx
	cld					/* compare forwards */
	repe
	cmpsl
	jne	1f

	movl	%edx,%ecx
	andl	$3,%ecx
	repe
	cmpsb
1:
	setne	%al
	movsbl	%al,%eax
	popl	%esi
	popl	%edi
	ret

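/*
 * bcmp in C (a sketch; bcmp_c is a hypothetical name): compare
 * long-wise, then the 0-3 byte tail.  Via `setne' above, the result is
 * only ever 0 (equal) or 1 (different), not a signed ordering.
 *
 *	int
 *	bcmp_c(const void *b1, const void *b2, size_t len)
 *	{
 *		const u_int *p1 = b1, *p2 = b2;
 *		const u_char *c1, *c2;
 *		size_t i;
 *
 *		for (i = 0; i < len >> 2; i++)		// repe cmpsl
 *			if (*p1++ != *p2++)
 *				return (1);
 *		c1 = (const u_char *)p1;
 *		c2 = (const u_char *)p2;
 *		for (i = 0; i < (len & 3); i++)		// repe cmpsb
 *			if (*c1++ != *c2++)
 *				return (1);
 *		return (0);
 *	}
 */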

/*
 * Handling of special 386 registers and descriptor tables etc
 */
/* void lgdt(struct region_descriptor *rdp); */
ENTRY(lgdt)
	/* reload the descriptor table */
	movl	4(%esp),%eax
	lgdt	(%eax)

	/* flush the prefetch q */
	jmp	1f
	nop
1:
	/* reload "stale" selectors */
	movl	$KDSEL,%eax
	movl	%eax,%ds
	movl	%eax,%es
	movl	%eax,%gs
	movl	%eax,%ss
	movl	$KPSEL,%eax
	movl	%eax,%fs

	/* reload code selector by turning return into intersegmental return */
	movl	(%esp),%eax
	pushl	%eax
	movl	$KCSEL,4(%esp)
	MEXITCOUNT
	lret

/* ssdtosd(*ssdp,*sdp) */
ENTRY(ssdtosd)
	pushl	%ebx
	movl	8(%esp),%ecx
	movl	8(%ecx),%ebx
	shll	$16,%ebx
	movl	(%ecx),%edx
	roll	$16,%edx
	movb	%dh,%bl
	movb	%dl,%bh
	rorl	$8,%ebx
	movl	4(%ecx),%eax
	movw	%ax,%dx
	andl	$0xf0000,%eax
	orl	%eax,%ebx
	movl	12(%esp),%ecx
	movl	%edx,(%ecx)
	movl	%ebx,4(%ecx)
	popl	%ebx
	ret

/* void reset_dbregs() */
ENTRY(reset_dbregs)
	movl    $0,%eax
	movl    %eax,%dr7     /* disable all breakpoints first */
	movl    %eax,%dr0
	movl    %eax,%dr1
	movl    %eax,%dr2
	movl    %eax,%dr3
	movl    %eax,%dr6
	ret

/*****************************************************************************/
/* setjmp, longjmp                                                           */
/*****************************************************************************/

ENTRY(setjmp)
	movl	4(%esp),%eax
	movl	%ebx,(%eax)			/* save ebx */
	movl	%esp,4(%eax)			/* save esp */
	movl	%ebp,8(%eax)			/* save ebp */
	movl	%esi,12(%eax)			/* save esi */
	movl	%edi,16(%eax)			/* save edi */
	movl	(%esp),%edx			/* get rta */
	movl	%edx,20(%eax)			/* save eip */
	xorl	%eax,%eax			/* return(0); */
	ret

ENTRY(longjmp)
	movl	4(%esp),%eax
	movl	(%eax),%ebx			/* restore ebx */
	movl	4(%eax),%esp			/* restore esp */
	movl	8(%eax),%ebp			/* restore ebp */
	movl	12(%eax),%esi			/* restore esi */
	movl	16(%eax),%edi			/* restore edi */
	movl	20(%eax),%edx			/* get rta */
	movl	%edx,(%esp)			/* put in return frame */
	xorl	%eax,%eax			/* return(1); */
	incl	%eax
	ret

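/*
 * Typical in-kernel usage of the setjmp/longjmp pair above, sketched
 * (hypothetical example; the jmp_buf is the six-slot area filled in by
 * setjmp: %ebx, %esp, %ebp, %esi, %edi and the return %eip):
 *
 *	static jmp_buf env;
 *
 *	if (setjmp(env) == 0) {
 *		// direct return: registers saved, do the risky work
 *		risky_operation();		// hypothetical helper
 *	} else {
 *		// longjmp(env) landed here; setjmp "returned" 1
 *	}
 */
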
/*
 * Support for BB-profiling (gcc -a).  The kernbb program will extract
 * the data from the kernel.
 */

	.data
	ALIGN_DATA
	.globl bbhead
bbhead:
	.long 0

	.text
NON_GPROF_ENTRY(__bb_init_func)
	movl	4(%esp),%eax
	movl	$1,(%eax)
	movl	bbhead,%edx
	movl	%edx,16(%eax)
	movl	%eax,bbhead
	NON_GPROF_RET
1535