/* support.s revision 19678 */
1/*-
2 * Copyright (c) 1993 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	$Id: support.s,v 1.43 1996/11/11 20:38:53 bde Exp $
34 */
35
36#include "opt_cpu.h"
37
38#include <machine/asmacros.h>
39#include <machine/cputypes.h>
40#include <machine/specialreg.h>
41
42#include "assym.s"
43
44#define KDSEL		0x10			/* kernel data selector */
45#define IDXSHIFT	10
46
	.data
/*
 * Indirect-dispatch vectors for the bcopy/bzero/copyin/copyout family.
 * Each slot initially points at the generic (portable) routine; CPU
 * identification code elsewhere may repoint a slot at a CPU-optimized
 * implementation (e.g. the i586_* routines below).
 */
	.globl	_bcopy_vector
_bcopy_vector:
	.long	_generic_bcopy
	.globl	_bzero
_bzero:
	.long	_generic_bzero
	.globl	_copyin_vector
_copyin_vector:
	.long	_generic_copyin
	.globl	_copyout_vector
_copyout_vector:
	.long	_generic_copyout
	.globl	_ovbcopy_vector
_ovbcopy_vector:
	.long	_generic_bcopy
/*
 * Lock byte protecting kernel use of the FPU registers.  Acquired with
 * `sarb $1,kernel_fpu_lock': unlocked value 0xfe shifts to 0xff with
 * CF=0 (lock taken); locked value 0xff shifts to 0xff with CF=1 (busy).
 * Released by storing 0xfe again.
 */
kernel_fpu_lock:
	.byte	0xfe
	.space	3
67	.text
68
69/*
70 * bcopy family
71 * void bzero(void *buf, u_int len)
72 */
73
/*
 * void generic_bzero(void *buf, u_int len)
 *
 * Portable bzero: clear the bulk with `rep stosl' (len/4 longwords),
 * then the remaining 0-3 bytes with `rep stosb'.
 */
ENTRY(generic_bzero)
	pushl	%edi
	movl	8(%esp),%edi			/* buf (offsets shifted by push) */
	movl	12(%esp),%ecx			/* len */
	xorl	%eax,%eax			/* store zeroes */
	shrl	$2,%ecx				/* longword count */
	cld
	rep
	stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx				/* residual byte count */
	rep
	stosb
	popl	%edi
	ret
89
#if defined(I486_CPU)
/*
 * void i486_bzero(void *buf, u_int len)
 *
 * 486-tuned bzero: unrolled plain `movl' stores in chunks of 64, 16
 * and 4 bytes, finishing the last 0-3 bytes through a jump table.
 * Register roles: %edx = current store address, %ecx = bytes left,
 * %eax = 0 (the store value).
 */
ENTRY(i486_bzero)
	movl	4(%esp),%edx			/* buf */
	movl	8(%esp),%ecx			/* len */
	xorl	%eax,%eax
/*
 * do 64 byte chunks first
 *
 * XXX this is probably over-unrolled at least for DX2's
 */
2:
	cmpl	$64,%ecx
	jb	3f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	movl	%eax,16(%edx)
	movl	%eax,20(%edx)
	movl	%eax,24(%edx)
	movl	%eax,28(%edx)
	movl	%eax,32(%edx)
	movl	%eax,36(%edx)
	movl	%eax,40(%edx)
	movl	%eax,44(%edx)
	movl	%eax,48(%edx)
	movl	%eax,52(%edx)
	movl	%eax,56(%edx)
	movl	%eax,60(%edx)
	addl	$64,%edx
	subl	$64,%ecx
	jnz	2b				/* more left; recheck chunk size */
	ret					/* count hit exactly zero */

/*
 * do 16 byte chunks
 */
	SUPERALIGN_TEXT
3:
	cmpl	$16,%ecx
	jb	4f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	addl	$16,%edx
	subl	$16,%ecx
	jnz	3b
	ret

/*
 * do 4 byte chunks
 */
	SUPERALIGN_TEXT
4:
	cmpl	$4,%ecx
	jb	5f
	movl	%eax,(%edx)
	addl	$4,%edx
	subl	$4,%ecx
	jnz	4b
	ret

/*
 * do 1 byte chunks
 * a jump table seems to be faster than a loop or more range reductions
 *
 * XXX need a const section for non-text
 */
	.data
jtab:					/* dispatch on remaining count 0-3 */
	.long	do0
	.long	do1
	.long	do2
	.long	do3

	.text
	SUPERALIGN_TEXT
5:
	jmp	jtab(,%ecx,4)		/* %ecx is 0-3 here */

	SUPERALIGN_TEXT
do3:
	movw	%ax,(%edx)
	movb	%al,2(%edx)
	ret

	SUPERALIGN_TEXT
do2:
	movw	%ax,(%edx)
	ret

	SUPERALIGN_TEXT
do1:
	movb	%al,(%edx)
	ret

	SUPERALIGN_TEXT
do0:
	ret
#endif
191
#ifdef I586_CPU
/*
 * void i586_bzero(void *buf, u_int len)
 *
 * Pentium-tuned bzero: for large counts, zero 8 bytes at a time with
 * `fstl' of an FP zero; for small counts, fall back to `rep stos'.
 * %edx = buf, %ecx = len.  %ax holds the saved machine status word
 * (smsw) so CR0_TS can be restored with lmsw on exit.
 */
ENTRY(i586_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx

	/*
	 * The FPU register method is twice as fast as the integer register
	 * method unless the target is in the L1 cache and we pre-allocate a
	 * cache line for it (then the integer register method is 4-5 times
	 * faster).  However, we never pre-allocate cache lines, since that
	 * would make the integer method 25% or more slower for the common
	 * case when the target isn't in either the L1 cache or the L2 cache.
	 * Thus we normally use the FPU register method unless the overhead
	 * would be too large.
	 */
	cmpl	$256,%ecx	/* empirical; clts, fninit, smsw cost a lot */
	jb	intreg_i586_bzero

	/*
	 * The FPU registers may belong to an application or to fastmove()
	 * or to another invocation of bcopy() or ourself in a higher level
	 * interrupt or trap handler.  Preserving the registers is
	 * complicated since we avoid it if possible at all levels.  We
	 * want to localize the complications even when that increases them.
	 * Here the extra work involves preserving CR0_TS in TS.
	 * `npxproc != NULL' is supposed to be the condition that all the
	 * FPU resources belong to an application, but npxproc and CR0_TS
	 * aren't set atomically enough for this condition to work in
	 * interrupt handlers.
	 *
	 * Case 1: FPU registers belong to the application: we must preserve
	 * the registers if we use them, so we only use the FPU register
	 * method if the target size is large enough to amortize the extra
	 * overhead for preserving them.  CR0_TS must be preserved although
	 * it is very likely to end up as set.
	 *
	 * Case 2: FPU registers belong to fastmove(): fastmove() currently
	 * makes the registers look like they belong to an application so
	 * that cpu_switch() and savectx() don't have to know about it, so
	 * this case reduces to case 1.
	 *
	 * Case 3: FPU registers belong to the kernel: don't use the FPU
	 * register method.  This case is unlikely, and supporting it would
	 * be more complicated and might take too much stack.
	 *
	 * Case 4: FPU registers don't belong to anyone: the FPU registers
	 * don't need to be preserved, so we always use the FPU register
	 * method.  CR0_TS must be preserved although it is very likely to
	 * always end up as clear.
	 */
	cmpl	$0,_npxproc
	je	i586_bz1
	cmpl	$256+184,%ecx		/* empirical; not quite 2*108 more */
	jb	intreg_i586_bzero
	sarb	$1,kernel_fpu_lock	/* try-lock; CF set => already held */
	jc	intreg_i586_bzero
	smsw	%ax			/* save MSW so CR0_TS can be restored */
	clts
	subl	$108,%esp		/* room for an 80x87 fnsave image */
	fnsave	0(%esp)
	jmp	i586_bz2

i586_bz1:
	sarb	$1,kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	fninit				/* XXX should avoid needing this */
i586_bz2:
	fldz

	/*
	 * Align to an 8 byte boundary (misalignment in the main loop would
	 * cost a factor of >= 2).  Avoid jumps (at little cost if it is
	 * already aligned) by always zeroing 8 bytes and using the part up
	 * to the _next_ alignment position.
	 */
	fstl	0(%edx)
	addl	%edx,%ecx		/* part of %ecx -= new_%edx - %edx */
	addl	$8,%edx
	andl	$~7,%edx
	subl	%edx,%ecx

	/*
	 * Similarly align `len' to a multiple of 8.
	 */
	fstl	-8(%edx,%ecx)
	decl	%ecx
	andl	$~7,%ecx

	/*
	 * This wouldn't be any faster if it were unrolled, since the loop
	 * control instructions are much faster than the fstl and/or done
	 * in parallel with it so their overhead is insignificant.
	 */
fpureg_i586_bzero_loop:
	fstl	0(%edx)
	addl	$8,%edx
	subl	$8,%ecx
	cmpl	$8,%ecx
	jae	fpureg_i586_bzero_loop

	cmpl	$0,_npxproc		/* had to save app state earlier? */
	je	i586_bz3
	frstor	0(%esp)			/* restore it and drop the save area */
	addl	$108,%esp
	lmsw	%ax			/* restore CR0_TS */
	movb	$0xfe,kernel_fpu_lock	/* unlock */
	ret

i586_bz3:
	fstpl	%st(0)			/* pop our zero off the FP stack */
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock
	ret

intreg_i586_bzero:
	/*
	 * `rep stos' seems to be the best method in practice for small
	 * counts.  Fancy methods usually take too long to start up due
	 * to cache and BTB misses.
	 */
	pushl	%edi
	movl	%edx,%edi
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx		/* reload len (args moved by push) */
	andl	$3,%ecx
	jne	1f
	popl	%edi
	ret

1:
	rep
	stosb
	popl	%edi
	ret
#endif /* I586_CPU */
333
/*
 * fillw(pat, base, cnt)
 *
 * Store `cnt' copies of the 16-bit pattern `pat' starting at `base'
 * (used e.g. for word-sized fills).  Plain `rep stosw'.
 */
ENTRY(fillw)
	pushl	%edi
	movl	8(%esp),%eax			/* pat (low 16 bits used) */
	movl	12(%esp),%edi			/* base */
	movl	16(%esp),%ecx			/* cnt (words) */
	cld
	rep
	stosw
	popl	%edi
	ret
345
/*
 * bcopyb(src, dst, cnt)
 *
 * Byte-at-a-time copy that handles overlap: if the regions overlap
 * with src < dst, copy backwards (std) so the source isn't clobbered
 * before it is read; otherwise copy forwards.
 */
ENTRY(bcopyb)
bcopyb:
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx
	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f
	cld					/* nope, copy forwards */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards. */
	addl	%ecx,%esi
	decl	%edi				/* point at last byte of each */
	decl	%esi
	std
	rep
	movsb
	popl	%edi
	popl	%esi
	cld					/* leave direction flag clear */
	ret
377
/*
 * bcopy(src, dst, cnt) and ovbcopy(src, dst, cnt):
 * tail-jump through the dispatch vectors set up in .data above, so the
 * CPU-specific implementation chosen at boot is used.
 */
ENTRY(bcopy)
	MEXITCOUNT
	jmp	*_bcopy_vector

ENTRY(ovbcopy)
	MEXITCOUNT
	jmp	*_ovbcopy_vector
385
/*
 * generic_bcopy(src, dst, cnt)
 *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
 *
 * Overlap-safe copy: forwards longword-wise when safe; when the
 * regions overlap with src < dst, copy backwards -- the 0-3 odd bytes
 * first, then the longwords.
 */
ENTRY(generic_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* nope, copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx				/* any fractional bytes? */
	std
	rep
	movsb
	movl	20(%esp),%ecx			/* copy remainder by 32-bit words */
	shrl	$2,%ecx
	subl	$3,%esi				/* movsl needs ptrs at word start */
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret
434
#ifdef I586_CPU
/*
 * i586_bcopy(src, dst, cnt)
 *
 * Pentium-tuned bcopy: for large (>= 1024 byte) non-overlapping
 * copies, move 64 bytes per iteration through the FPU registers
 * (fildq/fistpq), optionally priming the L1 data cache by touching
 * the source first.  Smaller or overlapping copies use the same
 * `rep movs' paths as generic_bcopy.  %dx holds the saved machine
 * status word (smsw) so CR0_TS can be restored via lmsw.
 */
ENTRY(i586_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	cmpl	$1024,%ecx			/* FPU path worthwhile? */
	jb	small_i586_bcopy

	sarb	$1,kernel_fpu_lock		/* try-lock; CF set => busy */
	jc	small_i586_bcopy
	cmpl	$0,_npxproc
	je	i586_bc1
	smsw	%dx				/* save MSW (CR0_TS) */
	clts
	subl	$108,%esp			/* room for fnsave image */
	fnsave	0(%esp)
	jmp	4f

i586_bc1:
	smsw	%dx
	clts
	fninit				/* XXX should avoid needing this */

	ALIGN_TEXT
4:
	/* outer loop: process up to ~half the D-cache per pass */
	pushl	%ecx
#define	DCACHE_SIZE	8192
	cmpl	$(DCACHE_SIZE-512)/2,%ecx
	jbe	2f
	movl	$(DCACHE_SIZE-512)/2,%ecx
2:
	subl	%ecx,0(%esp)			/* saved total -= this pass */
	cmpl	$256,%ecx
	jb	5f			/* XXX should prefetch if %ecx >= 32 */
	pushl	%esi
	pushl	%ecx
	ALIGN_TEXT
3:
	/* touch one longword per 32-byte line to pull src into cache */
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	popl	%ecx
	popl	%esi
5:
	ALIGN_TEXT
large_i586_bcopy_loop:
	/* 64 bytes per iteration via eight 8-byte FPU loads/stores */
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$64,%esi
	addl	$64,%edi
	subl	$64,%ecx
	cmpl	$64,%ecx
	jae	large_i586_bcopy_loop
	popl	%eax				/* remaining total count */
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b				/* another cache-sized pass */

	cmpl	$0,_npxproc			/* restore app FPU state? */
	je	i586_bc2
	frstor	0(%esp)
	addl	$108,%esp
i586_bc2:
	lmsw	%dx				/* restore CR0_TS */
	movb	$0xfe,kernel_fpu_lock		/* unlock */

/*
 * This is a duplicate of the main part of generic_bcopy.  See the comments
 * there.  Jumping into generic_bcopy would cost a whole 0-1 cycles and
 * would mess up high resolution profiling.
 */
	ALIGN_TEXT
small_i586_bcopy:
	shrl	$2,%ecx
	cld
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* overlapping: copy backwards */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx
	std
	rep
	movsb
	movl	20(%esp),%ecx
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret
#endif /* I586_CPU */
572
/*
 * memcpy(dst, src, cnt)
 *
 * Note: memcpy does not support overlapping copies.
 * Returns dst in %eax, per the C library convention.
 * Argument order is (dst, src) -- the reverse of bcopy.
 */
ENTRY(memcpy)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi			/* dst */
	movl	16(%esp),%esi			/* src */
	movl	20(%esp),%ecx			/* cnt */
	movl	%edi,%eax			/* return value = dst */
	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* always copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%esi
	popl	%edi
	ret
594
595
596/*****************************************************************************/
597/* copyout and fubyte family                                                 */
598/*****************************************************************************/
599/*
600 * Access user memory from inside the kernel. These routines and possibly
601 * the math- and DOS emulators should be the only places that do this.
602 *
603 * We have to access the memory with user's permissions, so use a segment
604 * selector with RPL 3. For writes to user space we have to additionally
605 * check the PTE for write permission, because the 386 does not check
606 * write permissions when we are executing with EPL 0. The 486 does check
607 * this if the WP bit is set in CR0, so we can use a simpler version here.
608 *
609 * These routines set curpcb->onfault for the time they execute. When a
610 * protection violation occurs inside the functions, the trap handler
611 * returns to *curpcb->onfault instead of the function.
612 */
613
/* copyout(from_kernel, to_user, len) */
/* Dispatch through the boot-time-selected implementation. */
ENTRY(copyout)
	MEXITCOUNT
	jmp	*_copyout_vector
618
/*
 * generic_copyout(from_kernel, to_user, len)
 *
 * Copy len bytes from kernel space to user space.  Returns 0 on
 * success or EFAULT on a bad user address.  curpcb->pcb_onfault is
 * pointed at copyout_fault so the trap handler can unwind a fault
 * taken during the copy.  On a 386 (no CR0_WP) every destination PTE
 * is checked for user-writability by hand; on 486+ the hardware
 * write-protect check suffices.
 */
ENTRY(generic_copyout)
	movl	_curpcb,%eax
	movl	$copyout_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%esp),%esi			/* from (3 pushes shift args) */
	movl	20(%esp),%edi			/* to */
	movl	24(%esp),%ebx			/* len */
	testl	%ebx,%ebx			/* anything to do? */
	jz	done_copyout

	/*
	 * Check explicitly for non-user addresses.  If 486 write protection
	 * is being used, this check is essential because we are in kernel
	 * mode so the h/w does not provide any protection against writing
	 * kernel addresses.
	 */

	/*
	 * First, prevent address wrapping.
	 */
	movl	%edi,%eax
	addl	%ebx,%eax
	jc	copyout_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
	cmpl	$VM_MAXUSER_ADDRESS,%eax
	ja	copyout_fault

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	3f
#endif
/*
 * We have to check each PTE for user write permission.
 * The checking may cause a page fault, so it is important to set
 * up everything for return via copyout_fault before here.
 */
	/* compute number of pages */
	movl	%edi,%ecx
	andl	$PAGE_MASK,%ecx
	addl	%ebx,%ecx
	decl	%ecx
	shrl	$IDXSHIFT+2,%ecx
	incl	%ecx

	/* compute PTE offset for start address */
	movl	%edi,%edx
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl			/* 4-byte-align PTE index */

1:	/* check PTE for each page */
	movb	_PTmap(%edx),%al
	andb	$0x07,%al			/* Pages must be VALID + USERACC + WRITABLE */
	cmpb	$0x07,%al
	je	2f

	/* simulate a trap */
	pushl	%edx
	pushl	%ecx
	shll	$IDXSHIFT,%edx			/* rebuild faulting address */
	pushl	%edx
	call	_trapwrite			/* trapwrite(addr) */
	popl	%edx
	popl	%ecx
	popl	%edx

	testl	%eax,%eax			/* if not ok, return EFAULT */
	jnz	copyout_fault

2:
	addl	$4,%edx
	decl	%ecx
	jnz	1b				/* check next page */
#endif /* I386_CPU */

	/* bcopy(%esi, %edi, %ebx) */
3:
	movl	%ebx,%ecx

#ifdef I586_CPU
	ALIGN_TEXT
slow_copyout:				/* shared tail for i586_copyout */
#endif
	shrl	$2,%ecx
	cld
	rep
	movsl
	movb	%bl,%cl
	andb	$3,%cl
	rep
	movsb

done_copyout:
	popl	%ebx
	popl	%edi
	popl	%esi
	xorl	%eax,%eax			/* success */
	movl	_curpcb,%edx
	movl	%eax,PCB_ONFAULT(%edx)		/* clear fault handler */
	ret

	ALIGN_TEXT
copyout_fault:
	/* trap handler lands here on a fault during the copy */
	popl	%ebx
	popl	%edi
	popl	%esi
	movl	_curpcb,%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret
737
#ifdef I586_CPU
/*
 * i586_copyout(from_kernel, to_user, len)
 *
 * Pentium copyout: same validation as generic_copyout, then large
 * (>= 1024 byte) copies go through fastmove() (FPU-register copy);
 * small copies jump to generic_copyout's slow_copyout tail.
 */
ENTRY(i586_copyout)
	/*
	 * Duplicated from generic_copyout.  Could be done a bit better.
	 */
	movl	_curpcb,%eax
	movl	$copyout_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ebx
	testl	%ebx,%ebx			/* anything to do? */
	jz	done_copyout

	/*
	 * Check explicitly for non-user addresses.  If 486 write protection
	 * is being used, this check is essential because we are in kernel
	 * mode so the h/w does not provide any protection against writing
	 * kernel addresses.
	 */

	/*
	 * First, prevent address wrapping.
	 */
	movl	%edi,%eax
	addl	%ebx,%eax
	jc	copyout_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
	cmpl	$VM_MAXUSER_ADDRESS,%eax
	ja	copyout_fault

	/* bcopy(%esi, %edi, %ebx) */
3:
	movl	%ebx,%ecx
	/*
	 * End of duplicated code.
	 */

	cmpl	$1024,%ecx
	jb	slow_copyout

	pushl	%ecx
	call	_fastmove
	addl	$4,%esp
	jmp	done_copyout
#endif /* I586_CPU */
791
/* copyin(from_user, to_kernel, len) */
/* Dispatch through the boot-time-selected implementation. */
ENTRY(copyin)
	MEXITCOUNT
	jmp	*_copyin_vector
796
/*
 * generic_copyin(from_user, to_kernel, len)
 *
 * Copy len bytes from user space to kernel space.  Returns 0 on
 * success or EFAULT on a bad user address.  curpcb->pcb_onfault is
 * pointed at copyin_fault so a fault during the copy unwinds cleanly.
 * Reading needs no PTE check -- only the address-range check.
 */
ENTRY(generic_copyin)
	movl	_curpcb,%eax
	movl	$copyin_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi			/* caddr_t from */
	movl	16(%esp),%edi			/* caddr_t to */
	movl	20(%esp),%ecx			/* size_t  len */

	/*
	 * make sure address is valid
	 */
	movl	%esi,%edx
	addl	%ecx,%edx
	jc	copyin_fault			/* address wrap */
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	copyin_fault

#ifdef I586_CPU
	ALIGN_TEXT
slow_copyin:				/* shared tail for i586_copyin */
#endif
	movb	%cl,%al				/* stash low bits of len */
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

#if defined(I586_CPU)
	ALIGN_TEXT
done_copyin:
#endif /* I586_CPU */
	popl	%edi
	popl	%esi
	xorl	%eax,%eax			/* success */
	movl	_curpcb,%edx
	movl	%eax,PCB_ONFAULT(%edx)		/* clear fault handler */
	ret

	ALIGN_TEXT
copyin_fault:
	/* trap handler lands here on a fault during the copy */
	popl	%edi
	popl	%esi
	movl	_curpcb,%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret
848
#ifdef I586_CPU
/*
 * i586_copyin(from_user, to_kernel, len)
 *
 * Pentium copyin: same validation as generic_copyin, then large
 * (>= 1024 byte) copies go through fastmove(); small copies jump to
 * generic_copyin's slow_copyin tail.
 */
ENTRY(i586_copyin)
	/*
	 * Duplicated from generic_copyin.  Could be done a bit better.
	 */
	movl	_curpcb,%eax
	movl	$copyin_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi			/* caddr_t from */
	movl	16(%esp),%edi			/* caddr_t to */
	movl	20(%esp),%ecx			/* size_t  len */

	/*
	 * make sure address is valid
	 */
	movl	%esi,%edx
	addl	%ecx,%edx
	jc	copyin_fault
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	copyin_fault
	/*
	 * End of duplicated code.
	 */

	cmpl	$1024,%ecx
	jb	slow_copyin

	pushl	%ecx
	call	_fastmove
	addl	$4,%esp
	jmp	done_copyin
#endif /* I586_CPU */
882
#if defined(I586_CPU)
/* fastmove(src, dst, len)
	src in %esi
	dst in %edi
	len in %ecx		XXX changed to on stack for profiling
	uses %eax and %edx for tmp. storage

   Pentium FPU-register block copy used by i586_copyin/i586_copyout.
   Only engages for len >= 64 with both pointers 8-byte aligned;
   otherwise (and for the final partial chunk) it falls through to
   fastmove_tail, a plain `rep movs' copy.  It borrows the FPU by
   saving any application state into curpcb->pcb_savefpu and stashing
   a copy on its own stack frame, then restores everything on exit.
 */
/* XXX use ENTRY() to get profiling.  fastmove() is actually a non-entry. */
ENTRY(fastmove)
	movl	4(%esp),%ecx
	cmpl	$63,%ecx
	jbe	fastmove_tail

	testl	$7,%esi	/* check if src addr is multiple of 8 */
	jnz	fastmove_tail

	testl	$7,%edi	/* check if dst addr is multiple of 8 */
	jnz	fastmove_tail

	pushl	%ebp
	movl	%esp,%ebp
	subl	$PCB_SAVEFPU_SIZE,%esp		/* frame for FPU state copy */

/* if (npxproc != NULL) { */
	cmpl	$0,_npxproc
	je	6f
/*    fnsave(&curpcb->pcb_savefpu); */
	movl	_curpcb,%eax
	fnsave	PCB_SAVEFPU(%eax)
/*   npxproc = NULL; */
	movl	$0,_npxproc
/* } */
6:
/* now we own the FPU. */

/*
 * The process' FP state is saved in the pcb, but if we get
 * switched, the cpu_switch() will store our FP state in the
 * pcb.  It should be possible to avoid all the copying for
 * this, e.g., by setting a flag to tell cpu_switch() to
 * save the state somewhere else.
 */
/* tmp = curpcb->pcb_savefpu; */
	pushl	%edi
	pushl	%esi
	pushl	%ecx
	leal	-PCB_SAVEFPU_SIZE(%ebp),%edi
	movl	_curpcb,%esi
	addl	$PCB_SAVEFPU,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	popl	%ecx
	popl	%esi
	popl	%edi
/* stop_emulating(); */
	clts
/* npxproc = curproc; */
	movl	_curproc,%eax
	movl	%eax,_npxproc
4:
	/* outer loop: up to 1792 bytes per pass */
	pushl	%ecx
	cmpl	$1792,%ecx
	jbe	2f
	movl	$1792,%ecx
2:
	subl	%ecx,0(%esp)			/* saved total -= this pass */
	cmpl	$256,%ecx
	jb	5f
	pushl	%esi
	pushl	%ecx
	ALIGN_TEXT
3:
	/* touch one longword per 32-byte line to pull src into cache */
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	popl	%ecx
	popl	%esi
5:
	ALIGN_TEXT
fastmove_loop:
	/* 64 bytes per iteration via eight 8-byte FPU loads/stores */
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$-64,%ecx
	addl	$64,%esi
	addl	$64,%edi
	cmpl	$63,%ecx
	ja	fastmove_loop
	popl	%eax				/* remaining total count */
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b				/* another pass */

/* curpcb->pcb_savefpu = tmp; */
	pushl	%edi
	pushl	%esi
	pushl	%ecx
	movl	_curpcb,%edi
	addl	$PCB_SAVEFPU,%edi
	leal	-PCB_SAVEFPU_SIZE(%ebp),%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	popl	%ecx
	popl	%esi
	popl	%edi

/* start_emulating(); */
	smsw	%ax
	orb	$CR0_TS,%al
	lmsw	%ax
/* npxproc = NULL; */
	movl	$0,_npxproc
	movl	%ebp,%esp
	popl	%ebp
	/* fall through to copy the final %ecx (< 64) bytes */

	ALIGN_TEXT
fastmove_tail:
	movb	%cl,%al				/* stash low bits of len */
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

	ret
#endif /* I586_CPU */
1038
/*
 * fu{byte,sword,word} : fetch a byte (sword, word) from user memory
 */
/*
 * fuword(addr): fetch a 32-bit word from user address `addr'.
 * Returns the word, or -1 (via fusufault) on a bad address.
 */
ENTRY(fuword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)	/* arm fault handler */
	movl	4(%esp),%edx			/* from */

	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address is valid */
	ja	fusufault

	movl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)		/* disarm */
	ret
1053
/*
 * These two routines are called from the profiling code, potentially
 * at interrupt time. If they fail, that's okay, good things will
 * happen later. Fail all the time for now - until the trap code is
 * able to deal with this.
 */
ALTENTRY(suswintr)
ENTRY(fuswintr)
	movl	$-1,%eax		/* unconditional failure, see above */
	ret
1064
/*
 * fusword(addr): fetch a 16-bit word from user address `addr',
 * zero-extended.  Returns -1 (via fusufault) on a bad address.
 */
ENTRY(fusword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-2,%edx
	ja	fusufault

	movzwl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret
1076
/*
 * fubyte(addr): fetch a byte from user address `addr', zero-extended.
 * Returns -1 (via fusufault) on a bad address.
 */
ENTRY(fubyte)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-1,%edx
	ja	fusufault

	movzbl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

	ALIGN_TEXT
fusufault:
	/* common fault exit for fu*/su* routines: clear onfault, return -1 */
	movl	_curpcb,%ecx
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)
	decl	%eax				/* %eax = -1 */
	ret
1096
/*
 * su{byte,sword,word}: write a byte (word, longword) to user memory
 */
/*
 * suword(addr, value): store a 32-bit word at user address `addr'.
 * Returns 0 on success, -1 (via fusufault) on failure.  On a 386
 * (no h/w write protection from kernel mode) the PTE is checked for
 * user-writability first, calling trapwrite() to simulate the fault.
 */
ENTRY(suword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	2f				/* we only have to set the right segment selector */
#endif /* I486_CPU || I586_CPU || I686_CPU */

	/* XXX - page boundary crossing is still not handled */
	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl
	movb	_PTmap(%edx),%dl
	andb	$0x7,%dl			/* must be VALID + USERACC + WRITE */
	cmpb	$0x7,%dl
	je	1f

	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	movl	_curpcb,%ecx			/* restore trashed register */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

2:
	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address validity */
	ja	fusufault

	movl	8(%esp),%eax			/* value */
	movl	%eax,(%edx)
	xorl	%eax,%eax			/* success */
	movl	%eax,PCB_ONFAULT(%ecx)
	ret
1141
/*
 * susword(addr, value): store a 16-bit word at user address `addr'.
 * Returns 0 on success, -1 (via fusufault) on failure.  Same 386
 * PTE-check dance as suword.
 */
ENTRY(susword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	2f
#endif /* I486_CPU || I586_CPU || I686_CPU */

	/* XXX - page boundary crossing is still not handled */
	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl
	movb	_PTmap(%edx),%dl
	andb	$0x7,%dl			/* must be VALID + USERACC + WRITE */
	cmpb	$0x7,%dl
	je	1f

	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	movl	_curpcb,%ecx			/* restore trashed register */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

2:
	cmpl	$VM_MAXUSER_ADDRESS-2,%edx	/* verify address validity */
	ja	fusufault

	movw	8(%esp),%ax			/* value */
	movw	%ax,(%edx)
	xorl	%eax,%eax			/* success */
	movl	%eax,PCB_ONFAULT(%ecx)
	ret
1183
/*
 * subyte(addr, value) / suibyte(addr, value): store a byte at user
 * address `addr'.  Returns 0 on success, -1 (via fusufault) on
 * failure.  Same 386 PTE-check dance as suword.
 */
ALTENTRY(suibyte)
ENTRY(subyte)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	2f
#endif /* I486_CPU || I586_CPU || I686_CPU */

	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl
	movb	_PTmap(%edx),%dl
	andb	$0x7,%dl			/* must be VALID + USERACC + WRITE */
	cmpb	$0x7,%dl
	je	1f

	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	movl	_curpcb,%ecx			/* restore trashed register */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

2:
	cmpl	$VM_MAXUSER_ADDRESS-1,%edx	/* verify address validity */
	ja	fusufault

	movb	8(%esp),%al			/* value */
	movb	%al,(%edx)
	xorl	%eax,%eax			/* success */
	movl	%eax,PCB_ONFAULT(%ecx)
	ret
1225
/*
 * copyinstr(from, to, maxlen, int *lencopied)
 *	copy a string from from to to, stop when a 0 character is reached.
 *	return ENAMETOOLONG if string is longer than maxlen, and
 *	EFAULT on protection violations. If lencopied is non-zero,
 *	return the actual length in *lencopied.
 *
 * Register roles: %esi = from, %edi = to, %edx = bytes remaining + 1.
 * Faults during the copy unwind through cpystrflt via pcb_onfault.
 */
ENTRY(copyinstr)
	pushl	%esi
	pushl	%edi
	movl	_curpcb,%ecx
	movl	$cpystrflt,PCB_ONFAULT(%ecx)	/* arm fault handler */

	movl	12(%esp),%esi			/* %esi = from */
	movl	16(%esp),%edi			/* %edi = to */
	movl	20(%esp),%edx			/* %edx = maxlen */

	movl	$VM_MAXUSER_ADDRESS,%eax

	/* make sure 'from' is within bounds */
	subl	%esi,%eax
	jbe	cpystrflt

	/* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */
	cmpl	%edx,%eax
	jae	1f
	movl	%eax,%edx
	movl	%eax,20(%esp)			/* clamp saved maxlen too */
1:
	incl	%edx				/* pre-bias for the decl below */
	cld

2:
	decl	%edx
	jz	3f				/* maxlen exhausted */

	lodsb
	stosb
	orb	%al,%al				/* NUL terminator? */
	jnz	2b

	/* Success -- 0 byte reached */
	decl	%edx
	xorl	%eax,%eax			/* return 0 */
	jmp	cpystrflt_x
3:
	/* edx is zero - return ENAMETOOLONG or EFAULT */
	cmpl	$VM_MAXUSER_ADDRESS,%esi	/* ran off end of user space? */
	jae	cpystrflt
4:
	movl	$ENAMETOOLONG,%eax
	jmp	cpystrflt_x

cpystrflt:
	movl	$EFAULT,%eax

cpystrflt_x:
	/* set *lencopied and return %eax */
	movl	_curpcb,%ecx
	movl	$0,PCB_ONFAULT(%ecx)		/* disarm fault handler */
	movl	20(%esp),%ecx
	subl	%edx,%ecx			/* bytes actually copied */
	movl	24(%esp),%edx			/* lencopied pointer */
	testl	%edx,%edx
	jz	1f
	movl	%ecx,(%edx)
1:
	popl	%edi
	popl	%esi
	ret
1296
1297
/*
 * copystr(from, to, maxlen, int *lencopied)
 *
 * Kernel-to-kernel string copy: like copyinstr but with no user
 * address validation and no fault handler.  Returns 0 on success or
 * ENAMETOOLONG if no NUL was found within maxlen bytes.
 */
ENTRY(copystr)
	pushl	%esi
	pushl	%edi

	movl	12(%esp),%esi			/* %esi = from */
	movl	16(%esp),%edi			/* %edi = to */
	movl	20(%esp),%edx			/* %edx = maxlen */
	incl	%edx				/* pre-bias for the decl below */
	cld
1:
	decl	%edx
	jz	4f				/* maxlen exhausted */
	lodsb
	stosb
	orb	%al,%al				/* NUL terminator? */
	jnz	1b

	/* Success -- 0 byte reached */
	decl	%edx
	xorl	%eax,%eax			/* return 0 */
	jmp	6f
4:
	/* edx is zero -- return ENAMETOOLONG */
	movl	$ENAMETOOLONG,%eax

6:
	/* set *lencopied and return %eax */
	movl	20(%esp),%ecx
	subl	%edx,%ecx			/* bytes actually copied */
	movl	24(%esp),%edx			/* lencopied pointer */
	testl	%edx,%edx
	jz	7f
	movl	%ecx,(%edx)
7:
	popl	%edi
	popl	%esi
	ret
1338
/*
 * bcmp(b1, b2, len): compare two byte regions.
 * Returns 0 in %eax if they are identical, nonzero (1) otherwise.
 * Compares longword-wise, then the residual 0-3 bytes.
 */
ENTRY(bcmp)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi
	movl	16(%esp),%esi
	movl	20(%esp),%edx			/* len */
	xorl	%eax,%eax			/* assume equal */

	movl	%edx,%ecx
	shrl	$2,%ecx				/* longword count */
	cld					/* compare forwards */
	repe
	cmpsl
	jne	1f

	movl	%edx,%ecx
	andl	$3,%ecx				/* residual bytes */
	repe
	cmpsb
	je	2f
1:
	incl	%eax				/* differ: return 1 */
2:
	popl	%esi
	popl	%edi
	ret
1365
1366
/*
 * Handling of special 386 registers and descriptor tables etc
 */
/* void lgdt(struct region_descriptor *rdp); */
/*
 * Load a new GDT, then force every segment register (and %cs via a
 * far return) to be reloaded so no stale descriptors remain cached.
 */
ENTRY(lgdt)
	/* reload the descriptor table */
	movl	4(%esp),%eax
	lgdt	(%eax)

	/* flush the prefetch q */
	jmp	1f
	nop
1:
	/* reload "stale" selectors */
	movl	$KDSEL,%eax
	movl	%ax,%ds
	movl	%ax,%es
	movl	%ax,%ss

	/* reload code selector by turning return into intersegmental return */
	movl	(%esp),%eax
	pushl	%eax
#	movl	$KCSEL,4(%esp)
	movl	$8,4(%esp)		/* kernel code selector */
	lret
1392
/*
 * void lidt(struct region_descriptor *rdp);
 *
 * Load the interrupt descriptor table register.
 */
ENTRY(lidt)
	movl	4(%esp),%eax
	lidt	(%eax)
	ret

/*
 * void lldt(u_short sel)
 *
 * Load the local descriptor table register from the selector argument.
 */
ENTRY(lldt)
	lldt	4(%esp)
	ret

/*
 * void ltr(u_short sel)
 *
 * Load the task register from the selector argument.
 */
ENTRY(ltr)
	ltr	4(%esp)
	ret
1414
/* ssdtosd(*ssdp,*sdp) */
/*
 * Convert a machine-independent software segment descriptor (*ssdp)
 * into the hardware segment descriptor layout, storing the two
 * resulting longwords at *sdp.  Pure bit shuffling; see the i386
 * descriptor format for the field placement.
 */
ENTRY(ssdtosd)
	pushl	%ebx
	movl	8(%esp),%ecx			/* ssdp */
	movl	8(%ecx),%ebx
	shll	$16,%ebx
	movl	(%ecx),%edx
	roll	$16,%edx
	movb	%dh,%bl
	movb	%dl,%bh
	rorl	$8,%ebx
	movl	4(%ecx),%eax
	movw	%ax,%dx
	andl	$0xf0000,%eax
	orl	%eax,%ebx
	movl	12(%esp),%ecx			/* sdp */
	movl	%edx,(%ecx)			/* low longword */
	movl	%ebx,4(%ecx)			/* high longword */
	popl	%ebx
	ret
1435
/* load_cr0(cr0) */
/* Write the argument into control register 0. */
ENTRY(load_cr0)
	movl	4(%esp),%eax
	movl	%eax,%cr0
	ret

/* rcr0() */
/* Return the current value of control register 0. */
ENTRY(rcr0)
	movl	%cr0,%eax
	ret

/* rcr3() */
/* Return the current value of control register 3 (page dir base). */
ENTRY(rcr3)
	movl	%cr3,%eax
	ret

/* void load_cr3(caddr_t cr3) */
/* Write the argument into control register 3 (flushes the TLB). */
ENTRY(load_cr3)
	movl	4(%esp),%eax
	movl	%eax,%cr3
	ret
1457
1458
/*****************************************************************************/
/* setjump, longjump                                                         */
/*****************************************************************************/

/*
 * setjmp(buf): save callee-saved registers, %esp and the return
 * address into the 6-longword buffer; returns 0.
 */
ENTRY(setjmp)
	movl	4(%esp),%eax
	movl	%ebx,(%eax)			/* save ebx */
	movl	%esp,4(%eax)			/* save esp */
	movl	%ebp,8(%eax)			/* save ebp */
	movl	%esi,12(%eax)			/* save esi */
	movl	%edi,16(%eax)			/* save edi */
	movl	(%esp),%edx			/* get rta */
	movl	%edx,20(%eax)			/* save eip */
	xorl	%eax,%eax			/* return(0); */
	ret

/*
 * longjmp(buf): restore the state saved by setjmp and resume after
 * the original setjmp call, which now appears to return 1.
 */
ENTRY(longjmp)
	movl	4(%esp),%eax
	movl	(%eax),%ebx			/* restore ebx */
	movl	4(%eax),%esp			/* restore esp */
	movl	8(%eax),%ebp			/* restore ebp */
	movl	12(%eax),%esi			/* restore esi */
	movl	16(%eax),%edi			/* restore edi */
	movl	20(%eax),%edx			/* get rta */
	movl	%edx,(%esp)			/* put in return frame */
	xorl	%eax,%eax			/* return(1); */
	incl	%eax
	ret
1487
/*
 * Here for doing BB-profiling (gcc -a).
 * We rely on the "bbset" instead, but need a dummy function.
 * Sets *(int *)arg = 1 and returns.
 */
NON_GPROF_ENTRY(__bb_init_func)
	movl	4(%esp),%eax
	movl	$1,(%eax)
	.byte	0xc3				/* avoid macro for `ret' */
1496