/* support.s revision 20018 */
/*-
 * Copyright (c) 1993 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	$Id: support.s,v 1.44 1996/11/12 14:54:16 bde Exp $
 */

36#include "opt_cpu.h"
37
38#include <machine/asmacros.h>
39#include <machine/cputypes.h>
40#include <machine/specialreg.h>
41
42#include "assym.s"
43
44#define KDSEL		0x10			/* kernel data selector */
45#define IDXSHIFT	10
46
47	.data
48	.globl	_bcopy_vector
49_bcopy_vector:
50	.long	_generic_bcopy
51	.globl	_bzero
52_bzero:
53	.long	_generic_bzero
54	.globl	_copyin_vector
55_copyin_vector:
56	.long	_generic_copyin
57	.globl	_copyout_vector
58_copyout_vector:
59	.long	_generic_copyout
60	.globl	_ovbcopy_vector
61_ovbcopy_vector:
62	.long	_generic_bcopy
63kernel_fpu_lock:
64	.byte	0xfe
65	.space	3
66
67	.text
68
69/*
70 * bcopy family
71 * void bzero(void *buf, u_int len)
72 */
73
74ENTRY(generic_bzero)
75	pushl	%edi
76	movl	8(%esp),%edi
77	movl	12(%esp),%ecx
78	xorl	%eax,%eax
79	shrl	$2,%ecx
80	cld
81	rep
82	stosl
83	movl	12(%esp),%ecx
84	andl	$3,%ecx
85	rep
86	stosb
87	popl	%edi
88	ret
89
90#if defined(I486_CPU)
91ENTRY(i486_bzero)
92	movl	4(%esp),%edx
93	movl	8(%esp),%ecx
94	xorl	%eax,%eax
95/*
96 * do 64 byte chunks first
97 *
98 * XXX this is probably over-unrolled at least for DX2's
99 */
1002:
101	cmpl	$64,%ecx
102	jb	3f
103	movl	%eax,(%edx)
104	movl	%eax,4(%edx)
105	movl	%eax,8(%edx)
106	movl	%eax,12(%edx)
107	movl	%eax,16(%edx)
108	movl	%eax,20(%edx)
109	movl	%eax,24(%edx)
110	movl	%eax,28(%edx)
111	movl	%eax,32(%edx)
112	movl	%eax,36(%edx)
113	movl	%eax,40(%edx)
114	movl	%eax,44(%edx)
115	movl	%eax,48(%edx)
116	movl	%eax,52(%edx)
117	movl	%eax,56(%edx)
118	movl	%eax,60(%edx)
119	addl	$64,%edx
120	subl	$64,%ecx
121	jnz	2b
122	ret
123
124/*
125 * do 16 byte chunks
126 */
127	SUPERALIGN_TEXT
1283:
129	cmpl	$16,%ecx
130	jb	4f
131	movl	%eax,(%edx)
132	movl	%eax,4(%edx)
133	movl	%eax,8(%edx)
134	movl	%eax,12(%edx)
135	addl	$16,%edx
136	subl	$16,%ecx
137	jnz	3b
138	ret
139
140/*
141 * do 4 byte chunks
142 */
143	SUPERALIGN_TEXT
1444:
145	cmpl	$4,%ecx
146	jb	5f
147	movl	%eax,(%edx)
148	addl	$4,%edx
149	subl	$4,%ecx
150	jnz	4b
151	ret
152
153/*
154 * do 1 byte chunks
155 * a jump table seems to be faster than a loop or more range reductions
156 *
157 * XXX need a const section for non-text
158 */
159	.data
160jtab:
161	.long	do0
162	.long	do1
163	.long	do2
164	.long	do3
165
166	.text
167	SUPERALIGN_TEXT
1685:
169	jmp	jtab(,%ecx,4)
170
171	SUPERALIGN_TEXT
172do3:
173	movw	%ax,(%edx)
174	movb	%al,2(%edx)
175	ret
176
177	SUPERALIGN_TEXT
178do2:
179	movw	%ax,(%edx)
180	ret
181
182	SUPERALIGN_TEXT
183do1:
184	movb	%al,(%edx)
185	ret
186
187	SUPERALIGN_TEXT
188do0:
189	ret
190#endif
191
192#ifdef I586_CPU
193ENTRY(i586_bzero)
194	movl	4(%esp),%edx
195	movl	8(%esp),%ecx
196
197	/*
198	 * The FPU register method is twice as fast as the integer register
199	 * method unless the target is in the L1 cache and we pre-allocate a
200	 * cache line for it (then the integer register method is 4-5 times
201	 * faster).  However, we never pre-allocate cache lines, since that
202	 * would make the integer method 25% or more slower for the common
203	 * case when the target isn't in either the L1 cache or the L2 cache.
204	 * Thus we normally use the FPU register method unless the overhead
205	 * would be too large.
206	 */
207	cmpl	$256,%ecx	/* empirical; clts, fninit, smsw cost a lot */
208	jb	intreg_i586_bzero
209
210	/*
211	 * The FPU registers may belong to an application or to fastmove()
212	 * or to another invocation of bcopy() or ourself in a higher level
213	 * interrupt or trap handler.  Preserving the registers is
214	 * complicated since we avoid it if possible at all levels.  We
215	 * want to localize the complications even when that increases them.
216	 * Here the extra work involves preserving CR0_TS in TS.
217	 * `npxproc != NULL' is supposed to be the condition that all the
218	 * FPU resources belong to an application, but npxproc and CR0_TS
219	 * aren't set atomically enough for this condition to work in
220	 * interrupt handlers.
221	 *
222	 * Case 1: FPU registers belong to the application: we must preserve
223	 * the registers if we use them, so we only use the FPU register
224	 * method if the target size is large enough to amortize the extra
225	 * overhead for preserving them.  CR0_TS must be preserved although
226	 * it is very likely to end up as set.
227	 *
228	 * Case 2: FPU registers belong to fastmove(): fastmove() currently
229	 * makes the registers look like they belong to an application so
230	 * that cpu_switch() and savectx() don't have to know about it, so
231	 * this case reduces to case 1.
232	 *
233	 * Case 3: FPU registers belong to the kernel: don't use the FPU
234	 * register method.  This case is unlikely, and supporting it would
235	 * be more complicated and might take too much stack.
236	 *
237	 * Case 4: FPU registers don't belong to anyone: the FPU registers
238	 * don't need to be preserved, so we always use the FPU register
239	 * method.  CR0_TS must be preserved although it is very likely to
240	 * always end up as clear.
241	 */
242	cmpl	$0,_npxproc
243	je	i586_bz1
244	cmpl	$256+184,%ecx		/* empirical; not quite 2*108 more */
245	jb	intreg_i586_bzero
246	sarb	$1,kernel_fpu_lock
247	jc	intreg_i586_bzero
248	smsw	%ax
249	clts
250	subl	$108,%esp
251	fnsave	0(%esp)
252	jmp	i586_bz2
253
254i586_bz1:
255	sarb	$1,kernel_fpu_lock
256	jc	intreg_i586_bzero
257	smsw	%ax
258	clts
259	fninit				/* XXX should avoid needing this */
260i586_bz2:
261	fldz
262
263	/*
264	 * Align to an 8 byte boundary (misalignment in the main loop would
265	 * cost a factor of >= 2).  Avoid jumps (at little cost if it is
266	 * already aligned) by always zeroing 8 bytes and using the part up
267	 * to the _next_ alignment position.
268	 */
269	fstl	0(%edx)
270	addl	%edx,%ecx		/* part of %ecx -= new_%edx - %edx */
271	addl	$8,%edx
272	andl	$~7,%edx
273	subl	%edx,%ecx
274
275	/*
276	 * Similarly align `len' to a multiple of 8.
277	 */
278	fstl	-8(%edx,%ecx)
279	decl	%ecx
280	andl	$~7,%ecx
281
282	/*
283	 * This wouldn't be any faster if it were unrolled, since the loop
284	 * control instructions are much faster than the fstl and/or done
285	 * in parallel with it so their overhead is insignificant.
286	 */
287fpureg_i586_bzero_loop:
288	fstl	0(%edx)
289	addl	$8,%edx
290	subl	$8,%ecx
291	cmpl	$8,%ecx
292	jae	fpureg_i586_bzero_loop
293
294	cmpl	$0,_npxproc
295	je	i586_bz3
296	frstor	0(%esp)
297	addl	$108,%esp
298	lmsw	%ax
299	movb	$0xfe,kernel_fpu_lock
300	ret
301
302i586_bz3:
303	fstpl	%st(0)
304	lmsw	%ax
305	movb	$0xfe,kernel_fpu_lock
306	ret
307
308intreg_i586_bzero:
309	/*
310	 * `rep stos' seems to be the best method in practice for small
311	 * counts.  Fancy methods usually take too long to start up due
312	 * to cache and BTB misses.
313	 */
314	pushl	%edi
315	movl	%edx,%edi
316	xorl	%eax,%eax
317	shrl	$2,%ecx
318	cld
319	rep
320	stosl
321	movl	12(%esp),%ecx
322	andl	$3,%ecx
323	jne	1f
324	popl	%edi
325	ret
326
3271:
328	rep
329	stosb
330	popl	%edi
331	ret
332#endif /* I586_CPU */
333
334/* fillw(pat, base, cnt) */
335ENTRY(fillw)
336	pushl	%edi
337	movl	8(%esp),%eax
338	movl	12(%esp),%edi
339	movl	16(%esp),%ecx
340	cld
341	rep
342	stosw
343	popl	%edi
344	ret
345
346ENTRY(bcopyb)
347bcopyb:
348	pushl	%esi
349	pushl	%edi
350	movl	12(%esp),%esi
351	movl	16(%esp),%edi
352	movl	20(%esp),%ecx
353	movl	%edi,%eax
354	subl	%esi,%eax
355	cmpl	%ecx,%eax			/* overlapping && src < dst? */
356	jb	1f
357	cld					/* nope, copy forwards */
358	rep
359	movsb
360	popl	%edi
361	popl	%esi
362	ret
363
364	ALIGN_TEXT
3651:
366	addl	%ecx,%edi			/* copy backwards. */
367	addl	%ecx,%esi
368	decl	%edi
369	decl	%esi
370	std
371	rep
372	movsb
373	popl	%edi
374	popl	%esi
375	cld
376	ret
377
378ENTRY(bcopy)
379	MEXITCOUNT
380	jmp	*_bcopy_vector
381
382ENTRY(ovbcopy)
383	MEXITCOUNT
384	jmp	*_ovbcopy_vector
385
386/*
387 * generic_bcopy(src, dst, cnt)
388 *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
389 */
390ENTRY(generic_bcopy)
391	pushl	%esi
392	pushl	%edi
393	movl	12(%esp),%esi
394	movl	16(%esp),%edi
395	movl	20(%esp),%ecx
396
397	movl	%edi,%eax
398	subl	%esi,%eax
399	cmpl	%ecx,%eax			/* overlapping && src < dst? */
400	jb	1f
401
402	shrl	$2,%ecx				/* copy by 32-bit words */
403	cld					/* nope, copy forwards */
404	rep
405	movsl
406	movl	20(%esp),%ecx
407	andl	$3,%ecx				/* any bytes left? */
408	rep
409	movsb
410	popl	%edi
411	popl	%esi
412	ret
413
414	ALIGN_TEXT
4151:
416	addl	%ecx,%edi			/* copy backwards */
417	addl	%ecx,%esi
418	decl	%edi
419	decl	%esi
420	andl	$3,%ecx				/* any fractional bytes? */
421	std
422	rep
423	movsb
424	movl	20(%esp),%ecx			/* copy remainder by 32-bit words */
425	shrl	$2,%ecx
426	subl	$3,%esi
427	subl	$3,%edi
428	rep
429	movsl
430	popl	%edi
431	popl	%esi
432	cld
433	ret
434
435#ifdef I586_CPU
436ENTRY(i586_bcopy)
437	pushl	%esi
438	pushl	%edi
439	movl	12(%esp),%esi
440	movl	16(%esp),%edi
441	movl	20(%esp),%ecx
442
443	movl	%edi,%eax
444	subl	%esi,%eax
445	cmpl	%ecx,%eax			/* overlapping && src < dst? */
446	jb	1f
447
448	cmpl	$1024,%ecx
449	jb	small_i586_bcopy
450
451	sarb	$1,kernel_fpu_lock
452	jc	small_i586_bcopy
453	cmpl	$0,_npxproc
454	je	i586_bc1
455	smsw	%dx
456	clts
457	subl	$108,%esp
458	fnsave	0(%esp)
459	jmp	4f
460
461i586_bc1:
462	smsw	%dx
463	clts
464	fninit				/* XXX should avoid needing this */
465
466	ALIGN_TEXT
4674:
468	pushl	%ecx
469#define	DCACHE_SIZE	8192
470	cmpl	$(DCACHE_SIZE-512)/2,%ecx
471	jbe	2f
472	movl	$(DCACHE_SIZE-512)/2,%ecx
4732:
474	subl	%ecx,0(%esp)
475	cmpl	$256,%ecx
476	jb	5f			/* XXX should prefetch if %ecx >= 32 */
477	pushl	%esi
478	pushl	%ecx
479	ALIGN_TEXT
4803:
481	movl	0(%esi),%eax
482	movl	32(%esi),%eax
483	movl	64(%esi),%eax
484	movl	96(%esi),%eax
485	movl	128(%esi),%eax
486	movl	160(%esi),%eax
487	movl	192(%esi),%eax
488	movl	224(%esi),%eax
489	addl	$256,%esi
490	subl	$256,%ecx
491	cmpl	$256,%ecx
492	jae	3b
493	popl	%ecx
494	popl	%esi
4955:
496	ALIGN_TEXT
497large_i586_bcopy_loop:
498	fildq	0(%esi)
499	fildq	8(%esi)
500	fildq	16(%esi)
501	fildq	24(%esi)
502	fildq	32(%esi)
503	fildq	40(%esi)
504	fildq	48(%esi)
505	fildq	56(%esi)
506	fistpq	56(%edi)
507	fistpq	48(%edi)
508	fistpq	40(%edi)
509	fistpq	32(%edi)
510	fistpq	24(%edi)
511	fistpq	16(%edi)
512	fistpq	8(%edi)
513	fistpq	0(%edi)
514	addl	$64,%esi
515	addl	$64,%edi
516	subl	$64,%ecx
517	cmpl	$64,%ecx
518	jae	large_i586_bcopy_loop
519	popl	%eax
520	addl	%eax,%ecx
521	cmpl	$64,%ecx
522	jae	4b
523
524	cmpl	$0,_npxproc
525	je	i586_bc2
526	frstor	0(%esp)
527	addl	$108,%esp
528i586_bc2:
529	lmsw	%dx
530	movb	$0xfe,kernel_fpu_lock
531
532/*
533 * This is a duplicate of the main part of generic_bcopy.  See the comments
534 * there.  Jumping into generic_bcopy would cost a whole 0-1 cycles and
535 * would mess up high resolution profiling.
536 */
537	ALIGN_TEXT
538small_i586_bcopy:
539	shrl	$2,%ecx
540	cld
541	rep
542	movsl
543	movl	20(%esp),%ecx
544	andl	$3,%ecx
545	rep
546	movsb
547	popl	%edi
548	popl	%esi
549	ret
550
551	ALIGN_TEXT
5521:
553	addl	%ecx,%edi
554	addl	%ecx,%esi
555	decl	%edi
556	decl	%esi
557	andl	$3,%ecx
558	std
559	rep
560	movsb
561	movl	20(%esp),%ecx
562	shrl	$2,%ecx
563	subl	$3,%esi
564	subl	$3,%edi
565	rep
566	movsl
567	popl	%edi
568	popl	%esi
569	cld
570	ret
571#endif /* I586_CPU */
572
573/*
574 * Note: memcpy does not support overlapping copies
575 */
576ENTRY(memcpy)
577	pushl	%edi
578	pushl	%esi
579	movl	12(%esp),%edi
580	movl	16(%esp),%esi
581	movl	20(%esp),%ecx
582	movl	%edi,%eax
583	shrl	$2,%ecx				/* copy by 32-bit words */
584	cld					/* nope, copy forwards */
585	rep
586	movsl
587	movl	20(%esp),%ecx
588	andl	$3,%ecx				/* any bytes left? */
589	rep
590	movsb
591	popl	%esi
592	popl	%edi
593	ret
594
595
596/*****************************************************************************/
597/* copyout and fubyte family                                                 */
598/*****************************************************************************/
599/*
600 * Access user memory from inside the kernel. These routines and possibly
601 * the math- and DOS emulators should be the only places that do this.
602 *
603 * We have to access the memory with user's permissions, so use a segment
604 * selector with RPL 3. For writes to user space we have to additionally
605 * check the PTE for write permission, because the 386 does not check
606 * write permissions when we are executing with EPL 0. The 486 does check
607 * this if the WP bit is set in CR0, so we can use a simpler version here.
608 *
609 * These routines set curpcb->onfault for the time they execute. When a
610 * protection violation occurs inside the functions, the trap handler
611 * returns to *curpcb->onfault instead of the function.
612 */
613
614/* copyout(from_kernel, to_user, len) */
615ENTRY(copyout)
616	MEXITCOUNT
617	jmp	*_copyout_vector
618
619ENTRY(generic_copyout)
620	movl	_curpcb,%eax
621	movl	$copyout_fault,PCB_ONFAULT(%eax)
622	pushl	%esi
623	pushl	%edi
624	pushl	%ebx
625	movl	16(%esp),%esi
626	movl	20(%esp),%edi
627	movl	24(%esp),%ebx
628	testl	%ebx,%ebx			/* anything to do? */
629	jz	done_copyout
630
631	/*
632	 * Check explicitly for non-user addresses.  If 486 write protection
633	 * is being used, this check is essential because we are in kernel
634	 * mode so the h/w does not provide any protection against writing
635	 * kernel addresses.
636	 */
637
638	/*
639	 * First, prevent address wrapping.
640	 */
641	movl	%edi,%eax
642	addl	%ebx,%eax
643	jc	copyout_fault
644/*
645 * XXX STOP USING VM_MAXUSER_ADDRESS.
646 * It is an end address, not a max, so every time it is used correctly it
647 * looks like there is an off by one error, and of course it caused an off
648 * by one error in several places.
649 */
650	cmpl	$VM_MAXUSER_ADDRESS,%eax
651	ja	copyout_fault
652
653#if defined(I386_CPU)
654
655#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
656	cmpl	$CPUCLASS_386,_cpu_class
657	jne	3f
658#endif
659/*
660 * We have to check each PTE for user write permission.
661 * The checking may cause a page fault, so it is important to set
662 * up everything for return via copyout_fault before here.
663 */
664	/* compute number of pages */
665	movl	%edi,%ecx
666	andl	$PAGE_MASK,%ecx
667	addl	%ebx,%ecx
668	decl	%ecx
669	shrl	$IDXSHIFT+2,%ecx
670	incl	%ecx
671
672	/* compute PTE offset for start address */
673	movl	%edi,%edx
674	shrl	$IDXSHIFT,%edx
675	andb	$0xfc,%dl
676
6771:	/* check PTE for each page */
678	movb	_PTmap(%edx),%al
679	andb	$0x07,%al			/* Pages must be VALID + USERACC + WRITABLE */
680	cmpb	$0x07,%al
681	je	2f
682
683	/* simulate a trap */
684	pushl	%edx
685	pushl	%ecx
686	shll	$IDXSHIFT,%edx
687	pushl	%edx
688	call	_trapwrite			/* trapwrite(addr) */
689	popl	%edx
690	popl	%ecx
691	popl	%edx
692
693	testl	%eax,%eax			/* if not ok, return EFAULT */
694	jnz	copyout_fault
695
6962:
697	addl	$4,%edx
698	decl	%ecx
699	jnz	1b				/* check next page */
700#endif /* I386_CPU */
701
702	/* bcopy(%esi, %edi, %ebx) */
7033:
704	movl	%ebx,%ecx
705
706#ifdef I586_CPU
707	ALIGN_TEXT
708slow_copyout:
709#endif
710	shrl	$2,%ecx
711	cld
712	rep
713	movsl
714	movb	%bl,%cl
715	andb	$3,%cl
716	rep
717	movsb
718
719done_copyout:
720	popl	%ebx
721	popl	%edi
722	popl	%esi
723	xorl	%eax,%eax
724	movl	_curpcb,%edx
725	movl	%eax,PCB_ONFAULT(%edx)
726	ret
727
728	ALIGN_TEXT
729copyout_fault:
730	popl	%ebx
731	popl	%edi
732	popl	%esi
733	movl	_curpcb,%edx
734	movl	$0,PCB_ONFAULT(%edx)
735	movl	$EFAULT,%eax
736	ret
737
738#ifdef I586_CPU
739ENTRY(i586_copyout)
740	/*
741	 * Duplicated from generic_copyout.  Could be done a bit better.
742	 */
743	movl	_curpcb,%eax
744	movl	$copyout_fault,PCB_ONFAULT(%eax)
745	pushl	%esi
746	pushl	%edi
747	pushl	%ebx
748	movl	16(%esp),%esi
749	movl	20(%esp),%edi
750	movl	24(%esp),%ebx
751	testl	%ebx,%ebx			/* anything to do? */
752	jz	done_copyout
753
754	/*
755	 * Check explicitly for non-user addresses.  If 486 write protection
756	 * is being used, this check is essential because we are in kernel
757	 * mode so the h/w does not provide any protection against writing
758	 * kernel addresses.
759	 */
760
761	/*
762	 * First, prevent address wrapping.
763	 */
764	movl	%edi,%eax
765	addl	%ebx,%eax
766	jc	copyout_fault
767/*
768 * XXX STOP USING VM_MAXUSER_ADDRESS.
769 * It is an end address, not a max, so every time it is used correctly it
770 * looks like there is an off by one error, and of course it caused an off
771 * by one error in several places.
772 */
773	cmpl	$VM_MAXUSER_ADDRESS,%eax
774	ja	copyout_fault
775
776	/* bcopy(%esi, %edi, %ebx) */
7773:
778	movl	%ebx,%ecx
779	/*
780	 * End of duplicated code.
781	 */
782
783	cmpl	$1024,%ecx
784	jb	slow_copyout
785
786	pushl	%ecx
787	call	_fastmove
788	addl	$4,%esp
789	jmp	done_copyout
790#endif /* I586_CPU */
791
792/* copyin(from_user, to_kernel, len) */
793ENTRY(copyin)
794	MEXITCOUNT
795	jmp	*_copyin_vector
796
797ENTRY(generic_copyin)
798	movl	_curpcb,%eax
799	movl	$copyin_fault,PCB_ONFAULT(%eax)
800	pushl	%esi
801	pushl	%edi
802	movl	12(%esp),%esi			/* caddr_t from */
803	movl	16(%esp),%edi			/* caddr_t to */
804	movl	20(%esp),%ecx			/* size_t  len */
805
806	/*
807	 * make sure address is valid
808	 */
809	movl	%esi,%edx
810	addl	%ecx,%edx
811	jc	copyin_fault
812	cmpl	$VM_MAXUSER_ADDRESS,%edx
813	ja	copyin_fault
814
815#ifdef I586_CPU
816	ALIGN_TEXT
817slow_copyin:
818#endif
819	movb	%cl,%al
820	shrl	$2,%ecx				/* copy longword-wise */
821	cld
822	rep
823	movsl
824	movb	%al,%cl
825	andb	$3,%cl				/* copy remaining bytes */
826	rep
827	movsb
828
829#if defined(I586_CPU)
830	ALIGN_TEXT
831done_copyin:
832#endif /* I586_CPU */
833	popl	%edi
834	popl	%esi
835	xorl	%eax,%eax
836	movl	_curpcb,%edx
837	movl	%eax,PCB_ONFAULT(%edx)
838	ret
839
840	ALIGN_TEXT
841copyin_fault:
842	popl	%edi
843	popl	%esi
844	movl	_curpcb,%edx
845	movl	$0,PCB_ONFAULT(%edx)
846	movl	$EFAULT,%eax
847	ret
848
849#ifdef I586_CPU
850ENTRY(i586_copyin)
851	/*
852	 * Duplicated from generic_copyin.  Could be done a bit better.
853	 */
854	movl	_curpcb,%eax
855	movl	$copyin_fault,PCB_ONFAULT(%eax)
856	pushl	%esi
857	pushl	%edi
858	movl	12(%esp),%esi			/* caddr_t from */
859	movl	16(%esp),%edi			/* caddr_t to */
860	movl	20(%esp),%ecx			/* size_t  len */
861
862	/*
863	 * make sure address is valid
864	 */
865	movl	%esi,%edx
866	addl	%ecx,%edx
867	jc	copyin_fault
868	cmpl	$VM_MAXUSER_ADDRESS,%edx
869	ja	copyin_fault
870	/*
871	 * End of duplicated code.
872	 */
873
874	cmpl	$1024,%ecx
875	jb	slow_copyin
876
877	pushl	%ebx			/* XXX prepare for fastmove_fault */
878	pushl	%ecx
879	call	_fastmove
880	addl	$8,%esp
881	jmp	done_copyin
882#endif /* I586_CPU */
883
884#if defined(I586_CPU)
885/* fastmove(src, dst, len)
886	src in %esi
887	dst in %edi
888	len in %ecx		XXX changed to on stack for profiling
889	uses %eax and %edx for tmp. storage
890 */
891/* XXX use ENTRY() to get profiling.  fastmove() is actually a non-entry. */
892ENTRY(fastmove)
893	pushl	%ebp
894	movl	%esp,%ebp
895	subl	$PCB_SAVEFPU_SIZE+3*4,%esp
896	movl	_curpcb,%eax
897	movl	$fastmove_fault,PCB_ONFAULT(%eax)
898
899	movl	8(%ebp),%ecx
900	cmpl	$63,%ecx
901	jbe	fastmove_tail
902
903	testl	$7,%esi	/* check if src addr is multiple of 8 */
904	jnz	fastmove_tail
905
906	testl	$7,%edi	/* check if dst addr is multiple of 8 */
907	jnz	fastmove_tail
908
909/* if (npxproc != NULL) { */
910	cmpl	$0,_npxproc
911	je	6f
912/*    fnsave(&curpcb->pcb_savefpu); */
913	movl	_curpcb,%eax
914	fnsave	PCB_SAVEFPU(%eax)
915/*   npxproc = NULL; */
916	movl	$0,_npxproc
917/* } */
9186:
919/* now we own the FPU. */
920
921/*
922 * The process' FP state is saved in the pcb, but if we get
923 * switched, the cpu_switch() will store our FP state in the
924 * pcb.  It should be possible to avoid all the copying for
925 * this, e.g., by setting a flag to tell cpu_switch() to
926 * save the state somewhere else.
927 */
928/* tmp = curpcb->pcb_savefpu; */
929	movl	%ecx,-12(%ebp)
930	movl	%esi,-8(%ebp)
931	movl	%edi,-4(%ebp)
932	movl	%esp,%edi
933	movl	_curpcb,%esi
934	addl	$PCB_SAVEFPU,%esi
935	cld
936	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
937	rep
938	movsl
939	movl	-12(%ebp),%ecx
940	movl	-8(%ebp),%esi
941	movl	-4(%ebp),%edi
942/* stop_emulating(); */
943	clts
944/* npxproc = curproc; */
945	movl	_curproc,%eax
946	movl	%eax,_npxproc
9474:
948	movl	%ecx,-12(%ebp)
949	cmpl	$1792,%ecx
950	jbe	2f
951	movl	$1792,%ecx
9522:
953	subl	%ecx,-12(%ebp)
954	cmpl	$256,%ecx
955	jb	5f
956	movl	%ecx,-8(%ebp)
957	movl	%esi,-4(%ebp)
958	ALIGN_TEXT
9593:
960	movl	0(%esi),%eax
961	movl	32(%esi),%eax
962	movl	64(%esi),%eax
963	movl	96(%esi),%eax
964	movl	128(%esi),%eax
965	movl	160(%esi),%eax
966	movl	192(%esi),%eax
967	movl	224(%esi),%eax
968	addl	$256,%esi
969	subl	$256,%ecx
970	cmpl	$256,%ecx
971	jae	3b
972	movl	-8(%ebp),%ecx
973	movl	-4(%ebp),%esi
9745:
975	ALIGN_TEXT
976fastmove_loop:
977	fildq	0(%esi)
978	fildq	8(%esi)
979	fildq	16(%esi)
980	fildq	24(%esi)
981	fildq	32(%esi)
982	fildq	40(%esi)
983	fildq	48(%esi)
984	fildq	56(%esi)
985	fistpq	56(%edi)
986	fistpq	48(%edi)
987	fistpq	40(%edi)
988	fistpq	32(%edi)
989	fistpq	24(%edi)
990	fistpq	16(%edi)
991	fistpq	8(%edi)
992	fistpq	0(%edi)
993	addl	$-64,%ecx
994	addl	$64,%esi
995	addl	$64,%edi
996	cmpl	$63,%ecx
997	ja	fastmove_loop
998	movl	-12(%ebp),%eax
999	addl	%eax,%ecx
1000	cmpl	$64,%ecx
1001	jae	4b
1002
1003/* curpcb->pcb_savefpu = tmp; */
1004	movl	%ecx,-12(%ebp)
1005	movl	%esi,-8(%ebp)
1006	movl	%edi,-4(%ebp)
1007	movl	_curpcb,%edi
1008	addl	$PCB_SAVEFPU,%edi
1009	movl	%esp,%esi
1010	cld
1011	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
1012	rep
1013	movsl
1014	movl	-12(%ebp),%ecx
1015	movl	-8(%ebp),%esi
1016	movl	-4(%ebp),%edi
1017
1018/* start_emulating(); */
1019	smsw	%ax
1020	orb	$CR0_TS,%al
1021	lmsw	%ax
1022/* npxproc = NULL; */
1023	movl	$0,_npxproc
1024
1025	ALIGN_TEXT
1026fastmove_tail:
1027	movb	%cl,%al
1028	shrl	$2,%ecx				/* copy longword-wise */
1029	cld
1030	rep
1031	movsl
1032	movb	%al,%cl
1033	andb	$3,%cl				/* copy remaining bytes */
1034	rep
1035	movsb
1036
1037	movl	%ebp,%esp
1038	popl	%ebp
1039	ret
1040
1041	ALIGN_TEXT
1042fastmove_fault:
1043	movl	%ebp,%esp
1044	popl	%ebp
1045	addl	$8,%esp
1046	popl	%ebx
1047	popl	%edi
1048	popl	%esi
1049	movl	_curpcb,%edx
1050	movl	$0,PCB_ONFAULT(%edx)
1051	movl	$EFAULT,%eax
1052	ret
1053#endif /* I586_CPU */
1054
1055/*
1056 * fu{byte,sword,word} : fetch a byte (sword, word) from user memory
1057 */
1058ENTRY(fuword)
1059	movl	_curpcb,%ecx
1060	movl	$fusufault,PCB_ONFAULT(%ecx)
1061	movl	4(%esp),%edx			/* from */
1062
1063	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address is valid */
1064	ja	fusufault
1065
1066	movl	(%edx),%eax
1067	movl	$0,PCB_ONFAULT(%ecx)
1068	ret
1069
1070/*
1071 * These two routines are called from the profiling code, potentially
1072 * at interrupt time. If they fail, that's okay, good things will
1073 * happen later. Fail all the time for now - until the trap code is
1074 * able to deal with this.
1075 */
1076ALTENTRY(suswintr)
1077ENTRY(fuswintr)
1078	movl	$-1,%eax
1079	ret
1080
1081ENTRY(fusword)
1082	movl	_curpcb,%ecx
1083	movl	$fusufault,PCB_ONFAULT(%ecx)
1084	movl	4(%esp),%edx
1085
1086	cmpl	$VM_MAXUSER_ADDRESS-2,%edx
1087	ja	fusufault
1088
1089	movzwl	(%edx),%eax
1090	movl	$0,PCB_ONFAULT(%ecx)
1091	ret
1092
1093ENTRY(fubyte)
1094	movl	_curpcb,%ecx
1095	movl	$fusufault,PCB_ONFAULT(%ecx)
1096	movl	4(%esp),%edx
1097
1098	cmpl	$VM_MAXUSER_ADDRESS-1,%edx
1099	ja	fusufault
1100
1101	movzbl	(%edx),%eax
1102	movl	$0,PCB_ONFAULT(%ecx)
1103	ret
1104
1105	ALIGN_TEXT
1106fusufault:
1107	movl	_curpcb,%ecx
1108	xorl	%eax,%eax
1109	movl	%eax,PCB_ONFAULT(%ecx)
1110	decl	%eax
1111	ret
1112
1113/*
1114 * su{byte,sword,word}: write a byte (word, longword) to user memory
1115 */
1116ENTRY(suword)
1117	movl	_curpcb,%ecx
1118	movl	$fusufault,PCB_ONFAULT(%ecx)
1119	movl	4(%esp),%edx
1120
1121#if defined(I386_CPU)
1122
1123#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
1124	cmpl	$CPUCLASS_386,_cpu_class
1125	jne	2f				/* we only have to set the right segment selector */
1126#endif /* I486_CPU || I586_CPU || I686_CPU */
1127
1128	/* XXX - page boundary crossing is still not handled */
1129	movl	%edx,%eax
1130	shrl	$IDXSHIFT,%edx
1131	andb	$0xfc,%dl
1132	movb	_PTmap(%edx),%dl
1133	andb	$0x7,%dl			/* must be VALID + USERACC + WRITE */
1134	cmpb	$0x7,%dl
1135	je	1f
1136
1137	/* simulate a trap */
1138	pushl	%eax
1139	call	_trapwrite
1140	popl	%edx				/* remove junk parameter from stack */
1141	movl	_curpcb,%ecx			/* restore trashed register */
1142	testl	%eax,%eax
1143	jnz	fusufault
11441:
1145	movl	4(%esp),%edx
1146#endif
1147
11482:
1149	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address validity */
1150	ja	fusufault
1151
1152	movl	8(%esp),%eax
1153	movl	%eax,(%edx)
1154	xorl	%eax,%eax
1155	movl	%eax,PCB_ONFAULT(%ecx)
1156	ret
1157
1158ENTRY(susword)
1159	movl	_curpcb,%ecx
1160	movl	$fusufault,PCB_ONFAULT(%ecx)
1161	movl	4(%esp),%edx
1162
1163#if defined(I386_CPU)
1164
1165#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
1166	cmpl	$CPUCLASS_386,_cpu_class
1167	jne	2f
1168#endif /* I486_CPU || I586_CPU || I686_CPU */
1169
1170	/* XXX - page boundary crossing is still not handled */
1171	movl	%edx,%eax
1172	shrl	$IDXSHIFT,%edx
1173	andb	$0xfc,%dl
1174	movb	_PTmap(%edx),%dl
1175	andb	$0x7,%dl			/* must be VALID + USERACC + WRITE */
1176	cmpb	$0x7,%dl
1177	je	1f
1178
1179	/* simulate a trap */
1180	pushl	%eax
1181	call	_trapwrite
1182	popl	%edx				/* remove junk parameter from stack */
1183	movl	_curpcb,%ecx			/* restore trashed register */
1184	testl	%eax,%eax
1185	jnz	fusufault
11861:
1187	movl	4(%esp),%edx
1188#endif
1189
11902:
1191	cmpl	$VM_MAXUSER_ADDRESS-2,%edx	/* verify address validity */
1192	ja	fusufault
1193
1194	movw	8(%esp),%ax
1195	movw	%ax,(%edx)
1196	xorl	%eax,%eax
1197	movl	%eax,PCB_ONFAULT(%ecx)
1198	ret
1199
1200ALTENTRY(suibyte)
1201ENTRY(subyte)
1202	movl	_curpcb,%ecx
1203	movl	$fusufault,PCB_ONFAULT(%ecx)
1204	movl	4(%esp),%edx
1205
1206#if defined(I386_CPU)
1207
1208#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
1209	cmpl	$CPUCLASS_386,_cpu_class
1210	jne	2f
1211#endif /* I486_CPU || I586_CPU || I686_CPU */
1212
1213	movl	%edx,%eax
1214	shrl	$IDXSHIFT,%edx
1215	andb	$0xfc,%dl
1216	movb	_PTmap(%edx),%dl
1217	andb	$0x7,%dl			/* must be VALID + USERACC + WRITE */
1218	cmpb	$0x7,%dl
1219	je	1f
1220
1221	/* simulate a trap */
1222	pushl	%eax
1223	call	_trapwrite
1224	popl	%edx				/* remove junk parameter from stack */
1225	movl	_curpcb,%ecx			/* restore trashed register */
1226	testl	%eax,%eax
1227	jnz	fusufault
12281:
1229	movl	4(%esp),%edx
1230#endif
1231
12322:
1233	cmpl	$VM_MAXUSER_ADDRESS-1,%edx	/* verify address validity */
1234	ja	fusufault
1235
1236	movb	8(%esp),%al
1237	movb	%al,(%edx)
1238	xorl	%eax,%eax
1239	movl	%eax,PCB_ONFAULT(%ecx)
1240	ret
1241
1242/*
1243 * copyinstr(from, to, maxlen, int *lencopied)
1244 *	copy a string from from to to, stop when a 0 character is reached.
1245 *	return ENAMETOOLONG if string is longer than maxlen, and
1246 *	EFAULT on protection violations. If lencopied is non-zero,
1247 *	return the actual length in *lencopied.
1248 */
1249ENTRY(copyinstr)
1250	pushl	%esi
1251	pushl	%edi
1252	movl	_curpcb,%ecx
1253	movl	$cpystrflt,PCB_ONFAULT(%ecx)
1254
1255	movl	12(%esp),%esi			/* %esi = from */
1256	movl	16(%esp),%edi			/* %edi = to */
1257	movl	20(%esp),%edx			/* %edx = maxlen */
1258
1259	movl	$VM_MAXUSER_ADDRESS,%eax
1260
1261	/* make sure 'from' is within bounds */
1262	subl	%esi,%eax
1263	jbe	cpystrflt
1264
1265	/* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */
1266	cmpl	%edx,%eax
1267	jae	1f
1268	movl	%eax,%edx
1269	movl	%eax,20(%esp)
12701:
1271	incl	%edx
1272	cld
1273
12742:
1275	decl	%edx
1276	jz	3f
1277
1278	lodsb
1279	stosb
1280	orb	%al,%al
1281	jnz	2b
1282
1283	/* Success -- 0 byte reached */
1284	decl	%edx
1285	xorl	%eax,%eax
1286	jmp	cpystrflt_x
12873:
1288	/* edx is zero - return ENAMETOOLONG or EFAULT */
1289	cmpl	$VM_MAXUSER_ADDRESS,%esi
1290	jae	cpystrflt
12914:
1292	movl	$ENAMETOOLONG,%eax
1293	jmp	cpystrflt_x
1294
1295cpystrflt:
1296	movl	$EFAULT,%eax
1297
1298cpystrflt_x:
1299	/* set *lencopied and return %eax */
1300	movl	_curpcb,%ecx
1301	movl	$0,PCB_ONFAULT(%ecx)
1302	movl	20(%esp),%ecx
1303	subl	%edx,%ecx
1304	movl	24(%esp),%edx
1305	testl	%edx,%edx
1306	jz	1f
1307	movl	%ecx,(%edx)
13081:
1309	popl	%edi
1310	popl	%esi
1311	ret
1312
1313
1314/*
1315 * copystr(from, to, maxlen, int *lencopied)
1316 */
1317ENTRY(copystr)
1318	pushl	%esi
1319	pushl	%edi
1320
1321	movl	12(%esp),%esi			/* %esi = from */
1322	movl	16(%esp),%edi			/* %edi = to */
1323	movl	20(%esp),%edx			/* %edx = maxlen */
1324	incl	%edx
1325	cld
13261:
1327	decl	%edx
1328	jz	4f
1329	lodsb
1330	stosb
1331	orb	%al,%al
1332	jnz	1b
1333
1334	/* Success -- 0 byte reached */
1335	decl	%edx
1336	xorl	%eax,%eax
1337	jmp	6f
13384:
1339	/* edx is zero -- return ENAMETOOLONG */
1340	movl	$ENAMETOOLONG,%eax
1341
13426:
1343	/* set *lencopied and return %eax */
1344	movl	20(%esp),%ecx
1345	subl	%edx,%ecx
1346	movl	24(%esp),%edx
1347	testl	%edx,%edx
1348	jz	7f
1349	movl	%ecx,(%edx)
13507:
1351	popl	%edi
1352	popl	%esi
1353	ret
1354
1355ENTRY(bcmp)
1356	pushl	%edi
1357	pushl	%esi
1358	movl	12(%esp),%edi
1359	movl	16(%esp),%esi
1360	movl	20(%esp),%edx
1361	xorl	%eax,%eax
1362
1363	movl	%edx,%ecx
1364	shrl	$2,%ecx
1365	cld					/* compare forwards */
1366	repe
1367	cmpsl
1368	jne	1f
1369
1370	movl	%edx,%ecx
1371	andl	$3,%ecx
1372	repe
1373	cmpsb
1374	je	2f
13751:
1376	incl	%eax
13772:
1378	popl	%esi
1379	popl	%edi
1380	ret
1381
1382
1383/*
1384 * Handling of special 386 registers and descriptor tables etc
1385 */
1386/* void lgdt(struct region_descriptor *rdp); */
1387ENTRY(lgdt)
1388	/* reload the descriptor table */
1389	movl	4(%esp),%eax
1390	lgdt	(%eax)
1391
1392	/* flush the prefetch q */
1393	jmp	1f
1394	nop
13951:
1396	/* reload "stale" selectors */
1397	movl	$KDSEL,%eax
1398	movl	%ax,%ds
1399	movl	%ax,%es
1400	movl	%ax,%ss
1401
1402	/* reload code selector by turning return into intersegmental return */
1403	movl	(%esp),%eax
1404	pushl	%eax
1405#	movl	$KCSEL,4(%esp)
1406	movl	$8,4(%esp)
1407	lret
1408
1409/*
1410 * void lidt(struct region_descriptor *rdp);
1411 */
1412ENTRY(lidt)
1413	movl	4(%esp),%eax
1414	lidt	(%eax)
1415	ret
1416
1417/*
1418 * void lldt(u_short sel)
1419 */
1420ENTRY(lldt)
1421	lldt	4(%esp)
1422	ret
1423
1424/*
1425 * void ltr(u_short sel)
1426 */
1427ENTRY(ltr)
1428	ltr	4(%esp)
1429	ret
1430
1431/* ssdtosd(*ssdp,*sdp) */
1432ENTRY(ssdtosd)
1433	pushl	%ebx
1434	movl	8(%esp),%ecx
1435	movl	8(%ecx),%ebx
1436	shll	$16,%ebx
1437	movl	(%ecx),%edx
1438	roll	$16,%edx
1439	movb	%dh,%bl
1440	movb	%dl,%bh
1441	rorl	$8,%ebx
1442	movl	4(%ecx),%eax
1443	movw	%ax,%dx
1444	andl	$0xf0000,%eax
1445	orl	%eax,%ebx
1446	movl	12(%esp),%ecx
1447	movl	%edx,(%ecx)
1448	movl	%ebx,4(%ecx)
1449	popl	%ebx
1450	ret
1451
1452/* load_cr0(cr0) */
1453ENTRY(load_cr0)
1454	movl	4(%esp),%eax
1455	movl	%eax,%cr0
1456	ret
1457
1458/* rcr0() */
1459ENTRY(rcr0)
1460	movl	%cr0,%eax
1461	ret
1462
1463/* rcr3() */
1464ENTRY(rcr3)
1465	movl	%cr3,%eax
1466	ret
1467
1468/* void load_cr3(caddr_t cr3) */
1469ENTRY(load_cr3)
1470	movl	4(%esp),%eax
1471	movl	%eax,%cr3
1472	ret
1473
1474
1475/*****************************************************************************/
1476/* setjump, longjump                                                         */
1477/*****************************************************************************/
1478
1479ENTRY(setjmp)
1480	movl	4(%esp),%eax
1481	movl	%ebx,(%eax)			/* save ebx */
1482	movl	%esp,4(%eax)			/* save esp */
1483	movl	%ebp,8(%eax)			/* save ebp */
1484	movl	%esi,12(%eax)			/* save esi */
1485	movl	%edi,16(%eax)			/* save edi */
1486	movl	(%esp),%edx			/* get rta */
1487	movl	%edx,20(%eax)			/* save eip */
1488	xorl	%eax,%eax			/* return(0); */
1489	ret
1490
1491ENTRY(longjmp)
1492	movl	4(%esp),%eax
1493	movl	(%eax),%ebx			/* restore ebx */
1494	movl	4(%eax),%esp			/* restore esp */
1495	movl	8(%eax),%ebp			/* restore ebp */
1496	movl	12(%eax),%esi			/* restore esi */
1497	movl	16(%eax),%edi			/* restore edi */
1498	movl	20(%eax),%edx			/* get rta */
1499	movl	%edx,(%esp)			/* put in return frame */
1500	xorl	%eax,%eax			/* return(1); */
1501	incl	%eax
1502	ret
1503
1504/*
1505 * Here for doing BB-profiling (gcc -a).
1506 * We rely on the "bbset" instead, but need a dummy function.
1507 */
1508NON_GPROF_ENTRY(__bb_init_func)
1509	movl	4(%esp),%eax
1510	movl	$1,(%eax)
1511	.byte	0xc3				/* avoid macro for `ret' */
1512