/* support.s revision 21278 */
1/*-
2 * Copyright (c) 1993 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	$Id: support.s,v 1.45 1996/11/29 14:32:30 bde Exp $
34 */
35
36#include "opt_cpu.h"
37
38#include <machine/asmacros.h>
39#include <machine/cputypes.h>
40#include <machine/specialreg.h>
41
42#include "assym.s"
43
44#define KDSEL		0x10			/* kernel data selector */
45#define IDXSHIFT	10
46
	.data
/*
 * Run-time dispatch vectors: bcopy(), bzero(), copyin(), copyout() and
 * ovbcopy() jump indirectly through these.  They start out pointing at
 * the generic implementations; presumably CPU setup code elsewhere
 * repoints them at the i586_* versions on suitable hardware -- TODO
 * confirm against the CPU identification code.
 */
	.globl	_bcopy_vector
_bcopy_vector:
	.long	_generic_bcopy
	.globl	_bzero
_bzero:
	.long	_generic_bzero
	.globl	_copyin_vector
_copyin_vector:
	.long	_generic_copyin
	.globl	_copyout_vector
_copyout_vector:
	.long	_generic_copyout
	.globl	_ovbcopy_vector
_ovbcopy_vector:
	.long	_generic_bcopy
/*
 * Busy-flag byte guarding kernel use of the FPU (i586 bzero/bcopy).
 * Idle value is 0xfe: `sarb $1' shifts it to 0xff with CF clear, so the
 * first taker proceeds; while held, further `sarb $1' on 0xff sets CF
 * and the taker falls back to the integer path.  Released by storing
 * 0xfe again.
 */
kernel_fpu_lock:
	.byte	0xfe
	.space	3
66
67	.text
68
69/*
70 * bcopy family
71 * void bzero(void *buf, u_int len)
72 */
73
/*
 * void generic_bzero(void *buf, u_int len)
 *
 * Zero `len' bytes at `buf': bulk of the work with `rep stosl',
 * then up to three trailing bytes with `rep stosb'.
 * Clobbers %eax, %ecx, %edx (caller-saved); preserves %edi.
 */
ENTRY(generic_bzero)
	pushl	%edi
	movl	8(%esp),%edi		/* %edi = buf */
	movl	12(%esp),%edx		/* keep len in %edx for the tail */
	xorl	%eax,%eax		/* store zeros */
	cld
	movl	%edx,%ecx
	shrl	$2,%ecx			/* longword count */
	rep
	stosl
	movl	%edx,%ecx
	andl	$3,%ecx			/* 0..3 leftover bytes */
	rep
	stosb
	popl	%edi
	ret
89
90#if defined(I486_CPU)
/*
 * void i486_bzero(void *buf, u_int len)
 *
 * 486-tuned bzero: unrolled 32-bit stores by descending chunk size
 * (64, 16, 4 bytes), then a jump table for the final 0..3 bytes.
 * %edx = current store pointer, %ecx = bytes remaining, %eax = 0.
 */
ENTRY(i486_bzero)
	movl	4(%esp),%edx			/* buf */
	movl	8(%esp),%ecx			/* len */
	xorl	%eax,%eax
/*
 * do 64 byte chunks first
 *
 * XXX this is probably over-unrolled at least for DX2's
 */
2:
	cmpl	$64,%ecx
	jb	3f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	movl	%eax,16(%edx)
	movl	%eax,20(%edx)
	movl	%eax,24(%edx)
	movl	%eax,28(%edx)
	movl	%eax,32(%edx)
	movl	%eax,36(%edx)
	movl	%eax,40(%edx)
	movl	%eax,44(%edx)
	movl	%eax,48(%edx)
	movl	%eax,52(%edx)
	movl	%eax,56(%edx)
	movl	%eax,60(%edx)
	addl	$64,%edx
	subl	$64,%ecx
	jnz	2b				/* more left: recheck size at 2: */
	ret

/*
 * do 16 byte chunks
 */
	SUPERALIGN_TEXT
3:
	cmpl	$16,%ecx
	jb	4f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	addl	$16,%edx
	subl	$16,%ecx
	jnz	3b
	ret

/*
 * do 4 byte chunks
 */
	SUPERALIGN_TEXT
4:
	cmpl	$4,%ecx
	jb	5f
	movl	%eax,(%edx)
	addl	$4,%edx
	subl	$4,%ecx
	jnz	4b
	ret

/*
 * do 1 byte chunks
 * a jump table seems to be faster than a loop or more range reductions
 *
 * XXX need a const section for non-text
 */
	.data
jtab:
	.long	do0
	.long	do1
	.long	do2
	.long	do3

	.text
	SUPERALIGN_TEXT
5:
	/*
	 * %ecx is 0..3 here.  This is an indirect jump through the
	 * table, so it needs the AT&T `*' operand prefix, matching the
	 * other indirect jumps in this file (e.g. `jmp *_bcopy_vector').
	 */
	jmp	*jtab(,%ecx,4)

	SUPERALIGN_TEXT
do3:
	movw	%ax,(%edx)
	movb	%al,2(%edx)
	ret

	SUPERALIGN_TEXT
do2:
	movw	%ax,(%edx)
	ret

	SUPERALIGN_TEXT
do1:
	movb	%al,(%edx)
	ret

	SUPERALIGN_TEXT
do0:
	ret
190#endif
191
192#ifdef I586_CPU
/*
 * void i586_bzero(void *buf, u_int len)
 *
 * Pentium-tuned bzero.  Large buffers are zeroed 8 bytes at a time with
 * `fstl' (FPU registers double the store width); small buffers use the
 * integer `rep stos' path below.  %edx = buf, %ecx = len throughout.
 */
ENTRY(i586_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx

	/*
	 * The FPU register method is twice as fast as the integer register
	 * method unless the target is in the L1 cache and we pre-allocate a
	 * cache line for it (then the integer register method is 4-5 times
	 * faster).  However, we never pre-allocate cache lines, since that
	 * would make the integer method 25% or more slower for the common
	 * case when the target isn't in either the L1 cache or the L2 cache.
	 * Thus we normally use the FPU register method unless the overhead
	 * would be too large.
	 */
	cmpl	$256,%ecx	/* empirical; clts, fninit, smsw cost a lot */
	jb	intreg_i586_bzero

	/*
	 * The FPU registers may belong to an application or to fastmove()
	 * or to another invocation of bcopy() or ourself in a higher level
	 * interrupt or trap handler.  Preserving the registers is
	 * complicated since we avoid it if possible at all levels.  We
	 * want to localize the complications even when that increases them.
	 * Here the extra work involves preserving CR0_TS in TS.
	 * `npxproc != NULL' is supposed to be the condition that all the
	 * FPU resources belong to an application, but npxproc and CR0_TS
	 * aren't set atomically enough for this condition to work in
	 * interrupt handlers.
	 *
	 * Case 1: FPU registers belong to the application: we must preserve
	 * the registers if we use them, so we only use the FPU register
	 * method if the target size is large enough to amortize the extra
	 * overhead for preserving them.  CR0_TS must be preserved although
	 * it is very likely to end up as set.
	 *
	 * Case 2: FPU registers belong to fastmove(): fastmove() currently
	 * makes the registers look like they belong to an application so
	 * that cpu_switch() and savectx() don't have to know about it, so
	 * this case reduces to case 1.
	 *
	 * Case 3: FPU registers belong to the kernel: don't use the FPU
	 * register method.  This case is unlikely, and supporting it would
	 * be more complicated and might take too much stack.
	 *
	 * Case 4: FPU registers don't belong to anyone: the FPU registers
	 * don't need to be preserved, so we always use the FPU register
	 * method.  CR0_TS must be preserved although it is very likely to
	 * always end up as clear.
	 */
	cmpl	$0,_npxproc
	je	i586_bz1
	cmpl	$256+184,%ecx		/* empirical; not quite 2*108 more */
	jb	intreg_i586_bzero
	sarb	$1,kernel_fpu_lock	/* try-lock; CF set while held */
	jc	intreg_i586_bzero
	smsw	%ax			/* remember CR0_TS (in %ax) */
	clts
	subl	$108,%esp		/* 108 = FPU save-area size */
	fnsave	0(%esp)			/* preserve application FPU state */
	jmp	i586_bz2

i586_bz1:
	sarb	$1,kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	fninit				/* XXX should avoid needing this */
i586_bz2:
	fldz				/* st(0) = 0.0; its bit pattern is all zeros */

	/*
	 * Align to an 8 byte boundary (misalignment in the main loop would
	 * cost a factor of >= 2).  Avoid jumps (at little cost if it is
	 * already aligned) by always zeroing 8 bytes and using the part up
	 * to the _next_ alignment position.
	 */
	fstl	0(%edx)
	addl	%edx,%ecx		/* part of %ecx -= new_%edx - %edx */
	addl	$8,%edx
	andl	$~7,%edx
	subl	%edx,%ecx

	/*
	 * Similarly align `len' to a multiple of 8.
	 */
	fstl	-8(%edx,%ecx)
	decl	%ecx
	andl	$~7,%ecx

	/*
	 * This wouldn't be any faster if it were unrolled, since the loop
	 * control instructions are much faster than the fstl and/or done
	 * in parallel with it so their overhead is insignificant.
	 */
fpureg_i586_bzero_loop:
	fstl	0(%edx)
	addl	$8,%edx
	subl	$8,%ecx
	cmpl	$8,%ecx
	jae	fpureg_i586_bzero_loop

	cmpl	$0,_npxproc
	je	i586_bz3
	frstor	0(%esp)			/* give the FPU state back */
	addl	$108,%esp
	lmsw	%ax			/* restore saved CR0_TS */
	movb	$0xfe,kernel_fpu_lock	/* release lock (idle value) */
	ret

i586_bz3:
	fstpl	%st(0)			/* pop our fldz; leave stack empty */
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock
	ret

intreg_i586_bzero:
	/*
	 * `rep stos' seems to be the best method in practice for small
	 * counts.  Fancy methods usually take too long to start up due
	 * to cache and BTB misses.
	 */
	pushl	%edi
	movl	%edx,%edi
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx		/* reload len (8(%esp) + pushed %edi) */
	andl	$3,%ecx
	jne	1f
	popl	%edi
	ret

1:
	rep
	stosb
	popl	%edi
	ret
332#endif /* I586_CPU */
333
/*
 * void fillw(int pat, void *base, size_t cnt)
 *
 * Store `cnt' copies of the 16-bit pattern `pat' starting at `base'.
 * Clobbers %eax, %ecx; preserves %edi.
 */
ENTRY(fillw)
	pushl	%edi
	movl	12(%esp),%edi		/* destination */
	movl	16(%esp),%ecx		/* word count */
	movl	8(%esp),%eax		/* pattern (low 16 bits used) */
	cld
	rep
	stosw
	popl	%edi
	ret
345
/*
 * void bcopyb(const void *src, void *dst, size_t len)
 *
 * Overlap-safe byte-wise copy: forwards normally, backwards (DF set)
 * when the regions overlap with src below dst.  DF is restored before
 * returning.
 */
ENTRY(bcopyb)
bcopyb:
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx
	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f
	cld					/* nope, copy forwards */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards. */
	addl	%ecx,%esi
	decl	%edi				/* point at last byte of each */
	decl	%esi
	std
	rep
	movsb
	popl	%edi
	popl	%esi
	cld					/* restore the expected DF=0 */
	ret
377
/* bcopy(src, dst, len): dispatch through the per-CPU vector (.data above). */
ENTRY(bcopy)
	MEXITCOUNT
	jmp	*_bcopy_vector
381
/* ovbcopy(src, dst, len): overlap-safe bcopy, dispatched via its vector. */
ENTRY(ovbcopy)
	MEXITCOUNT
	jmp	*_ovbcopy_vector
385
/*
 * generic_bcopy(src, dst, cnt)
 *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
 *
 * Overlap-safe copy: longword-wise forwards in the common case;
 * backwards (bytes first, then longwords) when src < dst and the
 * regions overlap.  DF is restored to 0 before returning.
 */
ENTRY(generic_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx				/* any fractional bytes? */
	std
	rep
	movsb
	movl	20(%esp),%ecx			/* copy remainder by 32-bit words */
	shrl	$2,%ecx
	subl	$3,%esi				/* step back to last full longword */
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret
434
435#ifdef I586_CPU
/*
 * i586_bcopy(src, dst, cnt)
 *
 * Pentium-tuned bcopy.  Non-overlapping copies of >= 1024 bytes go
 * through the FPU (64 bytes per fildq/fistpq group), with read-ahead
 * passes sized to half the data cache to prime it.  Smaller or
 * overlapping copies fall back to the generic integer code below.
 * %dx holds the saved machine-status word (CR0_TS) across FPU use.
 */
ENTRY(i586_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	cmpl	$1024,%ecx
	jb	small_i586_bcopy

	sarb	$1,kernel_fpu_lock		/* try-lock; CF set while held */
	jc	small_i586_bcopy
	cmpl	$0,_npxproc
	je	i586_bc1
	smsw	%dx				/* save CR0_TS */
	clts
	subl	$108,%esp			/* FPU save-area */
	fnsave	0(%esp)				/* preserve application state */
	jmp	4f

i586_bc1:
	smsw	%dx
	clts
	fninit				/* XXX should avoid needing this */

	ALIGN_TEXT
4:
	pushl	%ecx				/* 0(%esp) = bytes beyond this pass */
#define	DCACHE_SIZE	8192
	cmpl	$(DCACHE_SIZE-512)/2,%ecx
	jbe	2f
	movl	$(DCACHE_SIZE-512)/2,%ecx	/* clamp pass to half the D-cache */
2:
	subl	%ecx,0(%esp)
	cmpl	$256,%ecx
	jb	5f			/* XXX should prefetch if %ecx >= 32 */
	pushl	%esi
	pushl	%ecx
	ALIGN_TEXT
3:
	/* touch one longword per 32-byte line to pull the source into cache */
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	popl	%ecx
	popl	%esi
5:
	ALIGN_TEXT
large_i586_bcopy_loop:
	/* move 64 bytes per iteration through the 8-deep FPU stack */
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$64,%esi
	addl	$64,%edi
	subl	$64,%ecx
	cmpl	$64,%ecx
	jae	large_i586_bcopy_loop
	popl	%eax				/* bytes left beyond this pass */
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b				/* start another cache-sized pass */

	cmpl	$0,_npxproc
	je	i586_bc2
	frstor	0(%esp)
	addl	$108,%esp
i586_bc2:
	lmsw	%dx				/* restore CR0_TS */
	movb	$0xfe,kernel_fpu_lock		/* release the lock */

/*
 * This is a duplicate of the main part of generic_bcopy.  See the comments
 * there.  Jumping into generic_bcopy would cost a whole 0-1 cycles and
 * would mess up high resolution profiling.
 */
	ALIGN_TEXT
small_i586_bcopy:
	shrl	$2,%ecx
	cld
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* overlapping: copy backwards */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx
	std
	rep
	movsb
	movl	20(%esp),%ecx
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret
571#endif /* I586_CPU */
572
/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * Note: memcpy does not support overlapping copies.
 * Returns dst (kept in %eax across the string ops), per the C contract.
 * Note the argument order differs from bcopy: dst first.
 */
ENTRY(memcpy)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi
	movl	16(%esp),%esi
	movl	20(%esp),%ecx
	movl	%edi,%eax			/* return value = dst */
	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* always forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%esi
	popl	%edi
	ret
594
595
596/*****************************************************************************/
597/* copyout and fubyte family                                                 */
598/*****************************************************************************/
599/*
600 * Access user memory from inside the kernel. These routines and possibly
601 * the math- and DOS emulators should be the only places that do this.
602 *
603 * We have to access the memory with user's permissions, so use a segment
604 * selector with RPL 3. For writes to user space we have to additionally
605 * check the PTE for write permission, because the 386 does not check
606 * write permissions when we are executing with EPL 0. The 486 does check
607 * this if the WP bit is set in CR0, so we can use a simpler version here.
608 *
609 * These routines set curpcb->onfault for the time they execute. When a
610 * protection violation occurs inside the functions, the trap handler
611 * returns to *curpcb->onfault instead of the function.
612 */
613
/* copyout(from_kernel, to_user, len): dispatch through the per-CPU vector. */
ENTRY(copyout)
	MEXITCOUNT
	jmp	*_copyout_vector
618
/*
 * int generic_copyout(from_kernel, to_user, len)
 *
 * Copy `len' bytes from kernel to user space with curpcb->pcb_onfault
 * armed, so a fault lands in copyout_fault and returns EFAULT instead
 * of panicking.  Returns 0 on success.  On a 386 (no CR0 WP in kernel
 * mode) each destination PTE is checked for user-writability first.
 */
ENTRY(generic_copyout)
	movl	_curpcb,%eax
	movl	$copyout_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ebx
	testl	%ebx,%ebx			/* anything to do? */
	jz	done_copyout

	/*
	 * Check explicitly for non-user addresses.  If 486 write protection
	 * is being used, this check is essential because we are in kernel
	 * mode so the h/w does not provide any protection against writing
	 * kernel addresses.
	 */

	/*
	 * First, prevent address wrapping.
	 */
	movl	%edi,%eax
	addl	%ebx,%eax
	jc	copyout_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
	cmpl	$VM_MAXUSER_ADDRESS,%eax
	ja	copyout_fault

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	3f
#endif
/*
 * We have to check each PTE for user write permission.
 * The checking may cause a page fault, so it is important to set
 * up everything for return via copyout_fault before here.
 */
	/* compute number of pages */
	movl	%edi,%ecx
	andl	$PAGE_MASK,%ecx
	addl	%ebx,%ecx
	decl	%ecx
	shrl	$IDXSHIFT+2,%ecx
	incl	%ecx

	/* compute PTE offset for start address */
	movl	%edi,%edx
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl			/* 4-byte-align the PTE index */

1:	/* check PTE for each page */
	movb	_PTmap(%edx),%al
	andb	$0x07,%al			/* Pages must be VALID + USERACC + WRITABLE */
	cmpb	$0x07,%al
	je	2f

	/* simulate a trap */
	pushl	%edx
	pushl	%ecx
	shll	$IDXSHIFT,%edx
	pushl	%edx
	call	_trapwrite			/* trapwrite(addr) */
	popl	%edx
	popl	%ecx
	popl	%edx

	testl	%eax,%eax			/* if not ok, return EFAULT */
	jnz	copyout_fault

2:
	addl	$4,%edx
	decl	%ecx
	jnz	1b				/* check next page */
#endif /* I386_CPU */

	/* bcopy(%esi, %edi, %ebx) */
3:
	movl	%ebx,%ecx

#ifdef I586_CPU
	ALIGN_TEXT
slow_copyout:				/* i586_copyout's fallback entry */
#endif
	shrl	$2,%ecx
	cld
	rep
	movsl
	movb	%bl,%cl
	andb	$3,%cl
	rep
	movsb

done_copyout:				/* shared success exit (also i586_copyout's) */
	popl	%ebx
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	movl	_curpcb,%edx
	movl	%eax,PCB_ONFAULT(%edx)	/* disarm the fault handler */
	ret
727
	ALIGN_TEXT
/*
 * Fault target for copyout variants: unwind the three saved registers,
 * clear pcb_onfault and return EFAULT.
 */
copyout_fault:
	popl	%ebx
	popl	%edi
	popl	%esi
	movl	_curpcb,%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret
737
738#ifdef I586_CPU
/*
 * int i586_copyout(from_kernel, to_user, len)
 *
 * Like generic_copyout but hands copies of >= 1024 bytes to fastmove()
 * (the FPU copy loop).  Shares slow_copyout/done_copyout/copyout_fault
 * with generic_copyout, so the stack layout here must match its.
 */
ENTRY(i586_copyout)
	/*
	 * Duplicated from generic_copyout.  Could be done a bit better.
	 */
	movl	_curpcb,%eax
	movl	$copyout_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ebx
	testl	%ebx,%ebx			/* anything to do? */
	jz	done_copyout

	/*
	 * Check explicitly for non-user addresses.  If 486 write protection
	 * is being used, this check is essential because we are in kernel
	 * mode so the h/w does not provide any protection against writing
	 * kernel addresses.
	 */

	/*
	 * First, prevent address wrapping.
	 */
	movl	%edi,%eax
	addl	%ebx,%eax
	jc	copyout_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
	cmpl	$VM_MAXUSER_ADDRESS,%eax
	ja	copyout_fault

	/* bcopy(%esi, %edi, %ebx) */
3:
	movl	%ebx,%ecx
	/*
	 * End of duplicated code.
	 */

	cmpl	$1024,%ecx
	jb	slow_copyout

	pushl	%ecx
	call	_fastmove
	addl	$4,%esp
	jmp	done_copyout
790#endif /* I586_CPU */
791
/* copyin(from_user, to_kernel, len): dispatch through the per-CPU vector. */
ENTRY(copyin)
	MEXITCOUNT
	jmp	*_copyin_vector
796
/*
 * int generic_copyin(from_user, to_kernel, len)
 *
 * Copy `len' bytes from user to kernel space with pcb_onfault armed
 * (faults go to copyin_fault and return EFAULT).  Returns 0 on success.
 * Reads need no PTE check -- only the range check against user space.
 */
ENTRY(generic_copyin)
	movl	_curpcb,%eax
	movl	$copyin_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi			/* caddr_t from */
	movl	16(%esp),%edi			/* caddr_t to */
	movl	20(%esp),%ecx			/* size_t  len */

	/*
	 * make sure address is valid (no wrap, and end within user space)
	 */
	movl	%esi,%edx
	addl	%ecx,%edx
	jc	copyin_fault
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	copyin_fault

#ifdef I586_CPU
	ALIGN_TEXT
slow_copyin:				/* i586_copyin's fallback entry */
#endif
	movb	%cl,%al				/* stash len&3 in %al */
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

#if defined(I586_CPU)
	ALIGN_TEXT
done_copyin:				/* shared success exit with i586_copyin */
#endif /* I586_CPU */
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	movl	_curpcb,%edx
	movl	%eax,PCB_ONFAULT(%edx)		/* disarm the fault handler */
	ret
839
	ALIGN_TEXT
/*
 * Fault target for copyin variants: unwind the two saved registers,
 * clear pcb_onfault and return EFAULT.
 */
copyin_fault:
	popl	%edi
	popl	%esi
	movl	_curpcb,%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret
848
849#ifdef I586_CPU
/*
 * int i586_copyin(from_user, to_kernel, len)
 *
 * Like generic_copyin but hands copies of >= 1024 bytes to fastmove().
 * The extra %ebx push exists purely so the stack matches what
 * fastmove_tail_fault unwinds (see the XXX below).
 */
ENTRY(i586_copyin)
	/*
	 * Duplicated from generic_copyin.  Could be done a bit better.
	 */
	movl	_curpcb,%eax
	movl	$copyin_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi			/* caddr_t from */
	movl	16(%esp),%edi			/* caddr_t to */
	movl	20(%esp),%ecx			/* size_t  len */

	/*
	 * make sure address is valid
	 */
	movl	%esi,%edx
	addl	%ecx,%edx
	jc	copyin_fault
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	copyin_fault
	/*
	 * End of duplicated code.
	 */

	cmpl	$1024,%ecx
	jb	slow_copyin

	pushl	%ebx			/* XXX prepare for fastmove_fault */
	pushl	%ecx
	call	_fastmove
	addl	$8,%esp
	jmp	done_copyin
882#endif /* I586_CPU */
883
884#if defined(I586_CPU)
/* fastmove(src, dst, len)
	src in %esi
	dst in %edi
	len in %ecx		XXX changed to on stack for profiling
	uses %eax and %edx for tmp. storage
 */
/* XXX use ENTRY() to get profiling.  fastmove() is actually a non-entry. */
/*
 * FPU-based copy engine shared by i586_copyin/i586_copyout.  Copies
 * 64 bytes per fildq/fistpq group when both src and dst are 8-byte
 * aligned and len > 63; otherwise falls through to a rep-movs tail.
 * Any saved application FPU state is spilled to a stack temporary
 * around our use of the FPU.  The fault exits below unwind the CALLER's
 * frame (i586_copyin/i586_copyout), not just this one.
 */
ENTRY(fastmove)
	pushl	%ebp
	movl	%esp,%ebp
	subl	$PCB_SAVEFPU_SIZE+3*4,%esp	/* FPU save copy + 3 spill slots */

	movl	8(%ebp),%ecx
	cmpl	$63,%ecx
	jbe	fastmove_tail

	testl	$7,%esi	/* check if src addr is multiple of 8 */
	jnz	fastmove_tail

	testl	$7,%edi	/* check if dst addr is multiple of 8 */
	jnz	fastmove_tail

/* if (npxproc != NULL) { */
	cmpl	$0,_npxproc
	je	6f
/*    fnsave(&curpcb->pcb_savefpu); */
	movl	_curpcb,%eax
	fnsave	PCB_SAVEFPU(%eax)
/*   npxproc = NULL; */
	movl	$0,_npxproc
/* } */
6:
/* now we own the FPU. */

/*
 * The process' FP state is saved in the pcb, but if we get
 * switched, the cpu_switch() will store our FP state in the
 * pcb.  It should be possible to avoid all the copying for
 * this, e.g., by setting a flag to tell cpu_switch() to
 * save the state somewhere else.
 */
/* tmp = curpcb->pcb_savefpu; */
	movl	%ecx,-12(%ebp)			/* spill live regs around movsl */
	movl	%esi,-8(%ebp)
	movl	%edi,-4(%ebp)
	movl	%esp,%edi
	movl	_curpcb,%esi
	addl	$PCB_SAVEFPU,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	movl	-12(%ebp),%ecx
	movl	-8(%ebp),%esi
	movl	-4(%ebp),%edi
/* stop_emulating(); */
	clts
/* npxproc = curproc; */
	movl	_curproc,%eax
	movl	%eax,_npxproc
	movl	_curpcb,%eax
	movl	$fastmove_fault,PCB_ONFAULT(%eax)
4:
	movl	%ecx,-12(%ebp)			/* -12(%ebp) = bytes beyond this pass */
	cmpl	$1792,%ecx
	jbe	2f
	movl	$1792,%ecx			/* clamp each pass to 1792 bytes */
2:
	subl	%ecx,-12(%ebp)
	cmpl	$256,%ecx
	jb	5f
	movl	%ecx,-8(%ebp)
	movl	%esi,-4(%ebp)
	ALIGN_TEXT
3:
	/* touch one longword per 32-byte line to pull the source into cache */
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	movl	-8(%ebp),%ecx
	movl	-4(%ebp),%esi
5:
	ALIGN_TEXT
fastmove_loop:
	/* 64 bytes per iteration through the 8-deep FPU stack */
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$-64,%ecx
	addl	$64,%esi
	addl	$64,%edi
	cmpl	$63,%ecx
	ja	fastmove_loop
	movl	-12(%ebp),%eax			/* add back the clamped remainder */
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b

/* curpcb->pcb_savefpu = tmp; */
	movl	%ecx,-12(%ebp)
	movl	%esi,-8(%ebp)
	movl	%edi,-4(%ebp)
	movl	_curpcb,%edi
	addl	$PCB_SAVEFPU,%edi
	movl	%esp,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	movl	-12(%ebp),%ecx
	movl	-8(%ebp),%esi
	movl	-4(%ebp),%edi

/* start_emulating(); */
	smsw	%ax
	orb	$CR0_TS,%al
	lmsw	%ax
/* npxproc = NULL; */
	movl	$0,_npxproc

	ALIGN_TEXT
fastmove_tail:
	movl	_curpcb,%eax
	movl	$fastmove_tail_fault,PCB_ONFAULT(%eax)

	movb	%cl,%al				/* stash len&3 */
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

	movl	%ebp,%esp
	popl	%ebp
	ret

	ALIGN_TEXT
fastmove_fault:
	/* fault during the FPU loop: put the saved FPU image back first */
	movl	_curpcb,%edi
	addl	$PCB_SAVEFPU,%edi
	movl	%esp,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl

	smsw	%ax
	orb	$CR0_TS,%al
	lmsw	%ax
	movl	$0,_npxproc

fastmove_tail_fault:
	/*
	 * Unwind our frame AND the caller's: skip the return address and
	 * pushed %ecx, then pop the %ebx/%edi/%esi that i586_copyin or
	 * i586_copyout pushed, and return EFAULT on their behalf.
	 */
	movl	%ebp,%esp
	popl	%ebp
	addl	$8,%esp
	popl	%ebx
	popl	%edi
	popl	%esi
	movl	_curpcb,%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret
1070#endif /* I586_CPU */
1071
/*
 * fu{byte,sword,word} : fetch a byte (sword, word) from user memory
 *
 * Returns the fetched value, or -1 on fault/invalid address
 * (via fusufault).  pcb_onfault guards the user access.
 */
ENTRY(fuword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx			/* from */

	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address is valid */
	ja	fusufault

	movl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret
1086
/*
 * These two routines are called from the profiling code, potentially
 * at interrupt time. If they fail, that's okay, good things will
 * happen later. Fail all the time for now - until the trap code is
 * able to deal with this.
 */
ALTENTRY(suswintr)
ENTRY(fuswintr)
	movl	$-1,%eax			/* always "fail" for now */
	ret
1097
/* fusword: fetch a 16-bit word from user memory; -1 on fault. */
ENTRY(fusword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-2,%edx
	ja	fusufault

	movzwl	(%edx),%eax			/* zero-extended result */
	movl	$0,PCB_ONFAULT(%ecx)
	ret
1109
/* fubyte: fetch a byte from user memory; -1 on fault. */
ENTRY(fubyte)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-1,%edx
	ja	fusufault

	movzbl	(%edx),%eax			/* zero-extended result */
	movl	$0,PCB_ONFAULT(%ecx)
	ret
1121
	ALIGN_TEXT
/* Shared fault target for the fusu/susu family: disarm and return -1. */
fusufault:
	movl	_curpcb,%ecx
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)
	decl	%eax				/* %eax = -1 */
	ret
1129
/*
 * su{byte,sword,word}: write a byte (word, longword) to user memory
 *
 * Returns 0 on success, -1 on fault (via fusufault).  On a 386
 * (no kernel-mode write protection) the destination PTE is checked
 * for user-writability first, faking a trap via trapwrite() if not.
 */
ENTRY(suword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	2f				/* we only have to set the right segment selector */
#endif /* I486_CPU || I586_CPU || I686_CPU */

	/* XXX - page boundary crossing is still not handled */
	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl
	movb	_PTmap(%edx),%dl
	andb	$0x7,%dl			/* must be VALID + USERACC + WRITE */
	cmpb	$0x7,%dl
	je	1f

	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	movl	_curpcb,%ecx			/* restore trashed register */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

2:
	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address validity */
	ja	fusufault

	movl	8(%esp),%eax
	movl	%eax,(%edx)
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)		/* disarm; return 0 */
	ret
1174
/* susword: store a 16-bit word to user memory; 0 on success, -1 on fault. */
ENTRY(susword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	2f
#endif /* I486_CPU || I586_CPU || I686_CPU */

	/* XXX - page boundary crossing is still not handled */
	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl
	movb	_PTmap(%edx),%dl
	andb	$0x7,%dl			/* must be VALID + USERACC + WRITE */
	cmpb	$0x7,%dl
	je	1f

	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	movl	_curpcb,%ecx			/* restore trashed register */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

2:
	cmpl	$VM_MAXUSER_ADDRESS-2,%edx	/* verify address validity */
	ja	fusufault

	movw	8(%esp),%ax
	movw	%ax,(%edx)
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)		/* disarm; return 0 */
	ret
1216
/* su(i)byte: store a byte to user memory; 0 on success, -1 on fault. */
ALTENTRY(suibyte)
ENTRY(subyte)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	2f
#endif /* I486_CPU || I586_CPU || I686_CPU */

	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl
	movb	_PTmap(%edx),%dl
	andb	$0x7,%dl			/* must be VALID + USERACC + WRITE */
	cmpb	$0x7,%dl
	je	1f

	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	movl	_curpcb,%ecx			/* restore trashed register */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

2:
	cmpl	$VM_MAXUSER_ADDRESS-1,%edx	/* verify address validity */
	ja	fusufault

	movb	8(%esp),%al
	movb	%al,(%edx)
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)		/* disarm; return 0 */
	ret
1258
/*
 * copyinstr(from, to, maxlen, int *lencopied)
 *	copy a string from from to to, stop when a 0 character is reached.
 *	return ENAMETOOLONG if string is longer than maxlen, and
 *	EFAULT on protection violations. If lencopied is non-zero,
 *	return the actual length in *lencopied.
 *
 * The copied length includes the terminating NUL.  pcb_onfault guards
 * the user reads; maxlen is clamped so the loop itself cannot run past
 * the end of user space.
 */
ENTRY(copyinstr)
	pushl	%esi
	pushl	%edi
	movl	_curpcb,%ecx
	movl	$cpystrflt,PCB_ONFAULT(%ecx)

	movl	12(%esp),%esi			/* %esi = from */
	movl	16(%esp),%edi			/* %edi = to */
	movl	20(%esp),%edx			/* %edx = maxlen */

	movl	$VM_MAXUSER_ADDRESS,%eax

	/* make sure 'from' is within bounds */
	subl	%esi,%eax
	jbe	cpystrflt

	/* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */
	cmpl	%edx,%eax
	jae	1f
	movl	%eax,%edx
	movl	%eax,20(%esp)			/* clamped value used for *lencopied too */
1:
	incl	%edx
	cld

2:
	decl	%edx
	jz	3f

	lodsb
	stosb
	orb	%al,%al
	jnz	2b

	/* Success -- 0 byte reached */
	decl	%edx
	xorl	%eax,%eax
	jmp	cpystrflt_x
3:
	/* edx is zero - return ENAMETOOLONG or EFAULT */
	cmpl	$VM_MAXUSER_ADDRESS,%esi	/* ran off the end of user space? */
	jae	cpystrflt
4:
	movl	$ENAMETOOLONG,%eax
	jmp	cpystrflt_x

cpystrflt:
	movl	$EFAULT,%eax

cpystrflt_x:
	/* set *lencopied and return %eax */
	movl	_curpcb,%ecx
	movl	$0,PCB_ONFAULT(%ecx)
	movl	20(%esp),%ecx
	subl	%edx,%ecx			/* bytes actually copied */
	movl	24(%esp),%edx
	testl	%edx,%edx
	jz	1f
	movl	%ecx,(%edx)
1:
	popl	%edi
	popl	%esi
	ret
1329
1330
/*
 * copystr(from, to, maxlen, int *lencopied)
 *
 * Kernel-to-kernel string copy: like copyinstr but with no user-space
 * bounds checks or fault handler.  Returns 0 or ENAMETOOLONG; the
 * copied length (including the NUL) goes to *lencopied if non-NULL.
 */
ENTRY(copystr)
	pushl	%esi
	pushl	%edi

	movl	12(%esp),%esi			/* %esi = from */
	movl	16(%esp),%edi			/* %edi = to */
	movl	20(%esp),%edx			/* %edx = maxlen */
	incl	%edx
	cld
1:
	decl	%edx
	jz	4f
	lodsb
	stosb
	orb	%al,%al
	jnz	1b

	/* Success -- 0 byte reached */
	decl	%edx
	xorl	%eax,%eax
	jmp	6f
4:
	/* edx is zero -- return ENAMETOOLONG */
	movl	$ENAMETOOLONG,%eax

6:
	/* set *lencopied and return %eax */
	movl	20(%esp),%ecx
	subl	%edx,%ecx			/* bytes actually copied */
	movl	24(%esp),%edx
	testl	%edx,%edx
	jz	7f
	movl	%ecx,(%edx)
7:
	popl	%edi
	popl	%esi
	ret
1371
/*
 * int bcmp(const void *b1, const void *b2, size_t len)
 *
 * Returns 0 if the two regions are byte-identical, 1 otherwise.
 * Compares longword-wise, then the 0..3 leftover bytes.  (When a
 * count for `repe' is zero, the flags from the preceding shrl/andl --
 * whose result is then zero, hence ZF set -- make the jne fall
 * through, which is the correct "equal so far" answer.)
 */
ENTRY(bcmp)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi
	movl	16(%esp),%esi
	movl	20(%esp),%ecx		/* total length */
	cld				/* compare forwards */

	movl	%ecx,%edx		/* remember length for the tail */
	shrl	$2,%ecx			/* longword count */
	repe
	cmpsl
	jne	1f

	movl	%edx,%ecx
	andl	$3,%ecx			/* leftover bytes */
	repe
	cmpsb
	jne	1f

	xorl	%eax,%eax		/* equal */
	jmp	2f
1:
	movl	$1,%eax			/* differ */
2:
	popl	%esi
	popl	%edi
	ret
1398
1399
/*
 * Handling of special 386 registers and descriptor tables etc
 */
/* void lgdt(struct region_descriptor *rdp); */
/*
 * Load a new GDT, then force every segment register (including %cs,
 * via a far return) to be reloaded from it.
 */
ENTRY(lgdt)
	/* reload the descriptor table */
	movl	4(%esp),%eax
	lgdt	(%eax)

	/* flush the prefetch q */
	jmp	1f
	nop
1:
	/* reload "stale" selectors */
	movl	$KDSEL,%eax
	movl	%ax,%ds
	movl	%ax,%es
	movl	%ax,%ss

	/* reload code selector by turning return into intersegmental return */
	movl	(%esp),%eax
	pushl	%eax
#	movl	$KCSEL,4(%esp)
	movl	$8,4(%esp)		/* kernel code selector */
	lret
1425
/*
 * void lidt(struct region_descriptor *rdp);
 *
 * Load the interrupt descriptor table register.
 */
ENTRY(lidt)
	movl	4(%esp),%eax
	lidt	(%eax)
	ret
1433
/*
 * void lldt(u_short sel)
 *
 * Load the local descriptor table register from the selector argument.
 */
ENTRY(lldt)
	lldt	4(%esp)
	ret
1440
/*
 * void ltr(u_short sel)
 *
 * Load the task register from the selector argument.
 */
ENTRY(ltr)
	ltr	4(%esp)
	ret
1447
/* ssdtosd(*ssdp,*sdp) */
/*
 * Convert a machine-independent software segment descriptor (ssd) into
 * the shuffled bit layout of a hardware segment descriptor, storing the
 * two result longwords at *sdp.  Pure bit rearrangement; no memory is
 * touched other than the two arguments.
 */
ENTRY(ssdtosd)
	pushl	%ebx
	movl	8(%esp),%ecx		/* %ecx = ssdp */
	movl	8(%ecx),%ebx
	shll	$16,%ebx
	movl	(%ecx),%edx
	roll	$16,%edx
	movb	%dh,%bl
	movb	%dl,%bh
	rorl	$8,%ebx
	movl	4(%ecx),%eax
	movw	%ax,%dx
	andl	$0xf0000,%eax
	orl	%eax,%ebx
	movl	12(%esp),%ecx		/* %ecx = sdp */
	movl	%edx,(%ecx)
	movl	%ebx,4(%ecx)
	popl	%ebx
	ret
1468
/* load_cr0(cr0): write the argument into control register 0. */
ENTRY(load_cr0)
	movl	4(%esp),%eax
	movl	%eax,%cr0
	ret
1474
/* rcr0(): return the current value of control register 0. */
ENTRY(rcr0)
	movl	%cr0,%eax
	ret
1479
/* rcr3(): return the current page directory base (control register 3). */
ENTRY(rcr3)
	movl	%cr3,%eax
	ret
1484
/* void load_cr3(caddr_t cr3): load a new page directory base (flushes TLB). */
ENTRY(load_cr3)
	movl	4(%esp),%eax
	movl	%eax,%cr3
	ret
1490
1491
1492/*****************************************************************************/
1493/* setjump, longjump                                                         */
1494/*****************************************************************************/
1495
/*
 * int setjmp(jmp_buf)
 *
 * Record the callee-saved registers, %esp and the return address in
 * the 6-longword buffer, then return 0.  longjmp() resumes here
 * returning 1.
 */
ENTRY(setjmp)
	movl	4(%esp),%eax			/* %eax = jmp_buf */
	movl	(%esp),%edx			/* return address */
	movl	%edx,20(%eax)			/* save eip */
	movl	%ebx,(%eax)			/* save ebx */
	movl	%esi,12(%eax)			/* save esi */
	movl	%edi,16(%eax)			/* save edi */
	movl	%ebp,8(%eax)			/* save ebp */
	movl	%esp,4(%eax)			/* save esp */
	xorl	%eax,%eax			/* direct call returns 0 */
	ret
1507
/*
 * void longjmp(jmp_buf)
 *
 * Restore the context saved by setjmp() and resume there returning 1.
 * %esp must be restored before the saved eip is written into the new
 * return slot, so the order below matters.
 */
ENTRY(longjmp)
	movl	4(%esp),%eax
	movl	(%eax),%ebx			/* restore ebx */
	movl	4(%eax),%esp			/* restore esp */
	movl	8(%eax),%ebp			/* restore ebp */
	movl	12(%eax),%esi			/* restore esi */
	movl	16(%eax),%edi			/* restore edi */
	movl	20(%eax),%edx			/* get rta */
	movl	%edx,(%esp)			/* put in return frame */
	xorl	%eax,%eax			/* return(1); */
	incl	%eax
	ret
1520
/*
 * Here for doing BB-profiling (gcc -a).
 * We rely on the "bbset" instead, but need a dummy function.
 * Just marks the passed-in block as initialized (*arg = 1) and returns.
 */
NON_GPROF_ENTRY(__bb_init_func)
	movl	4(%esp),%eax
	movl	$1,(%eax)
	.byte	0xc3				/* avoid macro for `ret' */
1529