support.s revision 18835
/*-
 * Copyright (c) 1993 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	$Id: support.s,v 1.39 1996/09/20 16:52:09 bde Exp $
 */

#include "opt_temporary.h"			/* for I586_*_B* */

#include <machine/asmacros.h>
#include <machine/cputypes.h>
#include <machine/specialreg.h>

#include "assym.s"

#define KDSEL		0x10			/* kernel data selector */
#define IDXSHIFT	10			/* PAGE_SHIFT - 2: addr to PTE offset */

	.data
	.globl	_bcopy_vector
_bcopy_vector:
	.long	_generic_bcopy
	.globl	_bzero
_bzero:
	.long	_generic_bzero
	.globl	_ovbcopy_vector
_ovbcopy_vector:
	.long	_generic_bcopy
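/*
 * kernel_fpu_lock is a one-byte try-lock protecting in-kernel FPU use
 * by the i586 copy/zero routines below: `sarb $1' on the byte leaves
 * carry clear only while it still holds 0xfe (unlocked) and turns it
 * into 0xff (locked); the holder stores 0xfe again to unlock.
 */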
kernel_fpu_lock:
	.byte	0xfe
	.space	3

	.text

/*
 * bcopy family
 * void bzero(void *buf, u_int len)
 */

ENTRY(generic_bzero)
	pushl	%edi
	movl	8(%esp),%edi
	movl	12(%esp),%ecx
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx
	rep
	stosb
	popl	%edi
	ret
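
/*
 * C sketch of the above (illustrative only, not assembled or compiled):
 *
 *	void
 *	generic_bzero(void *buf, u_int len)
 *	{
 *		u_int *wp = buf;
 *		u_char *bp;
 *		u_int n;
 *
 *		for (n = len >> 2; n != 0; n--)
 *			*wp++ = 0;
 *		for (bp = (u_char *)wp, n = len & 3; n != 0; n--)
 *			*bp++ = 0;
 *	}
 */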

#if defined(I486_CPU)
ENTRY(i486_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx
	xorl	%eax,%eax
/*
 * do 64 byte chunks first
 *
 * XXX this is probably over-unrolled at least for DX2's
 */
2:
	cmpl	$64,%ecx
	jb	3f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	movl	%eax,16(%edx)
	movl	%eax,20(%edx)
	movl	%eax,24(%edx)
	movl	%eax,28(%edx)
	movl	%eax,32(%edx)
	movl	%eax,36(%edx)
	movl	%eax,40(%edx)
	movl	%eax,44(%edx)
	movl	%eax,48(%edx)
	movl	%eax,52(%edx)
	movl	%eax,56(%edx)
	movl	%eax,60(%edx)
	addl	$64,%edx
	subl	$64,%ecx
	jnz	2b
	ret

/*
 * do 16 byte chunks
 */
	SUPERALIGN_TEXT
3:
	cmpl	$16,%ecx
	jb	4f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	addl	$16,%edx
	subl	$16,%ecx
	jnz	3b
	ret

/*
 * do 4 byte chunks
 */
	SUPERALIGN_TEXT
4:
	cmpl	$4,%ecx
	jb	5f
	movl	%eax,(%edx)
	addl	$4,%edx
	subl	$4,%ecx
	jnz	4b
	ret

/*
 * do 1 byte chunks
 * a jump table seems to be faster than a loop or more range reductions
 *
 * XXX need a const section for non-text
 */
	.data
jtab:
	.long	do0
	.long	do1
	.long	do2
	.long	do3

	.text
	SUPERALIGN_TEXT
5:
	jmp	*jtab(,%ecx,4)

	SUPERALIGN_TEXT
do3:
	movw	%ax,(%edx)
	movb	%al,2(%edx)
	ret

	SUPERALIGN_TEXT
do2:
	movw	%ax,(%edx)
	ret

	SUPERALIGN_TEXT
do1:
	movb	%al,(%edx)
	ret

	SUPERALIGN_TEXT
do0:
	ret
#endif

#if defined(I586_CPU) || defined(I686_CPU)
ENTRY(i586_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx

	/*
	 * The FPU register method is twice as fast as the integer register
	 * method unless the target is in the L1 cache and we pre-allocate a
	 * cache line for it (then the integer register method is 4-5 times
	 * faster).  However, we never pre-allocate cache lines, since that
	 * would make the integer method 25% or more slower for the common
	 * case when the target isn't in either the L1 cache or the L2 cache.
	 * Thus we normally use the FPU register method unless the overhead
	 * would be too large.
	 */
	cmpl	$256,%ecx	/* empirical; clts, fninit, smsw cost a lot */
	jb	intreg_i586_bzero

	/*
	 * The FPU registers may belong to an application or to fastmove()
	 * or to another invocation of bcopy() or ourselves in a higher
	 * level interrupt or trap handler.  Preserving the registers is
	 * complicated since we avoid it if possible at all levels.  We
	 * want to localize the complications even when that increases them.
	 * Here the extra work involves preserving the CR0_TS bit (saved
	 * with smsw and restored with lmsw).
	 * `npxproc != NULL' is supposed to be the condition that all the
	 * FPU resources belong to an application, but npxproc and CR0_TS
	 * aren't set atomically enough for this condition to work in
	 * interrupt handlers.
	 *
	 * Case 1: FPU registers belong to the application: we must preserve
	 * the registers if we use them, so we only use the FPU register
	 * method if the target size is large enough to amortize the extra
	 * overhead for preserving them.  CR0_TS must be preserved although
	 * it is very likely to end up as set.
	 *
	 * Case 2: FPU registers belong to fastmove(): fastmove() currently
	 * makes the registers look like they belong to an application so
	 * that cpu_switch() and savectx() don't have to know about it, so
	 * this case reduces to case 1.
	 *
	 * Case 3: FPU registers belong to the kernel: don't use the FPU
	 * register method.  This case is unlikely, and supporting it would
	 * be more complicated and might take too much stack.
	 *
	 * Case 4: FPU registers don't belong to anyone: the FPU registers
	 * don't need to be preserved, so we always use the FPU register
	 * method.  CR0_TS must be preserved although it is very likely to
	 * always end up as clear.
	 */
	cmpl	$0,_npxproc
	je	i586_bz1
	cmpl	$256+184,%ecx		/* empirical; not quite 2*108 more */
	jb	intreg_i586_bzero
	sarb	$1,kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	subl	$108,%esp
	fnsave	0(%esp)
	jmp	i586_bz2

i586_bz1:
	sarb	$1,kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	fninit				/* XXX should avoid needing this */
i586_bz2:
	fldz

	/*
	 * Align to an 8 byte boundary (misalignment in the main loop would
	 * cost a factor of >= 2).  Avoid jumps (at little cost if it is
	 * already aligned) by always zeroing 8 bytes and using the part up
	 * to the _next_ alignment position.
	 */
	fstl	0(%edx)
	addl	%edx,%ecx		/* part of %ecx -= new_%edx - %edx */
	addl	$8,%edx
	andl	$~7,%edx
	subl	%edx,%ecx
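	/*
	 * Example: if %edx was 0x1003, the fstl above zeroed bytes
	 * 0x1003-0x100a, the new %edx is (0x1003 + 8) & ~7 = 0x1008,
	 * and %ecx shrinks by the 5 bytes (0x1008 - 0x1003) consumed.
	 */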

	/*
	 * Similarly align `len' to a multiple of 8.
	 */
	fstl	-8(%edx,%ecx)
	decl	%ecx
	andl	$~7,%ecx

	/*
	 * This wouldn't be any faster if it were unrolled, since the loop
	 * control instructions are much faster than the fstl and/or execute
	 * in parallel with it, so their overhead is insignificant.
	 */
fpureg_i586_bzero_loop:
	fstl	0(%edx)
	addl	$8,%edx
	subl	$8,%ecx
	cmpl	$8,%ecx
	jae	fpureg_i586_bzero_loop

	cmpl	$0,_npxproc
	je	i586_bz3
	frstor	0(%esp)
	addl	$108,%esp
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock
	ret

i586_bz3:
	fstpl	%st(0)
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock
	ret

intreg_i586_bzero:
	/*
	 * `rep stos' seems to be the best method in practice for small
	 * counts.  Fancy methods usually take too long to start up due
	 * to cache and BTB misses.
	 */
	pushl	%edi
	movl	%edx,%edi
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx
	jne	1f
	popl	%edi
	ret

1:
	rep
	stosb
	popl	%edi
	ret
#endif /* I586_CPU || I686_CPU */

/* fillw(pat, base, cnt) */
ENTRY(fillw)
	pushl	%edi
	movl	8(%esp),%eax
	movl	12(%esp),%edi
	movl	16(%esp),%ecx
	cld
	rep
	stosw
	popl	%edi
	ret

ENTRY(bcopyb)
bcopyb:
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx
	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f
	cld					/* nope, copy forwards */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards. */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	std
	rep
	movsb
	popl	%edi
	popl	%esi
	cld
	ret

ENTRY(bcopy)
	MEXITCOUNT
	jmp	*_bcopy_vector

ENTRY(ovbcopy)
	MEXITCOUNT
	jmp	*_ovbcopy_vector

/*
 * generic_bcopy(src, dst, cnt)
 *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
 */
ENTRY(generic_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* nope, copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx				/* any fractional bytes? */
	std
	rep
	movsb
	movl	20(%esp),%ecx			/* copy remainder by 32-bit words */
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret

ENTRY(i586_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	cmpl	$1024,%ecx
	jb	small_i586_bcopy

	sarb	$1,kernel_fpu_lock
	jc	small_i586_bcopy
	cmpl	$0,_npxproc
	je	i586_bc1
	smsw	%dx
	clts
	subl	$108,%esp
	fnsave	0(%esp)
	jmp	4f

i586_bc1:
	smsw	%dx
	clts
	fninit				/* XXX should avoid needing this */

	ALIGN_TEXT
4:
	pushl	%ecx
#define	DCACHE_SIZE	8192
	cmpl	$(DCACHE_SIZE-512)/2,%ecx
	jbe	2f
	movl	$(DCACHE_SIZE-512)/2,%ecx
2:
	subl	%ecx,0(%esp)
	cmpl	$256,%ecx
	jb	5f			/* XXX should prefetch if %ecx >= 32 */
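	/*
	 * Warm the cache: touch one longword in each 32-byte cache
	 * line of the chunk (the Pentium has 32-byte lines) so it is
	 * read into the L1 cache before the FPU copy loop below.
	 */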
	pushl	%esi
	pushl	%ecx
	ALIGN_TEXT
3:
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	popl	%ecx
	popl	%esi
5:
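/*
 * Copy 64 bytes per iteration through the FPU: each fildq/fistpq pair
 * moves 8 bytes via the FPU stack, which is faster than 32-bit integer
 * moves on the Pentium.  The stores run in the reverse order of the
 * loads because the FPU stack is last-in first-out.
 */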
	ALIGN_TEXT
large_i586_bcopy_loop:
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$64,%esi
	addl	$64,%edi
	subl	$64,%ecx
	cmpl	$64,%ecx
	jae	large_i586_bcopy_loop
	popl	%eax
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b

	cmpl	$0,_npxproc
	je	i586_bc2
	frstor	0(%esp)
	addl	$108,%esp
i586_bc2:
	lmsw	%dx
	movb	$0xfe,kernel_fpu_lock

/*
 * This is a duplicate of the main part of generic_bcopy.  See the comments
 * there.  Jumping into generic_bcopy would cost a whole 0-1 cycles and
 * would mess up high resolution profiling.
 */
	ALIGN_TEXT
small_i586_bcopy:
	shrl	$2,%ecx
	cld
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx
	std
	rep
	movsb
	movl	20(%esp),%ecx
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret

/*
 * Note: memcpy does not support overlapping copies
 */
ENTRY(memcpy)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi
	movl	16(%esp),%esi
	movl	20(%esp),%ecx
	movl	%edi,%eax			/* save dst: memcpy returns it */
	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%esi
	popl	%edi
	ret


/*****************************************************************************/
/* copyout and fubyte family                                                 */
/*****************************************************************************/
/*
 * Access user memory from inside the kernel.  These routines and possibly
 * the math and DOS emulators should be the only places that do this.
 *
 * We have to access the memory with the user's permissions, so use a
 * segment selector with RPL 3.  For writes to user space we additionally
 * have to check the PTE for write permission, because the 386 does not
 * check write permissions when we are executing at supervisor level
 * (CPL 0).  The 486 does check this if the WP bit is set in CR0, so we
 * can use a simpler version there.
 *
 * These routines set curpcb->onfault for the time they execute.  When a
 * protection violation occurs inside one of them, the trap handler
 * returns to *curpcb->onfault instead of to the faulting instruction.
 */
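/*
 * In outline, each routine below follows this pattern (a sketch, not
 * literal kernel C):
 *
 *	curpcb->pcb_onfault = the_fault_handler;
 *	... touch user memory; a protection fault resumes execution
 *	    at the_fault_handler instead of here ...
 *	curpcb->pcb_onfault = NULL;
 *	return (0);
 *
 * and the fault handler clears curpcb->pcb_onfault and returns EFAULT.
 */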


ENTRY(copyout)					/* copyout(from_kernel, to_user, len) */
	movl	_curpcb,%eax
	movl	$copyout_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ebx
	testl	%ebx,%ebx			/* anything to do? */
	jz	done_copyout

	/*
	 * Check explicitly for non-user addresses.  If 486 write protection
	 * is being used, this check is essential because we are in kernel
	 * mode so the h/w does not provide any protection against writing
	 * kernel addresses.
	 */

	/*
	 * First, prevent address wrapping.
	 */
	movl	%edi,%eax
	addl	%ebx,%eax
	jc	copyout_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
	cmpl	$VM_MAXUSER_ADDRESS,%eax
	ja	copyout_fault

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	3f
#endif
/*
 * We have to check each PTE for user write permission.
 * The checking may cause a page fault, so it is important to set
 * up everything for return via copyout_fault before here.
 */
	/* compute number of pages */
	movl	%edi,%ecx
	andl	$PAGE_MASK,%ecx
	addl	%ebx,%ecx
	decl	%ecx
	shrl	$IDXSHIFT+2,%ecx
	incl	%ecx

	/* compute PTE offset for start address */
	movl	%edi,%edx
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl
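	/*
	 * _PTmap is the linear array of 4-byte PTEs, so a page's PTE
	 * lives at byte offset (addr >> PAGE_SHIFT) * 4, i.e.,
	 * addr >> IDXSHIFT with the low two bits cleared by the
	 * `andb $0xfc'.  Likewise IDXSHIFT + 2 == PAGE_SHIFT in the
	 * page count computed above.
	 */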

1:	/* check PTE for each page */
	movb	_PTmap(%edx),%al
	andb	$0x07,%al			/* Pages must be VALID + USERACC + WRITABLE */
	cmpb	$0x07,%al
	je	2f

	/* simulate a trap */
	pushl	%edx
	pushl	%ecx
	shll	$IDXSHIFT,%edx
	pushl	%edx
	call	_trapwrite			/* trapwrite(addr) */
	popl	%edx
	popl	%ecx
	popl	%edx

	testl	%eax,%eax			/* if not ok, return EFAULT */
	jnz	copyout_fault

2:
	addl	$4,%edx
	decl	%ecx
	jnz	1b				/* check next page */
#endif /* I386_CPU */

	/* bcopy(%esi, %edi, %ebx) */
3:
	movl	%ebx,%ecx
#if defined(I586_CPU) && defined(I586_FAST_BCOPY)
	cmpl	$1024,%ecx
	jb	slow_copyout

#if defined(I386_CPU) || defined(I486_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_586,_cpu_class
	jne	slow_copyout
#endif /* I386_CPU || I486_CPU || I686_CPU */

	pushl	%ecx
	call	_fastmove
	addl	$4,%esp
	jmp	done_copyout

	ALIGN_TEXT
slow_copyout:
#endif /* I586_CPU && I586_FAST_BCOPY */
	shrl	$2,%ecx
	cld
	rep
	movsl
	movb	%bl,%cl
	andb	$3,%cl
	rep
	movsb

done_copyout:
	popl	%ebx
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	movl	_curpcb,%edx
	movl	%eax,PCB_ONFAULT(%edx)
	ret

	ALIGN_TEXT
copyout_fault:
	popl	%ebx
	popl	%edi
	popl	%esi
	movl	_curpcb,%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret

/* copyin(from_user, to_kernel, len) */
ENTRY(copyin)
	movl	_curpcb,%eax
	movl	$copyin_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi			/* caddr_t from */
	movl	16(%esp),%edi			/* caddr_t to */
	movl	20(%esp),%ecx			/* size_t  len */

	/*
	 * make sure address is valid
	 */
	movl	%esi,%edx
	addl	%ecx,%edx
	jc	copyin_fault
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	copyin_fault

#if defined(I586_CPU) && defined(I586_FAST_BCOPY)
	cmpl	$1024,%ecx
	jb	slow_copyin

#if defined(I386_CPU) || defined(I486_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_586,_cpu_class
	jne	slow_copyin
#endif /* I386_CPU || I486_CPU || I686_CPU */

	pushl	%ecx
	call	_fastmove
	addl	$4,%esp
	jmp	done_copyin

	ALIGN_TEXT
slow_copyin:
#endif /* I586_CPU && I586_FAST_BCOPY */
	movb	%cl,%al
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

#if defined(I586_CPU) && defined(I586_FAST_BCOPY)
	ALIGN_TEXT
done_copyin:
#endif /* I586_CPU && I586_FAST_BCOPY */
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	movl	_curpcb,%edx
	movl	%eax,PCB_ONFAULT(%edx)
	ret

	ALIGN_TEXT
copyin_fault:
	popl	%edi
	popl	%esi
	movl	_curpcb,%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret

#if defined(I586_CPU) && defined(I586_FAST_BCOPY)
/* fastmove(src, dst, len)
	src in %esi
	dst in %edi
	len in %ecx		XXX changed to on stack for profiling
	uses %eax and %edx for tmp. storage
 */
/* XXX use ENTRY() to get profiling.  fastmove() is actually a non-entry. */
ENTRY(fastmove)
	movl	4(%esp),%ecx
	cmpl	$63,%ecx
	jbe	fastmove_tail

	testl	$7,%esi	/* check if src addr is multiple of 8 */
	jnz	fastmove_tail

	testl	$7,%edi	/* check if dst addr is multiple of 8 */
	jnz	fastmove_tail

	pushl	%ebp
	movl	%esp,%ebp
	subl	$PCB_SAVEFPU_SIZE,%esp

/* if (npxproc != NULL) { */
	cmpl	$0,_npxproc
	je	6f
/*    fnsave(&curpcb->pcb_savefpu); */
	movl	_curpcb,%eax
	fnsave	PCB_SAVEFPU(%eax)
/*   npxproc = NULL; */
	movl	$0,_npxproc
/* } */
6:
/* now we own the FPU. */

/*
 * The process' FP state is already saved in the pcb, but if we get
 * switched while using the FPU, cpu_switch() will store our FP state
 * over it in the pcb, so stash a copy in a stack temporary.  It
 * should be possible to avoid all the copying for this, e.g., by
 * setting a flag to tell cpu_switch() to save the state somewhere
 * else.
 */
/* tmp = curpcb->pcb_savefpu; */
	pushl	%edi
	pushl	%esi
	pushl	%ecx
	leal	-PCB_SAVEFPU_SIZE(%ebp),%edi
	movl	_curpcb,%esi
	addl	$PCB_SAVEFPU,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	popl	%ecx
	popl	%esi
	popl	%edi
/* stop_emulating(); */
	clts
/* npxproc = curproc; */
	movl	_curproc,%eax
	movl	%eax,_npxproc
4:
	pushl	%ecx
	cmpl	$1792,%ecx
	jbe	2f
	movl	$1792,%ecx
2:
	subl	%ecx,0(%esp)
	cmpl	$256,%ecx
	jb	5f
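	/*
	 * As in i586_bcopy: touch one longword in each 32-byte cache
	 * line of the chunk so it is read into the L1 cache before
	 * the FPU copy loop below.
	 */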
	pushl	%esi
	pushl	%ecx
	ALIGN_TEXT
3:
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	popl	%ecx
	popl	%esi
5:
	ALIGN_TEXT
fastmove_loop:
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$-64,%ecx
	addl	$64,%esi
	addl	$64,%edi
	cmpl	$63,%ecx
	ja	fastmove_loop
	popl	%eax
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b

/* curpcb->pcb_savefpu = tmp; */
	pushl	%edi
	pushl	%esi
	pushl	%ecx
	movl	_curpcb,%edi
	addl	$PCB_SAVEFPU,%edi
	leal	-PCB_SAVEFPU_SIZE(%ebp),%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	popl	%ecx
	popl	%esi
	popl	%edi

/* start_emulating(); */
	smsw	%ax
	orb	$CR0_TS,%al
	lmsw	%ax
/* npxproc = NULL; */
	movl	$0,_npxproc
	movl	%ebp,%esp
	popl	%ebp				/* fall through to copy the tail */

	ALIGN_TEXT
fastmove_tail:
	movb	%cl,%al
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

	ret
#endif /* I586_CPU && I586_FAST_BCOPY */

/*
 * fu{byte,sword,word} : fetch a byte (sword, word) from user memory
 */
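/*
 * The bounds check below allows addresses up to VM_MAXUSER_ADDRESS - 4
 * so that all four fetched bytes lie below the end of user space;
 * fusword and fubyte check against -2 and -1 in the same way.
 */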
ENTRY(fuword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx			/* from */

	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address is valid */
	ja	fusufault

	movl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

/*
 * These two routines are called from the profiling code, potentially
 * at interrupt time.  If they fail, that's okay; good things will
 * happen later.  For now they fail all the time, until the trap code
 * is able to deal with this.
 */
ALTENTRY(suswintr)
ENTRY(fuswintr)
	movl	$-1,%eax
	ret

ENTRY(fusword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-2,%edx
	ja	fusufault

	movzwl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

ENTRY(fubyte)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-1,%edx
	ja	fusufault

	movzbl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

	ALIGN_TEXT
fusufault:
	movl	_curpcb,%ecx
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)
	decl	%eax
	ret

/*
 * su{byte,sword,word}: write a byte (word, longword) to user memory
 */
ENTRY(suword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	2f				/* we only have to set the right segment selector */
#endif /* I486_CPU || I586_CPU || I686_CPU */

	/* XXX - page boundary crossing is still not handled */
	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl
	movb	_PTmap(%edx),%dl
	andb	$0x7,%dl			/* must be VALID + USERACC + WRITE */
	cmpb	$0x7,%dl
	je	1f

	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	movl	_curpcb,%ecx			/* restore trashed register */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

2:
	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address validity */
	ja	fusufault

	movl	8(%esp),%eax
	movl	%eax,(%edx)
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)
	ret

ENTRY(susword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	2f
#endif /* I486_CPU || I586_CPU || I686_CPU */

	/* XXX - page boundary crossing is still not handled */
	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl
	movb	_PTmap(%edx),%dl
	andb	$0x7,%dl			/* must be VALID + USERACC + WRITE */
	cmpb	$0x7,%dl
	je	1f

	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	movl	_curpcb,%ecx			/* restore trashed register */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

2:
	cmpl	$VM_MAXUSER_ADDRESS-2,%edx	/* verify address validity */
	ja	fusufault

	movw	8(%esp),%ax
	movw	%ax,(%edx)
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)
	ret

ALTENTRY(suibyte)
ENTRY(subyte)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	2f
#endif /* I486_CPU || I586_CPU || I686_CPU */

	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl
	movb	_PTmap(%edx),%dl
	andb	$0x7,%dl			/* must be VALID + USERACC + WRITE */
	cmpb	$0x7,%dl
	je	1f

	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	movl	_curpcb,%ecx			/* restore trashed register */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

2:
	cmpl	$VM_MAXUSER_ADDRESS-1,%edx	/* verify address validity */
	ja	fusufault

	movb	8(%esp),%al
	movb	%al,(%edx)
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)
	ret

/*
 * copyinstr(from, to, maxlen, int *lencopied)
 *	copy a string from `from' to `to', stopping after the terminating
 *	zero byte is copied.  Return ENAMETOOLONG if the string is longer
 *	than maxlen, and EFAULT on protection violations.  If lencopied
 *	is non-NULL, return the actual length, including the terminating
 *	zero byte, in *lencopied.
 */
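/*
 * Length accounting below: %edx starts at maxlen + 1 and is
 * decremented on every pass around the copy loop, so on exit
 * maxlen - %edx is the number of bytes copied (counting the
 * terminating zero byte on success).
 */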
ENTRY(copyinstr)
	pushl	%esi
	pushl	%edi
	movl	_curpcb,%ecx
	movl	$cpystrflt,PCB_ONFAULT(%ecx)

	movl	12(%esp),%esi			/* %esi = from */
	movl	16(%esp),%edi			/* %edi = to */
	movl	20(%esp),%edx			/* %edx = maxlen */

	movl	$VM_MAXUSER_ADDRESS,%eax

	/* make sure 'from' is within bounds */
	subl	%esi,%eax
	jbe	cpystrflt

	/* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */
	cmpl	%edx,%eax
	jae	1f
	movl	%eax,%edx
	movl	%eax,20(%esp)
1:
	incl	%edx
	cld

2:
	decl	%edx
	jz	3f

	lodsb
	stosb
	orb	%al,%al
	jnz	2b

	/* Success -- 0 byte reached */
	decl	%edx
	xorl	%eax,%eax
	jmp	cpystrflt_x
3:
	/* edx is zero - return ENAMETOOLONG or EFAULT */
	cmpl	$VM_MAXUSER_ADDRESS,%esi
	jae	cpystrflt
4:
	movl	$ENAMETOOLONG,%eax
	jmp	cpystrflt_x

cpystrflt:
	movl	$EFAULT,%eax

cpystrflt_x:
	/* set *lencopied and return %eax */
	movl	_curpcb,%ecx
	movl	$0,PCB_ONFAULT(%ecx)
	movl	20(%esp),%ecx
	subl	%edx,%ecx
	movl	24(%esp),%edx
	testl	%edx,%edx
	jz	1f
	movl	%ecx,(%edx)
1:
	popl	%edi
	popl	%esi
	ret


/*
 * copystr(from, to, maxlen, int *lencopied)
 */
ENTRY(copystr)
	pushl	%esi
	pushl	%edi

	movl	12(%esp),%esi			/* %esi = from */
	movl	16(%esp),%edi			/* %edi = to */
	movl	20(%esp),%edx			/* %edx = maxlen */
	incl	%edx
	cld
1:
	decl	%edx
	jz	4f
	lodsb
	stosb
	orb	%al,%al
	jnz	1b

	/* Success -- 0 byte reached */
	decl	%edx
	xorl	%eax,%eax
	jmp	6f
4:
	/* edx is zero -- return ENAMETOOLONG */
	movl	$ENAMETOOLONG,%eax

6:
	/* set *lencopied and return %eax */
	movl	20(%esp),%ecx
	subl	%edx,%ecx
	movl	24(%esp),%edx
	testl	%edx,%edx
	jz	7f
	movl	%ecx,(%edx)
7:
	popl	%edi
	popl	%esi
	ret

ENTRY(bcmp)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi
	movl	16(%esp),%esi
	movl	20(%esp),%edx
	xorl	%eax,%eax

	movl	%edx,%ecx
	shrl	$2,%ecx
	cld					/* compare forwards */
	repe
	cmpsl
	jne	1f

	movl	%edx,%ecx
	andl	$3,%ecx
	repe
	cmpsb
	je	2f
1:
	incl	%eax
2:
	popl	%esi
	popl	%edi
	ret


/*
 * Handling of special 386 registers and descriptor tables etc
 */
/* void lgdt(struct region_descriptor *rdp); */
ENTRY(lgdt)
	/* reload the descriptor table */
	movl	4(%esp),%eax
	lgdt	(%eax)

	/* flush the prefetch q */
	jmp	1f
	nop
1:
	/* reload "stale" selectors */
	movl	$KDSEL,%eax
	movl	%ax,%ds
	movl	%ax,%es
	movl	%ax,%ss

	/* reload code selector by turning return into intersegmental return */
	movl	(%esp),%eax
	pushl	%eax
#	movl	$KCSEL,4(%esp)
	movl	$8,4(%esp)
	lret

/*
 * void lidt(struct region_descriptor *rdp);
 */
ENTRY(lidt)
	movl	4(%esp),%eax
	lidt	(%eax)
	ret

/*
 * void lldt(u_short sel)
 */
ENTRY(lldt)
	lldt	4(%esp)
	ret

/*
 * void ltr(u_short sel)
 */
ENTRY(ltr)
	ltr	4(%esp)
	ret

/* ssdtosd(*ssdp,*sdp) */
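/*
 * Pack the machine-independent software segment descriptor at *ssdp
 * into the hardware segment descriptor format at *sdp, shuffling the
 * base, limit and attribute fields into the bit positions the CPU
 * expects.
 */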
ENTRY(ssdtosd)
	pushl	%ebx
	movl	8(%esp),%ecx
	movl	8(%ecx),%ebx
	shll	$16,%ebx
	movl	(%ecx),%edx
	roll	$16,%edx
	movb	%dh,%bl
	movb	%dl,%bh
	rorl	$8,%ebx
	movl	4(%ecx),%eax
	movw	%ax,%dx
	andl	$0xf0000,%eax
	orl	%eax,%ebx
	movl	12(%esp),%ecx
	movl	%edx,(%ecx)
	movl	%ebx,4(%ecx)
	popl	%ebx
	ret

/* load_cr0(cr0) */
ENTRY(load_cr0)
	movl	4(%esp),%eax
	movl	%eax,%cr0
	ret

/* rcr0() */
ENTRY(rcr0)
	movl	%cr0,%eax
	ret

/* rcr3() */
ENTRY(rcr3)
	movl	%cr3,%eax
	ret

/* void load_cr3(caddr_t cr3) */
ENTRY(load_cr3)
	movl	4(%esp),%eax
	movl	%eax,%cr3
	ret


/*****************************************************************************/
/* setjump, longjump                                                         */
/*****************************************************************************/

ENTRY(setjmp)
	movl	4(%esp),%eax
	movl	%ebx,(%eax)			/* save ebx */
	movl	%esp,4(%eax)			/* save esp */
	movl	%ebp,8(%eax)			/* save ebp */
	movl	%esi,12(%eax)			/* save esi */
	movl	%edi,16(%eax)			/* save edi */
	movl	(%esp),%edx			/* get rta */
	movl	%edx,20(%eax)			/* save eip */
	xorl	%eax,%eax			/* return(0); */
	ret

ENTRY(longjmp)
	movl	4(%esp),%eax
	movl	(%eax),%ebx			/* restore ebx */
	movl	4(%eax),%esp			/* restore esp */
	movl	8(%eax),%ebp			/* restore ebp */
	movl	12(%eax),%esi			/* restore esi */
	movl	16(%eax),%edi			/* restore edi */
	movl	20(%eax),%edx			/* get rta */
	movl	%edx,(%esp)			/* put in return frame */
	xorl	%eax,%eax			/* return(1); */
	incl	%eax
	ret

/*
 * Here for doing BB-profiling (gcc -a).
 * We rely on the "bbset" instead, but need a dummy function.
 */
NON_GPROF_ENTRY(__bb_init_func)
	movl	4(%esp),%eax
	movl	$1,(%eax)
	.byte	0xc3				/* avoid macro for `ret' */
