support.s revision 120627
1/*-
2 * Copyright (c) 1993 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * $FreeBSD: head/sys/i386/i386/support.s 120627 2003-10-01 08:52:46Z jeff $
34 */
35
36#include "opt_npx.h"
37
38#include <machine/asmacros.h>
39#include <machine/cputypes.h>
40#include <machine/pmap.h>
41#include <machine/specialreg.h>
42
43#include "assym.s"
44
45#define IDXSHIFT	10
46
	.data
/*
 * Indirect dispatch vectors for the copy/zero primitives.  Each slot
 * starts out pointing at the generic routine and may be repointed at
 * boot time to a CPU-specific implementation (e.g. i586_bcopy) once
 * the CPU type is known.
 */
	.globl	bcopy_vector
bcopy_vector:
	.long	generic_bcopy
	.globl	bzero_vector
bzero_vector:
	.long	generic_bzero
	.globl	copyin_vector
copyin_vector:
	.long	generic_copyin
	.globl	copyout_vector
copyout_vector:
	.long	generic_copyout
#if defined(I586_CPU) && defined(DEV_NPX)
/*
 * Byte lock serializing kernel use of the FPU by i586_bzero/i586_bcopy.
 * 0xfe == unlocked.  Acquisition is `sarb $1,kernel_fpu_lock': the old
 * low bit shifts into CF, so CF set means the lock was already held.
 * Release is `movb $0xfe,kernel_fpu_lock'.
 */
kernel_fpu_lock:
	.byte	0xfe
	.space	3
#endif
65
	.text

/*
 * bcopy family
 * void bzero(void *buf, u_int len)
 *
 * Indirects through bzero_vector (see the dispatch table in .data) so
 * that a CPU-optimized variant can be installed at boot time.
 */

ENTRY(bzero)
	MEXITCOUNT
	jmp	*bzero_vector
76
/*
 * void generic_bzero(void *buf, u_int len)
 *
 * Zero a buffer with `rep stos': whole longwords first, then the
 * remaining 0-3 bytes.  Clobbers %eax, %ecx (caller-saved).
 */
ENTRY(generic_bzero)
	pushl	%edi
	movl	8(%esp),%edi			/* buf */
	movl	12(%esp),%ecx			/* len */
	xorl	%eax,%eax			/* store pattern = 0 */
	shrl	$2,%ecx				/* longword count */
	cld
	rep
	stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx				/* residual byte count */
	rep
	stosb
	popl	%edi
	ret
92
#ifdef I486_CPU
/*
 * void i486_bzero(void *buf, u_int len)
 *
 * 486-tuned bzero using unrolled integer stores instead of `rep stos'
 * (which has a high startup cost on the 486).  Register roles:
 *   %edx = current store position, %ecx = bytes remaining, %eax = 0.
 * Reduces the count in 64-, 16- and 4-byte chunks, then dispatches the
 * final 0-3 bytes through a jump table.
 */
ENTRY(i486_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx
	xorl	%eax,%eax
/*
 * do 64 byte chunks first
 *
 * XXX this is probably over-unrolled at least for DX2's
 */
2:
	cmpl	$64,%ecx
	jb	3f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	movl	%eax,16(%edx)
	movl	%eax,20(%edx)
	movl	%eax,24(%edx)
	movl	%eax,28(%edx)
	movl	%eax,32(%edx)
	movl	%eax,36(%edx)
	movl	%eax,40(%edx)
	movl	%eax,44(%edx)
	movl	%eax,48(%edx)
	movl	%eax,52(%edx)
	movl	%eax,56(%edx)
	movl	%eax,60(%edx)
	addl	$64,%edx
	subl	$64,%ecx
	jnz	2b				/* more left; retest chunk size */
	ret					/* count was an exact multiple of 64 */

/*
 * do 16 byte chunks
 */
	SUPERALIGN_TEXT
3:
	cmpl	$16,%ecx
	jb	4f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	addl	$16,%edx
	subl	$16,%ecx
	jnz	3b
	ret

/*
 * do 4 byte chunks
 */
	SUPERALIGN_TEXT
4:
	cmpl	$4,%ecx
	jb	5f
	movl	%eax,(%edx)
	addl	$4,%edx
	subl	$4,%ecx
	jnz	4b
	ret

/*
 * do 1 byte chunks
 * a jump table seems to be faster than a loop or more range reductions
 *
 * XXX need a const section for non-text
 */
	.data
jtab:					/* indexed by remaining count (0-3) */
	.long	do0
	.long	do1
	.long	do2
	.long	do3

	.text
	SUPERALIGN_TEXT
5:
	jmp	*jtab(,%ecx,4)		/* %ecx is 0-3 here */

	SUPERALIGN_TEXT
do3:
	movw	%ax,(%edx)
	movb	%al,2(%edx)
	ret

	SUPERALIGN_TEXT
do2:
	movw	%ax,(%edx)
	ret

	SUPERALIGN_TEXT
do1:
	movb	%al,(%edx)
	ret

	SUPERALIGN_TEXT
do0:
	ret
#endif
194
#if defined(I586_CPU) && defined(DEV_NPX)
/*
 * void i586_bzero(void *buf, u_int len)
 *
 * Pentium-optimized bzero.  Large buffers are zeroed 8 bytes at a time
 * with FPU stores (fstl of 0.0); small buffers fall back to `rep stos'
 * (intreg_i586_bzero).  %edx = buf, %ecx = len throughout.
 */
ENTRY(i586_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx

	/*
	 * The FPU register method is twice as fast as the integer register
	 * method unless the target is in the L1 cache and we pre-allocate a
	 * cache line for it (then the integer register method is 4-5 times
	 * faster).  However, we never pre-allocate cache lines, since that
	 * would make the integer method 25% or more slower for the common
	 * case when the target isn't in either the L1 cache or the L2 cache.
	 * Thus we normally use the FPU register method unless the overhead
	 * would be too large.
	 */
	cmpl	$256,%ecx	/* empirical; clts, fninit, smsw cost a lot */
	jb	intreg_i586_bzero

	/*
	 * The FPU registers may belong to an application or to fastmove()
	 * or to another invocation of bcopy() or ourself in a higher level
	 * interrupt or trap handler.  Preserving the registers is
	 * complicated since we avoid it if possible at all levels.  We
	 * want to localize the complications even when that increases them.
	 * Here the extra work involves preserving CR0_TS in TS.
	 * `fpcurthread != NULL' is supposed to be the condition that all the
	 * FPU resources belong to an application, but fpcurthread and CR0_TS
	 * aren't set atomically enough for this condition to work in
	 * interrupt handlers.
	 *
	 * Case 1: FPU registers belong to the application: we must preserve
	 * the registers if we use them, so we only use the FPU register
	 * method if the target size is large enough to amortize the extra
	 * overhead for preserving them.  CR0_TS must be preserved although
	 * it is very likely to end up as set.
	 *
	 * Case 2: FPU registers belong to fastmove(): fastmove() currently
	 * makes the registers look like they belong to an application so
	 * that cpu_switch() and savectx() don't have to know about it, so
	 * this case reduces to case 1.
	 *
	 * Case 3: FPU registers belong to the kernel: don't use the FPU
	 * register method.  This case is unlikely, and supporting it would
	 * be more complicated and might take too much stack.
	 *
	 * Case 4: FPU registers don't belong to anyone: the FPU registers
	 * don't need to be preserved, so we always use the FPU register
	 * method.  CR0_TS must be preserved although it is very likely to
	 * always end up as clear.
	 */
	cmpl	$0,PCPU(FPCURTHREAD)
	je	i586_bz1

	/*
	 * XXX don't use the FPU for cases 1 and 2, since preemptive
	 * scheduling of ithreads broke these cases.  Note that we can
	 * no longer get here from an interrupt handler, since the
	 * context switch to the interrupt handler will have saved the
	 * FPU state.
	 */
	jmp	intreg_i586_bzero

	/*
	 * NOTE(review): unreachable — this is the disabled save/restore
	 * path for cases 1-2, left behind by the unconditional jmp above.
	 */
	cmpl	$256+184,%ecx		/* empirical; not quite 2*108 more */
	jb	intreg_i586_bzero
	sarb	$1,kernel_fpu_lock	/* try-lock; CF set -> already held */
	jc	intreg_i586_bzero
	smsw	%ax			/* save CR0_TS (low word of CR0) */
	clts
	subl	$108,%esp		/* 108 = FPU save-area size */
	fnsave	0(%esp)
	jmp	i586_bz2

i586_bz1:
	sarb	$1,kernel_fpu_lock	/* try-lock; CF set -> already held */
	jc	intreg_i586_bzero
	smsw	%ax			/* save CR0_TS for restore via lmsw */
	clts
	fninit				/* XXX should avoid needing this */
i586_bz2:
	fldz				/* st(0) = 0.0; stored via fstl below */

	/*
	 * Align to an 8 byte boundary (misalignment in the main loop would
	 * cost a factor of >= 2).  Avoid jumps (at little cost if it is
	 * already aligned) by always zeroing 8 bytes and using the part up
	 * to the _next_ alignment position.
	 */
	fstl	0(%edx)
	addl	%edx,%ecx		/* part of %ecx -= new_%edx - %edx */
	addl	$8,%edx
	andl	$~7,%edx
	subl	%edx,%ecx

	/*
	 * Similarly align `len' to a multiple of 8.
	 */
	fstl	-8(%edx,%ecx)		/* pre-zero the unaligned tail */
	decl	%ecx
	andl	$~7,%ecx

	/*
	 * This wouldn't be any faster if it were unrolled, since the loop
	 * control instructions are much faster than the fstl and/or done
	 * in parallel with it so their overhead is insignificant.
	 */
fpureg_i586_bzero_loop:
	fstl	0(%edx)
	addl	$8,%edx
	subl	$8,%ecx
	cmpl	$8,%ecx
	jae	fpureg_i586_bzero_loop

	cmpl	$0,PCPU(FPCURTHREAD)
	je	i586_bz3

	/* XXX check that the condition for cases 1-2 stayed false. */
i586_bzero_oops:
	int	$3			/* debugger trap: invariant violated */
	jmp	i586_bzero_oops

	/*
	 * NOTE(review): unreachable — would restore the FPU state saved
	 * by the (also disabled) cases 1-2 path above.
	 */
	frstor	0(%esp)
	addl	$108,%esp
	lmsw	%ax			/* restore CR0_TS */
	movb	$0xfe,kernel_fpu_lock	/* unlock */
	ret

i586_bz3:
	fstp	%st(0)			/* pop the 0.0 we loaded */
	lmsw	%ax			/* restore CR0_TS */
	movb	$0xfe,kernel_fpu_lock	/* unlock */
	ret

intreg_i586_bzero:
	/*
	 * `rep stos' seems to be the best method in practice for small
	 * counts.  Fancy methods usually take too long to start up due
	 * to cache and BTB misses.
	 */
	pushl	%edi
	movl	%edx,%edi
	xorl	%eax,%eax
	shrl	$2,%ecx			/* longword count */
	cld
	rep
	stosl
	movl	12(%esp),%ecx		/* len again (8(%esp) + pushed %edi) */
	andl	$3,%ecx
	jne	1f
	popl	%edi
	ret

1:
	rep
	stosb
	popl	%edi
	ret
#endif /* I586_CPU && defined(DEV_NPX) */
352
/*
 * void sse2_pagezero(void *addr)
 *
 * Zero one 4096-byte page with non-temporal stores (movnti) so the
 * zeroed page does not displace useful data from the caches.  The
 * trailing sfence orders the weakly-ordered stores before return.
 */
ENTRY(sse2_pagezero)
	pushl	%ebx
	movl	8(%esp),%ecx			/* page start */
	movl	%ecx,%eax
	addl	$4096,%eax			/* page end (loop bound) */
	xor	%ebx,%ebx			/* zero pattern */
1:
	movnti	%ebx,(%ecx)
	addl	$4,%ecx
	cmpl	%ecx,%eax
	jne	1b
	sfence
	popl	%ebx
	ret
367
/*
 * void i686_pagezero(void *addr)
 *
 * Zero one page, but first scan with `repe scasl' and skip over
 * longwords that are already zero, avoiding dirtying cache lines that
 * need no writes.  %edi = cursor, %ecx = longwords remaining.
 */
ENTRY(i686_pagezero)
	pushl	%edi
	pushl	%ebx

	movl	12(%esp), %edi
	movl	$1024, %ecx			/* longwords per 4K page */
	cld

	ALIGN_TEXT
1:
	xorl	%eax, %eax
	repe
	scasl					/* skip leading zero longwords */
	jnz	2f				/* found a nonzero longword */

	popl	%ebx
	popl	%edi
	ret

	ALIGN_TEXT

2:
	incl	%ecx				/* back up over the nonzero longword */
	subl	$4, %edi

	movl	%ecx, %edx			/* %edx = longwords remaining */
	cmpl	$16, %ecx

	jge	3f				/* >= one cache line left: write it all */

	/*
	 * Fewer than 16 longwords left: write only up to the next 64-byte
	 * (cache line) boundary, then rescan.
	 */
	movl	%edi, %ebx
	andl	$0x3f, %ebx
	shrl	%ebx
	shrl	%ebx				/* byte offset -> longword offset in line */
	movl	$16, %ecx
	subl	%ebx, %ecx			/* longwords to end of this line */

3:
	subl	%ecx, %edx			/* count left after this store run */
	rep
	stosl

	movl	%edx, %ecx
	testl	%edx, %edx
	jnz	1b				/* more to do: rescan for zeros */

	popl	%ebx
	popl	%edi
	ret
417
/* fillw(pat, base, cnt) */
/*
 * Fill `cnt' 16-bit words at `base' with the pattern in the low word
 * of `pat'.  Used mainly for console/VGA text memory.
 */
ENTRY(fillw)
	pushl	%edi
	movl	8(%esp),%eax			/* pat (low 16 bits used) */
	movl	12(%esp),%edi			/* base */
	movl	16(%esp),%ecx			/* cnt (words) */
	cld
	rep
	stosw
	popl	%edi
	ret
429
/*
 * void bcopyb(const void *src, void *dst, size_t len)
 *
 * Byte-at-a-time bcopy.  Overlap is handled: when (dst - src) < len
 * (unsigned), the ranges overlap with src < dst, so the copy is done
 * backwards with DF set; DF is cleared again before returning.
 */
ENTRY(bcopyb)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx
	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f
	cld					/* nope, copy forwards */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards. */
	addl	%ecx,%esi
	decl	%edi				/* point at the last byte */
	decl	%esi
	std
	rep
	movsb
	popl	%edi
	popl	%esi
	cld					/* restore the ABI-expected DF=0 */
	ret
460
/*
 * void bcopy(const void *src, void *dst, size_t len)
 *
 * Indirects through bcopy_vector so a CPU-specific implementation can
 * be installed at boot time.
 */
ENTRY(bcopy)
	MEXITCOUNT
	jmp	*bcopy_vector
464
/*
 * generic_bcopy(src, dst, cnt)
 *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
 *
 * Longword `rep movs' copy with overlap handling: when
 * (dst - src) < cnt (unsigned) the ranges overlap with src < dst and
 * the copy is done backwards (bytes first, then longwords).
 */
ENTRY(generic_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* nope, copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards */
	addl	%ecx,%esi
	decl	%edi				/* point at the last byte */
	decl	%esi
	andl	$3,%ecx				/* any fractional bytes? */
	std
	rep
	movsb
	movl	20(%esp),%ecx			/* copy remainder by 32-bit words */
	shrl	$2,%ecx
	subl	$3,%esi				/* movsl decrements by 4; point at lw start */
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld					/* restore the ABI-expected DF=0 */
	ret
513
#if defined(I586_CPU) && defined(DEV_NPX)
/*
 * i586_bcopy(src, dst, cnt)
 *
 * Pentium-optimized bcopy: non-overlapping copies of >= 1024 bytes go
 * through the FPU (64 bytes per fildq/fistpq group); everything else
 * uses the `rep movs' path (small_i586_bcopy / label 1 for backwards).
 * %dx holds the saved CR0_TS (via smsw) while the FPU is borrowed.
 */
ENTRY(i586_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	cmpl	$1024,%ecx
	jb	small_i586_bcopy

	sarb	$1,kernel_fpu_lock		/* try-lock; CF set -> held */
	jc	small_i586_bcopy
	cmpl	$0,PCPU(FPCURTHREAD)
	je	i586_bc1

	/* XXX turn off handling of cases 1-2, as above. */
	movb	$0xfe,kernel_fpu_lock		/* unlock again */
	jmp	small_i586_bcopy

	/*
	 * NOTE(review): unreachable — disabled save path for cases 1-2,
	 * left behind by the unconditional jmp above.
	 */
	smsw	%dx
	clts
	subl	$108,%esp			/* 108 = FPU save-area size */
	fnsave	0(%esp)
	jmp	4f

i586_bc1:
	smsw	%dx				/* save CR0_TS for lmsw later */
	clts
	fninit				/* XXX should avoid needing this */

	ALIGN_TEXT
4:
	pushl	%ecx				/* outer remaining count on stack */
#define	DCACHE_SIZE	8192
	cmpl	$(DCACHE_SIZE-512)/2,%ecx	/* copy at most ~half the D-cache per pass */
	jbe	2f
	movl	$(DCACHE_SIZE-512)/2,%ecx
2:
	subl	%ecx,0(%esp)			/* outer count -= this pass */
	cmpl	$256,%ecx
	jb	5f			/* XXX should prefetch if %ecx >= 32 */
	pushl	%esi
	pushl	%ecx
	ALIGN_TEXT
3:
	/* touch loop: one load per 32-byte cache line pulls src into cache */
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	popl	%ecx
	popl	%esi
5:
	ALIGN_TEXT
large_i586_bcopy_loop:
	/* 64 bytes per iteration through the FPU stack */
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$64,%esi
	addl	$64,%edi
	subl	$64,%ecx
	cmpl	$64,%ecx
	jae	large_i586_bcopy_loop
	popl	%eax				/* outer remaining count */
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b				/* another cache-sized pass */

	cmpl	$0,PCPU(FPCURTHREAD)
	je	i586_bc2

	/* XXX check that the condition for cases 1-2 stayed false. */
i586_bcopy_oops:
	int	$3				/* debugger trap: invariant violated */
	jmp	i586_bcopy_oops

	/*
	 * NOTE(review): unreachable — would restore the FPU state saved
	 * by the (also disabled) cases 1-2 path above.
	 */
	frstor	0(%esp)
	addl	$108,%esp
i586_bc2:
	lmsw	%dx				/* restore CR0_TS */
	movb	$0xfe,kernel_fpu_lock		/* unlock */

/*
 * This is a duplicate of the main part of generic_bcopy.  See the comments
 * there.  Jumping into generic_bcopy would cost a whole 0-1 cycles and
 * would mess up high resolution profiling.
 */
	ALIGN_TEXT
small_i586_bcopy:
	shrl	$2,%ecx
	cld
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* overlapping: copy backwards */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx
	std
	rep
	movsb
	movl	20(%esp),%ecx
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret
#endif /* I586_CPU && defined(DEV_NPX) */
662
/*
 * Note: memcpy does not support overlapping copies
 */
/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * Returns dst in %eax, per the C library contract.
 */
ENTRY(memcpy)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi			/* dst */
	movl	16(%esp),%esi			/* src */
	movl	20(%esp),%ecx			/* len */
	movl	%edi,%eax			/* return value: dst */
	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%esi
	popl	%edi
	ret
684
685
686/*****************************************************************************/
687/* copyout and fubyte family                                                 */
688/*****************************************************************************/
689/*
690 * Access user memory from inside the kernel. These routines and possibly
691 * the math- and DOS emulators should be the only places that do this.
692 *
693 * We have to access the memory with user's permissions, so use a segment
694 * selector with RPL 3. For writes to user space we have to additionally
695 * check the PTE for write permission, because the 386 does not check
696 * write permissions when we are executing with EPL 0. The 486 does check
697 * this if the WP bit is set in CR0, so we can use a simpler version here.
698 *
699 * These routines set curpcb->onfault for the time they execute. When a
700 * protection violation occurs inside the functions, the trap handler
701 * returns to *curpcb->onfault instead of the function.
702 */
703
/*
 * copyout(from_kernel, to_user, len)  - MP SAFE (if not I386_CPU)
 *
 * Indirects through copyout_vector so a CPU-specific implementation
 * can be installed at boot time.
 */
ENTRY(copyout)
	MEXITCOUNT
	jmp	*copyout_vector
710
/*
 * int generic_copyout(const void *kaddr, void *uaddr, size_t len)
 *
 * Copy len bytes from kernel space to user space.  Returns 0 on
 * success or EFAULT on a bad user address.  Faults taken during the
 * copy vector to copyout_fault via curpcb->pcb_onfault.
 */
ENTRY(generic_copyout)
	movl	PCPU(CURPCB),%eax
	movl	$copyout_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ebx
	testl	%ebx,%ebx			/* anything to do? */
	jz	done_copyout

	/*
	 * Check explicitly for non-user addresses.  If 486 write protection
	 * is being used, this check is essential because we are in kernel
	 * mode so the h/w does not provide any protection against writing
	 * kernel addresses.
	 */

	/*
	 * First, prevent address wrapping.
	 */
	movl	%edi,%eax
	addl	%ebx,%eax
	jc	copyout_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
	cmpl	$VM_MAXUSER_ADDRESS,%eax
	ja	copyout_fault

#ifdef I386_CPU

/*
 * We have to check each PTE for user write permission.
 * The checking may cause a page fault, so it is important to set
 * up everything for return via copyout_fault before here.
 */
	/* compute number of pages */
	movl	%edi,%ecx
	andl	$PAGE_MASK,%ecx
	addl	%ebx,%ecx
	decl	%ecx
	shrl	$IDXSHIFT+2,%ecx
	incl	%ecx

	/* compute PTE offset for start address */
	movl	%edi,%edx
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl			/* longword-align the PTE offset */

1:
	/* check PTE for each page */
	leal	PTmap(%edx),%eax
	shrl	$IDXSHIFT,%eax
	andb	$0xfc,%al
	testb	$PG_V,PTmap(%eax)		/* PTE page must be valid */
	je	4f
	movb	PTmap(%edx),%al
	andb	$PG_V|PG_RW|PG_U,%al		/* page must be valid and user writable */
	cmpb	$PG_V|PG_RW|PG_U,%al
	je	2f

4:
	/* simulate a trap */
	pushl	%edx
	pushl	%ecx
	shll	$IDXSHIFT,%edx			/* PTE offset back to address */
	pushl	%edx
	call	trapwrite			/* trapwrite(addr) */
	popl	%edx
	popl	%ecx
	popl	%edx

	testl	%eax,%eax			/* if not ok, return EFAULT */
	jnz	copyout_fault

2:
	addl	$4,%edx				/* next PTE */
	decl	%ecx
	jnz	1b				/* check next page */
#endif /* I386_CPU */

	/* bcopy(%esi, %edi, %ebx) */
	movl	%ebx,%ecx

#if defined(I586_CPU) && defined(DEV_NPX)
	ALIGN_TEXT
slow_copyout:					/* shared with i586_copyout */
#endif
	shrl	$2,%ecx
	cld
	rep
	movsl
	movb	%bl,%cl
	andb	$3,%cl				/* residual bytes */
	rep
	movsb

done_copyout:
	popl	%ebx
	popl	%edi
	popl	%esi
	xorl	%eax,%eax			/* return 0 */
	movl	PCPU(CURPCB),%edx
	movl	%eax,PCB_ONFAULT(%edx)		/* disarm the fault handler */
	ret

	ALIGN_TEXT
copyout_fault:
	popl	%ebx
	popl	%edi
	popl	%esi
	movl	PCPU(CURPCB),%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret
831
#if defined(I586_CPU) && defined(DEV_NPX)
/*
 * int i586_copyout(const void *kaddr, void *uaddr, size_t len)
 *
 * Like generic_copyout, but hands copies of >= 1024 bytes to
 * fastmove() (FPU-based copy).  Shares done_copyout, copyout_fault
 * and slow_copyout with generic_copyout.
 */
ENTRY(i586_copyout)
	/*
	 * Duplicated from generic_copyout.  Could be done a bit better.
	 */
	movl	PCPU(CURPCB),%eax
	movl	$copyout_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ebx
	testl	%ebx,%ebx			/* anything to do? */
	jz	done_copyout

	/*
	 * Check explicitly for non-user addresses.  If 486 write protection
	 * is being used, this check is essential because we are in kernel
	 * mode so the h/w does not provide any protection against writing
	 * kernel addresses.
	 */

	/*
	 * First, prevent address wrapping.
	 */
	movl	%edi,%eax
	addl	%ebx,%eax
	jc	copyout_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
	cmpl	$VM_MAXUSER_ADDRESS,%eax
	ja	copyout_fault

	/* bcopy(%esi, %edi, %ebx) */
3:
	movl	%ebx,%ecx
	/*
	 * End of duplicated code.
	 */

	cmpl	$1024,%ecx
	jb	slow_copyout			/* small: rep movs path */

	pushl	%ecx
	call	fastmove
	addl	$4,%esp
	jmp	done_copyout
#endif /* I586_CPU && defined(DEV_NPX) */
885
/*
 * copyin(from_user, to_kernel, len) - MP SAFE
 *
 * Indirects through copyin_vector so a CPU-specific implementation
 * can be installed at boot time.
 */
ENTRY(copyin)
	MEXITCOUNT
	jmp	*copyin_vector
892
/*
 * int generic_copyin(const void *uaddr, void *kaddr, size_t len)
 *
 * Copy len bytes from user space to kernel space.  Returns 0 on
 * success or EFAULT on a bad user address.  Faults taken during the
 * copy vector to copyin_fault via curpcb->pcb_onfault.
 */
ENTRY(generic_copyin)
	movl	PCPU(CURPCB),%eax
	movl	$copyin_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi			/* caddr_t from */
	movl	16(%esp),%edi			/* caddr_t to */
	movl	20(%esp),%ecx			/* size_t  len */

	/*
	 * make sure address is valid
	 */
	movl	%esi,%edx
	addl	%ecx,%edx
	jc	copyin_fault			/* address wrap */
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	copyin_fault

#if defined(I586_CPU) && defined(DEV_NPX)
	ALIGN_TEXT
slow_copyin:					/* shared with i586_copyin */
#endif
	movb	%cl,%al				/* stash low byte of len */
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

#if defined(I586_CPU) && defined(DEV_NPX)
	ALIGN_TEXT
done_copyin:					/* shared with i586_copyin */
#endif
	popl	%edi
	popl	%esi
	xorl	%eax,%eax			/* return 0 */
	movl	PCPU(CURPCB),%edx
	movl	%eax,PCB_ONFAULT(%edx)		/* disarm the fault handler */
	ret

	ALIGN_TEXT
copyin_fault:
	popl	%edi
	popl	%esi
	movl	PCPU(CURPCB),%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret
944
#if defined(I586_CPU) && defined(DEV_NPX)
/*
 * int i586_copyin(const void *uaddr, void *kaddr, size_t len)
 *
 * Like generic_copyin, but hands copies of >= 1024 bytes to
 * fastmove() (FPU-based copy).  Shares slow_copyin, done_copyin and
 * copyin_fault with generic_copyin.
 */
ENTRY(i586_copyin)
	/*
	 * Duplicated from generic_copyin.  Could be done a bit better.
	 */
	movl	PCPU(CURPCB),%eax
	movl	$copyin_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi			/* caddr_t from */
	movl	16(%esp),%edi			/* caddr_t to */
	movl	20(%esp),%ecx			/* size_t  len */

	/*
	 * make sure address is valid
	 */
	movl	%esi,%edx
	addl	%ecx,%edx
	jc	copyin_fault			/* address wrap */
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	copyin_fault
	/*
	 * End of duplicated code.
	 */

	cmpl	$1024,%ecx
	jb	slow_copyin			/* small: rep movs path */

	pushl	%ebx			/* XXX prepare for fastmove_fault */
	pushl	%ecx
	call	fastmove
	addl	$8,%esp
	jmp	done_copyin
#endif /* I586_CPU && defined(DEV_NPX) */
979
#if defined(I586_CPU) && defined(DEV_NPX)
/* fastmove(src, dst, len)
	src in %esi
	dst in %edi
	len in %ecx		XXX changed to on stack for profiling
	uses %eax and %edx for tmp. storage
 */
/* XXX use ENTRY() to get profiling.  fastmove() is actually a non-entry. */
/*
 * FPU-based bulk copy used by i586_copyin/i586_copyout.  Requires
 * 8-byte-aligned src and dst and len > 63, otherwise falls through to
 * the rep-movs tail.  Borrows the FPU, saving any current thread's FPU
 * state into a scratch area below %ebp and restoring it afterwards.
 * Frame layout: -12(%ebp)..-4(%ebp) are scratch slots for %ecx/%esi/%edi,
 * below that is a PCB_SAVEFPU_SIZE save area.
 */
ENTRY(fastmove)
	pushl	%ebp
	movl	%esp,%ebp
	subl	$PCB_SAVEFPU_SIZE+3*4,%esp

	movl	8(%ebp),%ecx
	cmpl	$63,%ecx
	jbe	fastmove_tail

	testl	$7,%esi	/* check if src addr is multiple of 8 */
	jnz	fastmove_tail

	testl	$7,%edi	/* check if dst addr is multiple of 8 */
	jnz	fastmove_tail

	/* XXX grab FPU context atomically. */
	cli

/* if (fpcurthread != NULL) { */
	cmpl	$0,PCPU(FPCURTHREAD)
	je	6f
/*    fnsave(&curpcb->pcb_savefpu); */
	movl	PCPU(CURPCB),%eax
	fnsave	PCB_SAVEFPU(%eax)
/*   FPCURTHREAD = NULL; */
	movl	$0,PCPU(FPCURTHREAD)
/* } */
6:
/* now we own the FPU. */

/*
 * The process' FP state is saved in the pcb, but if we get
 * switched, the cpu_switch() will store our FP state in the
 * pcb.  It should be possible to avoid all the copying for
 * this, e.g., by setting a flag to tell cpu_switch() to
 * save the state somewhere else.
 */
/* tmp = curpcb->pcb_savefpu; */
	movl	%ecx,-12(%ebp)			/* spill regs around the rep movs */
	movl	%esi,-8(%ebp)
	movl	%edi,-4(%ebp)
	movl	%esp,%edi			/* copy pcb_savefpu to our frame */
	movl	PCPU(CURPCB),%esi
	addl	$PCB_SAVEFPU,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	movl	-12(%ebp),%ecx			/* reload spilled regs */
	movl	-8(%ebp),%esi
	movl	-4(%ebp),%edi
/* stop_emulating(); */
	clts
/* fpcurthread = curthread; */
	movl	PCPU(CURTHREAD),%eax
	movl	%eax,PCPU(FPCURTHREAD)
	movl	PCPU(CURPCB),%eax

	/* XXX end of atomic FPU context grab. */
	sti

	movl	$fastmove_fault,PCB_ONFAULT(%eax)
4:
	movl	%ecx,-12(%ebp)			/* outer remaining count */
	cmpl	$1792,%ecx			/* cap one pass at 1792 bytes */
	jbe	2f
	movl	$1792,%ecx
2:
	subl	%ecx,-12(%ebp)			/* outer count -= this pass */
	cmpl	$256,%ecx
	jb	5f
	movl	%ecx,-8(%ebp)
	movl	%esi,-4(%ebp)
	ALIGN_TEXT
3:
	/* touch loop: one load per 32-byte cache line pulls src into cache */
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	movl	-8(%ebp),%ecx
	movl	-4(%ebp),%esi
5:
	ALIGN_TEXT
fastmove_loop:
	/* 64 bytes per iteration through the FPU stack */
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$-64,%ecx
	addl	$64,%esi
	addl	$64,%edi
	cmpl	$63,%ecx
	ja	fastmove_loop
	movl	-12(%ebp),%eax			/* add back the outer remainder */
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b				/* another pass */

	/* XXX ungrab FPU context atomically. */
	cli

/* curpcb->pcb_savefpu = tmp; */
	movl	%ecx,-12(%ebp)			/* spill regs around the rep movs */
	movl	%esi,-8(%ebp)
	movl	%edi,-4(%ebp)
	movl	PCPU(CURPCB),%edi
	addl	$PCB_SAVEFPU,%edi
	movl	%esp,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	movl	-12(%ebp),%ecx
	movl	-8(%ebp),%esi
	movl	-4(%ebp),%edi

/* start_emulating(); */
	smsw	%ax
	orb	$CR0_TS,%al
	lmsw	%ax
/* fpcurthread = NULL; */
	movl	$0,PCPU(FPCURTHREAD)

	/* XXX end of atomic FPU context ungrab. */
	sti

	ALIGN_TEXT
fastmove_tail:
	movl	PCPU(CURPCB),%eax
	movl	$fastmove_tail_fault,PCB_ONFAULT(%eax)

	movb	%cl,%al				/* stash low byte of len */
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

	movl	%ebp,%esp
	popl	%ebp
	ret

	ALIGN_TEXT
fastmove_fault:
	/* XXX ungrab FPU context atomically. */
	cli

	/* put pcb_savefpu back before abandoning the FPU */
	movl	PCPU(CURPCB),%edi
	addl	$PCB_SAVEFPU,%edi
	movl	%esp,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl

	smsw	%ax
	orb	$CR0_TS,%al
	lmsw	%ax
	movl	$0,PCPU(FPCURTHREAD)

	/* XXX end of atomic FPU context ungrab. */
	sti

fastmove_tail_fault:
	/*
	 * Unwind our frame, the args pushed by the i586_copyin/copyout
	 * caller, and that caller's saved registers, then return EFAULT
	 * on its behalf.  NOTE(review): the addl $8 matches i586_copyin's
	 * two pushed args (%ebx was pushed for exactly this purpose).
	 */
	movl	%ebp,%esp
	popl	%ebp
	addl	$8,%esp
	popl	%ebx
	popl	%edi
	popl	%esi
	movl	PCPU(CURPCB),%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret
#endif /* I586_CPU && defined(DEV_NPX) */
1186
/*
 * casuptr.  Compare and set user pointer.  Returns -1 or the current value.
 *
 * casuptr(intptr_t *dst, intptr_t old, intptr_t new)
 *
 * Atomically compare *dst with `old' and, if equal, store `new'.
 * Returns the value that was in *dst before the operation (== old on
 * success), or -1 via fusufault on a bad address or page fault.
 *
 * Fix: the original re-armed PCB_ONFAULT with $fusufault and then
 * immediately overwrote it with $0 — the first store was dead (no
 * faulting instruction between the two) and has been removed.
 */
ENTRY(casuptr)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx			/* dst */
	movl	8(%esp),%eax			/* old */
	movl	12(%esp),%ecx			/* new */

	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address is valid */
	ja	fusufault

#ifdef SMP
	lock
#endif
	cmpxchgl %ecx, (%edx)			/* Compare and set. */

	/*
	 * The old value is in %eax.  If the store succeeded it will be the
	 * value we expected (old) from before the store, otherwise it will
	 * be the current value.
	 */

	movl	PCPU(CURPCB),%ecx
	movl	$0,PCB_ONFAULT(%ecx)		/* disarm the fault handler */
	ret
1215
/*
 * fu{byte,sword,word} - MP SAFE
 *
 *	Fetch a byte (sword, word) from user memory
 */
/*
 * Returns the 32-bit value at the user address, or -1 (via fusufault)
 * on a bad address or fault.  Note -1 is therefore ambiguous with a
 * stored value of -1.
 */
ENTRY(fuword)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx			/* from */

	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address is valid */
	ja	fusufault

	movl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)		/* disarm the fault handler */
	ret
1232
/* fuword32 is an alias for fuword (a word is 32 bits on i386). */
ENTRY(fuword32)
	jmp	fuword
1235
/*
 * These two routines are called from the profiling code, potentially
 * at interrupt time. If they fail, that's okay, good things will
 * happen later. Fail all the time for now - until the trap code is
 * able to deal with this.
 */
/* Both stubs unconditionally return -1 (failure). */
ALTENTRY(suswintr)
ENTRY(fuswintr)
	movl	$-1,%eax
	ret
1246
/*
 * fuword16 - MP SAFE
 *
 * Fetch a 16-bit word from user memory, zero-extended; -1 on fault.
 */
ENTRY(fuword16)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-2,%edx	/* whole word must be in user space */
	ja	fusufault

	movzwl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)		/* disarm the fault handler */
	ret
1261
/*
 * fubyte - MP SAFE
 *
 * Fetch a byte from user memory, zero-extended; -1 on fault.
 */
ENTRY(fubyte)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-1,%edx	/* byte must be in user space */
	ja	fusufault

	movzbl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)		/* disarm the fault handler */
	ret
1276
	ALIGN_TEXT
/*
 * Common fault handler for the fu* and su* routines: disarm
 * pcb_onfault and return -1.
 */
fusufault:
	movl	PCPU(CURPCB),%ecx
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)
	decl	%eax				/* %eax = -1 */
	ret
1284
/*
 * su{byte,sword,word} - MP SAFE (if not I386_CPU)
 *
 *	Write a byte (word, longword) to user memory
 */
/*
 * Store a 32-bit word at a user address.  Returns 0 on success, -1
 * (via fusufault) on a bad address or fault.  On the 386 (no CR0.WP)
 * the PTE must be checked/softened via trapwrite() first.
 */
ENTRY(suword)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#ifdef I386_CPU

	/* XXX - page boundary crossing is still not handled */
	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl			/* longword-align the PTE offset */

	leal	PTmap(%edx),%ecx
	shrl	$IDXSHIFT,%ecx
	andb	$0xfc,%cl
	testb	$PG_V,PTmap(%ecx)		/* PTE page must be valid */
	je	4f
	movb	PTmap(%edx),%dl
	andb	$PG_V|PG_RW|PG_U,%dl		/* page must be valid and user writable */
	cmpb	$PG_V|PG_RW|PG_U,%dl
	je	1f

4:
	/* simulate a trap */
	pushl	%eax
	call	trapwrite
	popl	%edx				/* remove junk parameter from stack */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx			/* reload the target address */
#endif

	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address validity */
	ja	fusufault

	movl	8(%esp),%eax
	movl	%eax,(%edx)
	xorl	%eax,%eax			/* return 0 */
	movl	PCPU(CURPCB),%ecx
	movl	%eax,PCB_ONFAULT(%ecx)		/* disarm the fault handler */
	ret
1332
/* suword32 is an alias for suword (a word is 32 bits on i386). */
ENTRY(suword32)
	jmp	suword
1335
/*
 * suword16 - MP SAFE (if not I386_CPU)
 *
 * Store a 16-bit word at a user address.  Returns 0 on success, -1
 * (via fusufault) on a bad address or fault.
 */
ENTRY(suword16)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#ifdef I386_CPU

	/* XXX - page boundary crossing is still not handled */
	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl			/* longword-align the PTE offset */

	leal	PTmap(%edx),%ecx
	shrl	$IDXSHIFT,%ecx
	andb	$0xfc,%cl
	testb	$PG_V,PTmap(%ecx)		/* PTE page must be valid */
	je	4f
	movb	PTmap(%edx),%dl
	andb	$PG_V|PG_RW|PG_U,%dl		/* page must be valid and user writable */
	cmpb	$PG_V|PG_RW|PG_U,%dl
	je	1f

4:
	/* simulate a trap */
	pushl	%eax
	call	trapwrite
	popl	%edx				/* remove junk parameter from stack */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx			/* reload the target address */
#endif

	cmpl	$VM_MAXUSER_ADDRESS-2,%edx	/* verify address validity */
	ja	fusufault

	movw	8(%esp),%ax
	movw	%ax,(%edx)
	xorl	%eax,%eax			/* return 0 */
	movl	PCPU(CURPCB),%ecx		/* restore trashed register */
	movl	%eax,PCB_ONFAULT(%ecx)		/* disarm the fault handler */
	ret
1381
/*
 * subyte - MP SAFE (if not I386_CPU)
 *
 * Store a byte at a user address.  Returns 0 on success, -1 (via
 * fusufault) on a bad address or fault.
 */
ENTRY(subyte)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#ifdef I386_CPU

	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl			/* longword-align the PTE offset */

	leal	PTmap(%edx),%ecx
	shrl	$IDXSHIFT,%ecx
	andb	$0xfc,%cl
	testb	$PG_V,PTmap(%ecx)		/* PTE page must be valid */
	je	4f
	movb	PTmap(%edx),%dl
	andb	$PG_V|PG_RW|PG_U,%dl		/* page must be valid and user writable */
	cmpb	$PG_V|PG_RW|PG_U,%dl
	je	1f

4:
	/* simulate a trap */
	pushl	%eax
	call	trapwrite
	popl	%edx				/* remove junk parameter from stack */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx			/* reload the target address */
#endif

	cmpl	$VM_MAXUSER_ADDRESS-1,%edx	/* verify address validity */
	ja	fusufault

	movb	8(%esp),%al
	movb	%al,(%edx)
	xorl	%eax,%eax			/* return 0 */
	movl	PCPU(CURPCB),%ecx		/* restore trashed register */
	movl	%eax,PCB_ONFAULT(%ecx)		/* disarm the fault handler */
	ret
1426
1427/*
1428 * copyinstr(from, to, maxlen, int *lencopied) - MP SAFE
1429 *
1430 *	copy a string from from to to, stop when a 0 character is reached.
1431 *	return ENAMETOOLONG if string is longer than maxlen, and
1432 *	EFAULT on protection violations. If lencopied is non-zero,
1433 *	return the actual length in *lencopied.
1434 */
1435ENTRY(copyinstr)
1436	pushl	%esi
1437	pushl	%edi
1438	movl	PCPU(CURPCB),%ecx
1439	movl	$cpystrflt,PCB_ONFAULT(%ecx)
1440
1441	movl	12(%esp),%esi			/* %esi = from */
1442	movl	16(%esp),%edi			/* %edi = to */
1443	movl	20(%esp),%edx			/* %edx = maxlen */
1444
1445	movl	$VM_MAXUSER_ADDRESS,%eax
1446
1447	/* make sure 'from' is within bounds */
1448	subl	%esi,%eax
1449	jbe	cpystrflt
1450
1451	/* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */
1452	cmpl	%edx,%eax
1453	jae	1f
1454	movl	%eax,%edx
1455	movl	%eax,20(%esp)
14561:
1457	incl	%edx
1458	cld
1459
14602:
1461	decl	%edx
1462	jz	3f
1463
1464	lodsb
1465	stosb
1466	orb	%al,%al
1467	jnz	2b
1468
1469	/* Success -- 0 byte reached */
1470	decl	%edx
1471	xorl	%eax,%eax
1472	jmp	cpystrflt_x
14733:
1474	/* edx is zero - return ENAMETOOLONG or EFAULT */
1475	cmpl	$VM_MAXUSER_ADDRESS,%esi
1476	jae	cpystrflt
14774:
1478	movl	$ENAMETOOLONG,%eax
1479	jmp	cpystrflt_x
1480
1481cpystrflt:
1482	movl	$EFAULT,%eax
1483
1484cpystrflt_x:
1485	/* set *lencopied and return %eax */
1486	movl	PCPU(CURPCB),%ecx
1487	movl	$0,PCB_ONFAULT(%ecx)
1488	movl	20(%esp),%ecx
1489	subl	%edx,%ecx
1490	movl	24(%esp),%edx
1491	testl	%edx,%edx
1492	jz	1f
1493	movl	%ecx,(%edx)
14941:
1495	popl	%edi
1496	popl	%esi
1497	ret
1498
1499
1500/*
1501 * copystr(from, to, maxlen, int *lencopied) - MP SAFE
1502 */
1503ENTRY(copystr)
1504	pushl	%esi
1505	pushl	%edi
1506
1507	movl	12(%esp),%esi			/* %esi = from */
1508	movl	16(%esp),%edi			/* %edi = to */
1509	movl	20(%esp),%edx			/* %edx = maxlen */
1510	incl	%edx
1511	cld
15121:
1513	decl	%edx
1514	jz	4f
1515	lodsb
1516	stosb
1517	orb	%al,%al
1518	jnz	1b
1519
1520	/* Success -- 0 byte reached */
1521	decl	%edx
1522	xorl	%eax,%eax
1523	jmp	6f
15244:
1525	/* edx is zero -- return ENAMETOOLONG */
1526	movl	$ENAMETOOLONG,%eax
1527
15286:
1529	/* set *lencopied and return %eax */
1530	movl	20(%esp),%ecx
1531	subl	%edx,%ecx
1532	movl	24(%esp),%edx
1533	testl	%edx,%edx
1534	jz	7f
1535	movl	%ecx,(%edx)
15367:
1537	popl	%edi
1538	popl	%esi
1539	ret
1540
/*
 * bcmp(b1, b2, length)
 *
 *	Compare two byte strings: %eax = 0 if identical, non-zero (1)
 *	otherwise.  Compares a dword at a time, then mops up the
 *	trailing 0-3 bytes.
 */
ENTRY(bcmp)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi
	movl	16(%esp),%esi
	movl	20(%esp),%edx
	xorl	%eax,%eax			/* assume equal */

	movl	%edx,%ecx
	shrl	$2,%ecx				/* dword count */
	cld					/* compare forwards */
	repe
	cmpsl
	jne	1f

	movl	%edx,%ecx
	andl	$3,%ecx				/* leftover byte count */
	repe
	cmpsb
	je	2f
1:
	incl	%eax				/* mismatch: return non-zero */
2:
	popl	%esi
	popl	%edi
	ret
1567
1568
1569/*
1570 * Handling of special 386 registers and descriptor tables etc
1571 */
1572/* void lgdt(struct region_descriptor *rdp); */
1573ENTRY(lgdt)
1574	/* reload the descriptor table */
1575	movl	4(%esp),%eax
1576	lgdt	(%eax)
1577
1578	/* flush the prefetch q */
1579	jmp	1f
1580	nop
15811:
1582	/* reload "stale" selectors */
1583	movl	$KDSEL,%eax
1584	mov	%ax,%ds
1585	mov	%ax,%es
1586	mov	%ax,%gs
1587	mov	%ax,%ss
1588	movl	$KPSEL,%eax
1589	mov	%ax,%fs
1590
1591	/* reload code selector by turning return into intersegmental return */
1592	movl	(%esp),%eax
1593	pushl	%eax
1594	movl	$KCSEL,4(%esp)
1595	lret
1596
/*
 * ssdtosd(*ssdp, *sdp)
 *
 *	Repack the machine-independent soft segment descriptor at
 *	*ssdp (three 32-bit words) into the scrambled hardware
 *	segment-descriptor layout at *sdp (two 32-bit words).
 */
ENTRY(ssdtosd)
	pushl	%ebx
	movl	8(%esp),%ecx			/* %ecx = ssdp (args moved by push) */
	movl	8(%ecx),%ebx			/* gather flags/limit-high word */
	shll	$16,%ebx
	movl	(%ecx),%edx			/* base/limit word */
	roll	$16,%edx
	movb	%dh,%bl				/* shuffle base bytes into place */
	movb	%dl,%bh
	rorl	$8,%ebx
	movl	4(%ecx),%eax
	movw	%ax,%dx
	andl	$0xf0000,%eax			/* keep limit bits 16-19 */
	orl	%eax,%ebx
	movl	12(%esp),%ecx			/* %ecx = sdp */
	movl	%edx,(%ecx)			/* low hardware word */
	movl	%ebx,4(%ecx)			/* high hardware word */
	popl	%ebx
	ret
1617
/*
 * void reset_dbregs()
 *
 *	Clear all hardware debug registers.  %dr7 is cleared first so
 *	every breakpoint is disabled before the address registers are
 *	wiped; %dr6 is cleared last to reset latched status bits.
 */
ENTRY(reset_dbregs)
	movl    $0,%eax
	movl    %eax,%dr7     /* disable all breakpoints first */
	movl    %eax,%dr0
	movl    %eax,%dr1
	movl    %eax,%dr2
	movl    %eax,%dr3
	movl    %eax,%dr6
	ret
1628
1629/*****************************************************************************/
1630/* setjump, longjump                                                         */
1631/*****************************************************************************/
1632
/*
 * setjmp(jmp_buf)
 *
 *	Save the callee-saved registers, stack pointer and return
 *	address into the 6-word buffer at 4(%esp) so longjmp() can
 *	resume execution here.  Returns 0.
 */
ENTRY(setjmp)
	movl	4(%esp),%eax
	movl	%ebx,(%eax)			/* save ebx */
	movl	%esp,4(%eax)			/* save esp */
	movl	%ebp,8(%eax)			/* save ebp */
	movl	%esi,12(%eax)			/* save esi */
	movl	%edi,16(%eax)			/* save edi */
	movl	(%esp),%edx			/* get rta */
	movl	%edx,20(%eax)			/* save eip */
	xorl	%eax,%eax			/* return(0); */
	ret
1644
/*
 * longjmp(jmp_buf)
 *
 *	Restore the context saved by setjmp() from the buffer at
 *	4(%esp) and resume at the saved return address.  Appears to
 *	return 1 from the original setjmp() call.
 */
ENTRY(longjmp)
	movl	4(%esp),%eax
	movl	(%eax),%ebx			/* restore ebx */
	movl	4(%eax),%esp			/* restore esp */
	movl	8(%eax),%ebp			/* restore ebp */
	movl	12(%eax),%esi			/* restore esi */
	movl	16(%eax),%edi			/* restore edi */
	movl	20(%eax),%edx			/* get rta */
	movl	%edx,(%esp)			/* put in return frame */
	xorl	%eax,%eax			/* return(1); */
	incl	%eax
	ret
1657
1658/*
1659 * Support for BB-profiling (gcc -a).  The kernbb program will extract
1660 * the data from the kernel.
1661 */
1662
	.data
	ALIGN_DATA
	.globl bbhead
bbhead:
	.long 0					/* head of the bb-data list */

	.text
/*
 * Called once per compilation unit built with gcc -a: mark the
 * unit's bb-data block initialized and link it onto the bbhead
 * list for kernbb to walk.
 */
NON_GPROF_ENTRY(__bb_init_func)
	movl	4(%esp),%eax			/* %eax = bb-data block */
	movl	$1,(%eax)			/* word 0 nonzero = initialized */
	movl	bbhead,%edx
	movl	%edx,16(%eax)			/* next pointer at offset 16 */
	movl	%eax,bbhead			/* push block onto the list */
	NON_GPROF_RET
1677