support.s revision 70714
/*-
 * Copyright (c) 1993 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/i386/i386/support.s 70714 2001-01-06 17:40:04Z jake $
 */

#include "npx.h"

#include <machine/asmacros.h>
#include <machine/cputypes.h>
#include <machine/pmap.h>
#include <machine/specialreg.h>

#include "assym.s"

#define IDXSHIFT	10

	.data
	.globl	_bcopy_vector
_bcopy_vector:
	.long	_generic_bcopy
	.globl	_bzero
_bzero:
	.long	_generic_bzero
	.globl	_copyin_vector
_copyin_vector:
	.long	_generic_copyin
	.globl	_copyout_vector
_copyout_vector:
	.long	_generic_copyout
	.globl	_ovbcopy_vector
_ovbcopy_vector:
	.long	_generic_bcopy
#if defined(I586_CPU) && NNPX > 0
kernel_fpu_lock:
	.byte	0xfe
	.space	3
#endif

	.text

/*
 * bcopy family
 * void bzero(void *buf, u_int len)
 */

ENTRY(generic_bzero)
	pushl	%edi
	movl	8(%esp),%edi
	movl	12(%esp),%ecx
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx
	rep
	stosb
	popl	%edi
	ret

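/*
 * Illustrative sketch (not kernel code): what generic_bzero does, in C.
 * The first loop corresponds to the "rep stosl", the second to the
 * trailing "rep stosb":
 *
 *	void
 *	generic_bzero(void *buf, u_int len)
 *	{
 *		u_int *wp = buf;
 *		u_char *cp;
 *
 *		for (; len >= 4; len -= 4)
 *			*wp++ = 0;
 *		for (cp = (u_char *)wp; len != 0; len--)
 *			*cp++ = 0;
 *	}
 */
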
#if defined(I486_CPU)
ENTRY(i486_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx
	xorl	%eax,%eax
/*
 * do 64 byte chunks first
 *
 * XXX this is probably over-unrolled at least for DX2's
 */
2:
	cmpl	$64,%ecx
	jb	3f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	movl	%eax,16(%edx)
	movl	%eax,20(%edx)
	movl	%eax,24(%edx)
	movl	%eax,28(%edx)
	movl	%eax,32(%edx)
	movl	%eax,36(%edx)
	movl	%eax,40(%edx)
	movl	%eax,44(%edx)
	movl	%eax,48(%edx)
	movl	%eax,52(%edx)
	movl	%eax,56(%edx)
	movl	%eax,60(%edx)
	addl	$64,%edx
	subl	$64,%ecx
	jnz	2b
	ret

/*
 * do 16 byte chunks
 */
	SUPERALIGN_TEXT
3:
	cmpl	$16,%ecx
	jb	4f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	addl	$16,%edx
	subl	$16,%ecx
	jnz	3b
	ret

/*
 * do 4 byte chunks
 */
	SUPERALIGN_TEXT
4:
	cmpl	$4,%ecx
	jb	5f
	movl	%eax,(%edx)
	addl	$4,%edx
	subl	$4,%ecx
	jnz	4b
	ret

/*
 * do 1 byte chunks
 * a jump table seems to be faster than a loop or more range reductions
 *
 * XXX need a const section for non-text
 */
	.data
jtab:
	.long	do0
	.long	do1
	.long	do2
	.long	do3

	.text
	SUPERALIGN_TEXT
5:
	jmp	*jtab(,%ecx,4)

	SUPERALIGN_TEXT
do3:
	movw	%ax,(%edx)
	movb	%al,2(%edx)
	ret

	SUPERALIGN_TEXT
do2:
	movw	%ax,(%edx)
	ret

	SUPERALIGN_TEXT
do1:
	movb	%al,(%edx)
	ret

	SUPERALIGN_TEXT
do0:
	ret
#endif

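/*
 * Illustrative sketch (not kernel code): the jump table above is the
 * assembly analogue of a C switch on the 0-3 remaining bytes, where
 * each case is an independent tail handler:
 *
 *	switch (len & 3) {
 *	case 0:	break;
 *	case 1:	p[0] = 0; break;
 *	case 2:	*(u_short *)p = 0; break;
 *	case 3:	*(u_short *)p = 0; p[2] = 0; break;
 *	}
 */
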
#if defined(I586_CPU) && NNPX > 0
ENTRY(i586_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx

	/*
	 * The FPU register method is twice as fast as the integer register
	 * method unless the target is in the L1 cache and we pre-allocate a
	 * cache line for it (then the integer register method is 4-5 times
	 * faster).  However, we never pre-allocate cache lines, since that
	 * would make the integer method 25% or more slower for the common
	 * case when the target isn't in either the L1 cache or the L2 cache.
	 * Thus we normally use the FPU register method unless the overhead
	 * would be too large.
	 */
	cmpl	$256,%ecx	/* empirical; clts, fninit, smsw cost a lot */
	jb	intreg_i586_bzero

	/*
	 * The FPU registers may belong to an application or to fastmove()
	 * or to another invocation of bcopy() or ourself in a higher level
	 * interrupt or trap handler.  Preserving the registers is
	 * complicated since we avoid it if possible at all levels.  We
	 * want to localize the complications even when that increases them.
	 * Here the extra work involves preserving CR0_TS in TS.
	 * `npxproc != NULL' is supposed to be the condition that all the
	 * FPU resources belong to an application, but npxproc and CR0_TS
	 * aren't set atomically enough for this condition to work in
	 * interrupt handlers.
	 *
	 * Case 1: FPU registers belong to the application: we must preserve
	 * the registers if we use them, so we only use the FPU register
	 * method if the target size is large enough to amortize the extra
	 * overhead for preserving them.  CR0_TS must be preserved although
	 * it is very likely to end up as set.
	 *
	 * Case 2: FPU registers belong to fastmove(): fastmove() currently
	 * makes the registers look like they belong to an application so
	 * that cpu_switch() and savectx() don't have to know about it, so
	 * this case reduces to case 1.
	 *
	 * Case 3: FPU registers belong to the kernel: don't use the FPU
	 * register method.  This case is unlikely, and supporting it would
	 * be more complicated and might take too much stack.
	 *
	 * Case 4: FPU registers don't belong to anyone: the FPU registers
	 * don't need to be preserved, so we always use the FPU register
	 * method.  CR0_TS must be preserved although it is very likely to
	 * always end up as clear.
	 */
	cmpl	$0,PCPU(NPXPROC)
	je	i586_bz1
	cmpl	$256+184,%ecx		/* empirical; not quite 2*108 more */
	jb	intreg_i586_bzero
	sarb	$1,kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	subl	$108,%esp
	fnsave	0(%esp)
	jmp	i586_bz2

i586_bz1:
	sarb	$1,kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	fninit				/* XXX should avoid needing this */
i586_bz2:
	fldz

	/*
	 * Align to an 8 byte boundary (misalignment in the main loop would
	 * cost a factor of >= 2).  Avoid jumps (at little cost if it is
	 * already aligned) by always zeroing 8 bytes and using the part up
	 * to the _next_ alignment position.
	 */
	fstl	0(%edx)
	addl	%edx,%ecx		/* part of %ecx -= new_%edx - %edx */
	addl	$8,%edx
	andl	$~7,%edx
	subl	%edx,%ecx

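	/*
	 * Worked example (illustrative only): with %edx = 0x1003 and
	 * %ecx = 0x100, the fstl zeroes 0x1003..0x100a, %edx rounds up
	 * to 0x1008, and %ecx becomes 0x1003 + 0x100 - 0x1008 = 0xfb,
	 * i.e. the count is charged for the 5 bytes consumed below the
	 * new alignment position.
	 */
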
	/*
	 * Similarly align `len' to a multiple of 8.
	 */
	fstl	-8(%edx,%ecx)
	decl	%ecx
	andl	$~7,%ecx

	/*
	 * This wouldn't be any faster if it were unrolled, since the loop
	 * control instructions are much faster than the fstl and/or done
	 * in parallel with it so their overhead is insignificant.
	 */
fpureg_i586_bzero_loop:
	fstl	0(%edx)
	addl	$8,%edx
	subl	$8,%ecx
	cmpl	$8,%ecx
	jae	fpureg_i586_bzero_loop

	cmpl	$0,PCPU(NPXPROC)
	je	i586_bz3
	frstor	0(%esp)
	addl	$108,%esp
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock
	ret

i586_bz3:
	fstp	%st(0)
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock
	ret

intreg_i586_bzero:
	/*
	 * `rep stos' seems to be the best method in practice for small
	 * counts.  Fancy methods usually take too long to start up due
	 * to cache and BTB misses.
	 */
	pushl	%edi
	movl	%edx,%edi
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx
	jne	1f
	popl	%edi
	ret

1:
	rep
	stosb
	popl	%edi
	ret
#endif /* I586_CPU && NNPX > 0 */

ENTRY(i686_pagezero)
	pushl	%edi
	pushl	%ebx

	movl	12(%esp), %edi
	movl	$1024, %ecx
	cld

	ALIGN_TEXT
1:
	xorl	%eax, %eax
	repe
	scasl
	jnz	2f

	popl	%ebx
	popl	%edi
	ret

	ALIGN_TEXT

2:
	incl	%ecx
	subl	$4, %edi

	movl	%ecx, %edx
	cmpl	$16, %ecx

	jge	3f

	movl	%edi, %ebx
	andl	$0x3f, %ebx
	shrl	%ebx
	shrl	%ebx
	movl	$16, %ecx
	subl	%ebx, %ecx

3:
	subl	%ecx, %edx
	rep
	stosl

	movl	%edx, %ecx
	testl	%edx, %edx
	jnz	1b

	popl	%ebx
	popl	%edi
	ret

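/*
 * Illustrative sketch (not kernel code) of the idea behind
 * i686_pagezero: read-scan the page with "repe scasl" and only fall
 * back to stores once a nonzero word is found, so a page that is
 * already zero is verified without dirtying any cache lines.  Roughly:
 *
 *	void
 *	i686_pagezero(void *page)
 *	{
 *		u_int *p = page, *end = p + 1024;
 *
 *		while (p < end && *p == 0)
 *			p++;
 *		while (p < end)
 *			*p++ = 0;
 *	}
 */
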
/* fillw(pat, base, cnt) */
ENTRY(fillw)
	pushl	%edi
	movl	8(%esp),%eax
	movl	12(%esp),%edi
	movl	16(%esp),%ecx
	cld
	rep
	stosw
	popl	%edi
	ret

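/*
 * Illustrative sketch (not kernel code): fillw in C; "rep stosw"
 * stores the low 16 bits of the pattern cnt times:
 *
 *	void
 *	fillw(int pat, void *base, size_t cnt)
 *	{
 *		u_short *p = base;
 *
 *		while (cnt-- != 0)
 *			*p++ = pat;
 *	}
 */
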
ENTRY(bcopyb)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx
	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f
	cld					/* nope, copy forwards */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards. */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	std
	rep
	movsb
	popl	%edi
	popl	%esi
	cld
	ret

ENTRY(bcopy)
	MEXITCOUNT
	jmp	*_bcopy_vector

ENTRY(ovbcopy)
	MEXITCOUNT
	jmp	*_ovbcopy_vector

/*
 * generic_bcopy(src, dst, cnt)
 *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
 */
ENTRY(generic_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* nope, copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx				/* any fractional bytes? */
	std
	rep
	movsb
	movl	20(%esp),%ecx			/* copy remainder by 32-bit words */
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret

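/*
 * Illustrative note (not kernel code): the overlap test above relies
 * on unsigned arithmetic.  With dst - src computed as an unsigned
 * 32-bit value, a single comparison covers both conditions:
 *
 *	if ((u_int)(dst - src) < len)
 *		copy backwards;		(the regions overlap, src < dst)
 *	else
 *		copy forwards;
 *
 * When src >= dst the subtraction wraps to a huge unsigned value, so
 * the branch is not taken and the forward copy is safe.
 */
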
#if defined(I586_CPU) && NNPX > 0
ENTRY(i586_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	cmpl	$1024,%ecx
	jb	small_i586_bcopy

	sarb	$1,kernel_fpu_lock
	jc	small_i586_bcopy
	cmpl	$0,PCPU(NPXPROC)
	je	i586_bc1
	smsw	%dx
	clts
	subl	$108,%esp
	fnsave	0(%esp)
	jmp	4f

i586_bc1:
	smsw	%dx
	clts
	fninit				/* XXX should avoid needing this */

	ALIGN_TEXT
4:
	pushl	%ecx
#define	DCACHE_SIZE	8192
	cmpl	$(DCACHE_SIZE-512)/2,%ecx
	jbe	2f
	movl	$(DCACHE_SIZE-512)/2,%ecx
2:
	subl	%ecx,0(%esp)
	cmpl	$256,%ecx
	jb	5f			/* XXX should prefetch if %ecx >= 32 */
	pushl	%esi
	pushl	%ecx
	ALIGN_TEXT
3:
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	popl	%ecx
	popl	%esi
5:
	ALIGN_TEXT
large_i586_bcopy_loop:
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$64,%esi
	addl	$64,%edi
	subl	$64,%ecx
	cmpl	$64,%ecx
	jae	large_i586_bcopy_loop
	popl	%eax
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b

	cmpl	$0,PCPU(NPXPROC)
	je	i586_bc2
	frstor	0(%esp)
	addl	$108,%esp
i586_bc2:
	lmsw	%dx
	movb	$0xfe,kernel_fpu_lock

/*
 * This is a duplicate of the main part of generic_bcopy.  See the comments
 * there.  Jumping into generic_bcopy would cost a whole 0-1 cycles and
 * would mess up high resolution profiling.
 */
	ALIGN_TEXT
small_i586_bcopy:
	shrl	$2,%ecx
	cld
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx
	std
	rep
	movsb
	movl	20(%esp),%ecx
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret
#endif /* I586_CPU && NNPX > 0 */

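/*
 * Illustrative note (not kernel code): fildq/fistpq above move 8 bytes
 * per instruction through an FPU register.  A 64-bit integer fits in
 * the 64-bit significand of the extended-precision format, so any bit
 * pattern round-trips exactly.  The plain movl loop at 3: pre-reads
 * each 256-byte block so the fildq's hit the cache.
 */
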
/*
 * Note: memcpy does not support overlapping copies
 */
ENTRY(memcpy)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi
	movl	16(%esp),%esi
	movl	20(%esp),%ecx
	movl	%edi,%eax
	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%esi
	popl	%edi
	ret


/*****************************************************************************/
/* copyout and fubyte family                                                 */
/*****************************************************************************/
/*
 * Access user memory from inside the kernel. These routines and possibly
 * the math- and DOS emulators should be the only places that do this.
 *
 * We have to access the memory with user's permissions, so use a segment
 * selector with RPL 3. For writes to user space we have to additionally
 * check the PTE for write permission, because the 386 does not check
 * write permissions when we are executing with EPL 0. The 486 does check
 * this if the WP bit is set in CR0, so we can use a simpler version here.
 *
 * These routines set curpcb->onfault for the time they execute. When a
 * protection violation occurs inside the functions, the trap handler
 * returns to *curpcb->onfault instead of the function.
 */

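/*
 * Illustrative sketch (not kernel code) of the onfault protocol these
 * routines follow ("onfault" stands for curpcb->pcb_onfault):
 *
 *	curpcb->pcb_onfault = fault_handler;
 *	... touch user memory; a protection fault makes the trap
 *	    handler resume at fault_handler instead of here ...
 *	curpcb->pcb_onfault = NULL;
 *	return (0);
 *
 *	fault_handler:
 *		curpcb->pcb_onfault = NULL;
 *		return (EFAULT);
 */
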
/*
 * copyout(from_kernel, to_user, len)  - MP SAFE (if not I386_CPU)
 */
ENTRY(copyout)
	MEXITCOUNT
	jmp	*_copyout_vector

ENTRY(generic_copyout)
	movl	PCPU(CURPCB),%eax
	movl	$copyout_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ebx
	testl	%ebx,%ebx			/* anything to do? */
	jz	done_copyout

	/*
	 * Check explicitly for non-user addresses.  If 486 write protection
	 * is being used, this check is essential because we are in kernel
	 * mode so the h/w does not provide any protection against writing
	 * kernel addresses.
	 */

	/*
	 * First, prevent address wrapping.
	 */
	movl	%edi,%eax
	addl	%ebx,%eax
	jc	copyout_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
	cmpl	$VM_MAXUSER_ADDRESS,%eax
	ja	copyout_fault

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	3f
#endif
/*
 * We have to check each PTE for user write permission.
 * The checking may cause a page fault, so it is important to set
 * up everything for return via copyout_fault before here.
 */
	/* compute number of pages */
	movl	%edi,%ecx
	andl	$PAGE_MASK,%ecx
	addl	%ebx,%ecx
	decl	%ecx
	shrl	$IDXSHIFT+2,%ecx
	incl	%ecx

	/* compute PTE offset for start address */
	movl	%edi,%edx
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl

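	/*
	 * Worked example (illustrative only): with IDXSHIFT = 10,
	 * 4096-byte pages and 4-byte PTEs, the byte offset of a
	 * virtual address' PTE is (va >> 12) * 4, i.e. va >> 10 with
	 * the low two bits cleared.  For va = 0x00803000 (page 0x803)
	 * that is 0x00803000 >> 10 = 0x200c = 4 * 0x803.
	 */
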
1:
	/* check PTE for each page */
	leal	_PTmap(%edx),%eax
	shrl	$IDXSHIFT,%eax
	andb	$0xfc,%al
	testb	$PG_V,_PTmap(%eax)		/* PTE page must be valid */
	je	4f
	movb	_PTmap(%edx),%al
	andb	$PG_V|PG_RW|PG_U,%al		/* page must be valid and user writable */
	cmpb	$PG_V|PG_RW|PG_U,%al
	je	2f

4:
	/* simulate a trap */
	pushl	%edx
	pushl	%ecx
	shll	$IDXSHIFT,%edx
	pushl	%edx
	call	_trapwrite			/* trapwrite(addr) */
	popl	%edx
	popl	%ecx
	popl	%edx

	testl	%eax,%eax			/* if not ok, return EFAULT */
	jnz	copyout_fault

2:
	addl	$4,%edx
	decl	%ecx
	jnz	1b				/* check next page */
#endif /* I386_CPU */

	/* bcopy(%esi, %edi, %ebx) */
3:
	movl	%ebx,%ecx

#if defined(I586_CPU) && NNPX > 0
	ALIGN_TEXT
slow_copyout:
#endif
	shrl	$2,%ecx
	cld
	rep
	movsl
	movb	%bl,%cl
	andb	$3,%cl
	rep
	movsb

done_copyout:
	popl	%ebx
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%edx
	movl	%eax,PCB_ONFAULT(%edx)
	ret

	ALIGN_TEXT
copyout_fault:
	popl	%ebx
	popl	%edi
	popl	%esi
	movl	PCPU(CURPCB),%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret

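/*
 * Illustrative usage (not from this file; "foo_stats" and "uaddr" are
 * hypothetical): a driver copying a structure out to a user buffer:
 *
 *	struct foo_stats st;
 *	int error;
 *
 *	error = copyout(&st, uaddr, sizeof(st));
 *	if (error != 0)
 *		return (error);		(EFAULT on a bad user address)
 */
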
#if defined(I586_CPU) && NNPX > 0
ENTRY(i586_copyout)
	/*
	 * Duplicated from generic_copyout.  Could be done a bit better.
	 */
	movl	PCPU(CURPCB),%eax
	movl	$copyout_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ebx
	testl	%ebx,%ebx			/* anything to do? */
	jz	done_copyout

	/*
	 * Check explicitly for non-user addresses.  If 486 write protection
	 * is being used, this check is essential because we are in kernel
	 * mode so the h/w does not provide any protection against writing
	 * kernel addresses.
	 */

	/*
	 * First, prevent address wrapping.
	 */
	movl	%edi,%eax
	addl	%ebx,%eax
	jc	copyout_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
	cmpl	$VM_MAXUSER_ADDRESS,%eax
	ja	copyout_fault

	/* bcopy(%esi, %edi, %ebx) */
3:
	movl	%ebx,%ecx
	/*
	 * End of duplicated code.
	 */

	cmpl	$1024,%ecx
	jb	slow_copyout

	pushl	%ecx
	call	_fastmove
	addl	$4,%esp
	jmp	done_copyout
#endif /* I586_CPU && NNPX > 0 */

/*
 * copyin(from_user, to_kernel, len) - MP SAFE
 */
ENTRY(copyin)
	MEXITCOUNT
	jmp	*_copyin_vector

ENTRY(generic_copyin)
	movl	PCPU(CURPCB),%eax
	movl	$copyin_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi			/* caddr_t from */
	movl	16(%esp),%edi			/* caddr_t to */
	movl	20(%esp),%ecx			/* size_t  len */

	/*
	 * make sure address is valid
	 */
	movl	%esi,%edx
	addl	%ecx,%edx
	jc	copyin_fault
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	copyin_fault

#if defined(I586_CPU) && NNPX > 0
	ALIGN_TEXT
slow_copyin:
#endif
	movb	%cl,%al
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

#if defined(I586_CPU) && NNPX > 0
	ALIGN_TEXT
done_copyin:
#endif
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%edx
	movl	%eax,PCB_ONFAULT(%edx)
	ret

	ALIGN_TEXT
copyin_fault:
	popl	%edi
	popl	%esi
	movl	PCPU(CURPCB),%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret

#if defined(I586_CPU) && NNPX > 0
ENTRY(i586_copyin)
	/*
	 * Duplicated from generic_copyin.  Could be done a bit better.
	 */
	movl	PCPU(CURPCB),%eax
	movl	$copyin_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi			/* caddr_t from */
	movl	16(%esp),%edi			/* caddr_t to */
	movl	20(%esp),%ecx			/* size_t  len */

	/*
	 * make sure address is valid
	 */
	movl	%esi,%edx
	addl	%ecx,%edx
	jc	copyin_fault
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	copyin_fault
	/*
	 * End of duplicated code.
	 */

	cmpl	$1024,%ecx
	jb	slow_copyin

	pushl	%ebx			/* XXX prepare for fastmove_fault */
	pushl	%ecx
	call	_fastmove
	addl	$8,%esp
	jmp	done_copyin
#endif /* I586_CPU && NNPX > 0 */

#if defined(I586_CPU) && NNPX > 0
/* fastmove(src, dst, len)
	src in %esi
	dst in %edi
	len in %ecx		XXX changed to on stack for profiling
	uses %eax and %edx for tmp. storage
 */
/* XXX use ENTRY() to get profiling.  fastmove() is actually a non-entry. */
ENTRY(fastmove)
	pushl	%ebp
	movl	%esp,%ebp
	subl	$PCB_SAVEFPU_SIZE+3*4,%esp

	movl	8(%ebp),%ecx
	cmpl	$63,%ecx
	jbe	fastmove_tail

	testl	$7,%esi	/* check if src addr is multiple of 8 */
	jnz	fastmove_tail

	testl	$7,%edi	/* check if dst addr is multiple of 8 */
	jnz	fastmove_tail

/* if (npxproc != NULL) { */
	cmpl	$0,PCPU(NPXPROC)
	je	6f
/*    fnsave(&curpcb->pcb_savefpu); */
	movl	PCPU(CURPCB),%eax
	fnsave	PCB_SAVEFPU(%eax)
/*   npxproc = NULL; */
	movl	$0,PCPU(NPXPROC)
/* } */
6:
/* now we own the FPU. */

/*
 * The process' FP state is saved in the pcb, but if we get
 * switched, the cpu_switch() will store our FP state in the
 * pcb.  It should be possible to avoid all the copying for
 * this, e.g., by setting a flag to tell cpu_switch() to
 * save the state somewhere else.
 */
/* tmp = curpcb->pcb_savefpu; */
	movl	%ecx,-12(%ebp)
	movl	%esi,-8(%ebp)
	movl	%edi,-4(%ebp)
	movl	%esp,%edi
	movl	PCPU(CURPCB),%esi
	addl	$PCB_SAVEFPU,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	movl	-12(%ebp),%ecx
	movl	-8(%ebp),%esi
	movl	-4(%ebp),%edi
/* stop_emulating(); */
	clts
/* npxproc = curproc; */
	movl	PCPU(CURPROC),%eax
	movl	%eax,PCPU(NPXPROC)
	movl	PCPU(CURPCB),%eax
	movl	$fastmove_fault,PCB_ONFAULT(%eax)
4:
	movl	%ecx,-12(%ebp)
	cmpl	$1792,%ecx
	jbe	2f
	movl	$1792,%ecx
2:
	subl	%ecx,-12(%ebp)
	cmpl	$256,%ecx
	jb	5f			/* XXX should prefetch if %ecx >= 32 */
	movl	%ecx,-8(%ebp)
	movl	%esi,-4(%ebp)
	ALIGN_TEXT
3:
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	movl	-8(%ebp),%ecx
	movl	-4(%ebp),%esi
5:
	ALIGN_TEXT
fastmove_loop:
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$-64,%ecx
	addl	$64,%esi
	addl	$64,%edi
	cmpl	$63,%ecx
	ja	fastmove_loop
	movl	-12(%ebp),%eax
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b

/* curpcb->pcb_savefpu = tmp; */
	movl	%ecx,-12(%ebp)
	movl	%esi,-8(%ebp)
	movl	%edi,-4(%ebp)
	movl	PCPU(CURPCB),%edi
	addl	$PCB_SAVEFPU,%edi
	movl	%esp,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	movl	-12(%ebp),%ecx
	movl	-8(%ebp),%esi
	movl	-4(%ebp),%edi

/* start_emulating(); */
	smsw	%ax
	orb	$CR0_TS,%al
	lmsw	%ax
/* npxproc = NULL; */
	movl	$0,PCPU(NPXPROC)

	ALIGN_TEXT
fastmove_tail:
	movl	PCPU(CURPCB),%eax
	movl	$fastmove_tail_fault,PCB_ONFAULT(%eax)

	movb	%cl,%al
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

	movl	%ebp,%esp
	popl	%ebp
	ret

	ALIGN_TEXT
fastmove_fault:
	movl	PCPU(CURPCB),%edi
	addl	$PCB_SAVEFPU,%edi
	movl	%esp,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl

	smsw	%ax
	orb	$CR0_TS,%al
	lmsw	%ax
	movl	$0,PCPU(NPXPROC)

fastmove_tail_fault:
	movl	%ebp,%esp
	popl	%ebp
	addl	$8,%esp
	popl	%ebx
	popl	%edi
	popl	%esi
	movl	PCPU(CURPCB),%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret
#endif /* I586_CPU && NNPX > 0 */

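/*
 * Illustrative sketch (not kernel code) of fastmove()'s control flow:
 *
 *	fastmove(src, dst, len)
 *	{
 *		if (len <= 63 || src or dst not 8-byte aligned)
 *			goto tail;
 *		if (npxproc != NULL) {
 *			fnsave(&curpcb->pcb_savefpu);
 *			npxproc = NULL;
 *		}
 *		tmp = curpcb->pcb_savefpu;	(so cpu_switch() cannot
 *						 clobber our copy)
 *		clts();  npxproc = curproc;
 *		while (len >= 64)
 *			copy a chunk of at most 1792 bytes with
 *			    fildq/fistpq, pre-reading each 256-byte
 *			    block to warm the cache;
 *		curpcb->pcb_savefpu = tmp;
 *		set CR0_TS;  npxproc = NULL;
 *	tail:
 *		copy the remaining bytes with rep movsl/movsb;
 *	}
 */
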
/*
 * fu{byte,sword,word} - MP SAFE
 *
 *	Fetch a byte (sword, word) from user memory
 */
ENTRY(fuword)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx			/* from */

	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address is valid */
	ja	fusufault

	movl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

/*
 * These two routines are called from the profiling code, potentially
 * at interrupt time. If they fail, that's okay, good things will
 * happen later. Fail all the time for now - until the trap code is
 * able to deal with this.
 */
ALTENTRY(suswintr)
ENTRY(fuswintr)
	movl	$-1,%eax
	ret

/*
 * fusword - MP SAFE
 */
ENTRY(fusword)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-2,%edx
	ja	fusufault

	movzwl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

/*
 * fubyte - MP SAFE
 */
ENTRY(fubyte)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-1,%edx
	ja	fusufault

	movzbl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

	ALIGN_TEXT
fusufault:
	movl	PCPU(CURPCB),%ecx
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)
	decl	%eax
	ret

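/*
 * Illustrative note (not kernel code): the fu* routines return the
 * fetched value, or -1 on fault, so a stored value of -1 cannot be
 * distinguished from a failed fetch:
 *
 *	long v = fuword(uaddr);		("uaddr" is hypothetical)
 *	if (v == -1)
 *		... either the word was -1 or the fetch faulted ...
 */
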
/*
 * su{byte,sword,word} - MP SAFE (if not I386_CPU)
 *
 *	Write a byte (word, longword) to user memory
 */
ENTRY(suword)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	2f				/* we only have to set the right segment selector */
#endif /* I486_CPU || I586_CPU || I686_CPU */

	/* XXX - page boundary crossing is still not handled */
	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl

	leal	_PTmap(%edx),%ecx
	shrl	$IDXSHIFT,%ecx
	andb	$0xfc,%cl
	testb	$PG_V,_PTmap(%ecx)		/* PTE page must be valid */
	je	4f
	movb	_PTmap(%edx),%dl
	andb	$PG_V|PG_RW|PG_U,%dl		/* page must be valid and user writable */
	cmpb	$PG_V|PG_RW|PG_U,%dl
	je	1f

4:
	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

2:
	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address validity */
	ja	fusufault

	movl	8(%esp),%eax
	movl	%eax,(%edx)
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%ecx
	movl	%eax,PCB_ONFAULT(%ecx)
	ret

/*
 * susword - MP SAFE (if not I386_CPU)
 */
ENTRY(susword)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	2f
#endif /* I486_CPU || I586_CPU || I686_CPU */

	/* XXX - page boundary crossing is still not handled */
	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl

	leal	_PTmap(%edx),%ecx
	shrl	$IDXSHIFT,%ecx
	andb	$0xfc,%cl
	testb	$PG_V,_PTmap(%ecx)		/* PTE page must be valid */
	je	4f
	movb	_PTmap(%edx),%dl
	andb	$PG_V|PG_RW|PG_U,%dl		/* page must be valid and user writable */
	cmpb	$PG_V|PG_RW|PG_U,%dl
	je	1f

4:
	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

2:
	cmpl	$VM_MAXUSER_ADDRESS-2,%edx	/* verify address validity */
	ja	fusufault

	movw	8(%esp),%ax
	movw	%ax,(%edx)
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%ecx		/* restore trashed register */
	movl	%eax,PCB_ONFAULT(%ecx)
	ret

/*
 * su[i]byte - MP SAFE (if not I386_CPU)
 */
ALTENTRY(suibyte)
ENTRY(subyte)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	2f
#endif /* I486_CPU || I586_CPU || I686_CPU */

	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl

	leal	_PTmap(%edx),%ecx
	shrl	$IDXSHIFT,%ecx
	andb	$0xfc,%cl
	testb	$PG_V,_PTmap(%ecx)		/* PTE page must be valid */
	je	4f
	movb	_PTmap(%edx),%dl
	andb	$PG_V|PG_RW|PG_U,%dl		/* page must be valid and user writable */
	cmpb	$PG_V|PG_RW|PG_U,%dl
	je	1f

4:
	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

2:
	cmpl	$VM_MAXUSER_ADDRESS-1,%edx	/* verify address validity */
	ja	fusufault

	movb	8(%esp),%al
	movb	%al,(%edx)
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%ecx		/* restore trashed register */
	movl	%eax,PCB_ONFAULT(%ecx)
	ret

/*
 * copyinstr(from, to, maxlen, int *lencopied) - MP SAFE
 *
 *	copy a string from "from" to "to", stopping when a 0 character is
 *	reached.  Return ENAMETOOLONG if the string is longer than maxlen,
 *	and EFAULT on protection violations.  If lencopied is non-NULL,
 *	return the actual length in *lencopied.
 */
ENTRY(copyinstr)
	pushl	%esi
	pushl	%edi
	movl	PCPU(CURPCB),%ecx
	movl	$cpystrflt,PCB_ONFAULT(%ecx)

	movl	12(%esp),%esi			/* %esi = from */
	movl	16(%esp),%edi			/* %edi = to */
	movl	20(%esp),%edx			/* %edx = maxlen */

	movl	$VM_MAXUSER_ADDRESS,%eax

	/* make sure 'from' is within bounds */
	subl	%esi,%eax
	jbe	cpystrflt

	/* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */
	cmpl	%edx,%eax
	jae	1f
	movl	%eax,%edx
	movl	%eax,20(%esp)
1:
	incl	%edx
	cld

2:
	decl	%edx
	jz	3f

	lodsb
	stosb
	orb	%al,%al
	jnz	2b

	/* Success -- 0 byte reached */
	decl	%edx
	xorl	%eax,%eax
	jmp	cpystrflt_x
3:
	/* edx is zero - return ENAMETOOLONG or EFAULT */
	cmpl	$VM_MAXUSER_ADDRESS,%esi
	jae	cpystrflt
4:
	movl	$ENAMETOOLONG,%eax
	jmp	cpystrflt_x

cpystrflt:
	movl	$EFAULT,%eax

cpystrflt_x:
	/* set *lencopied and return %eax */
	movl	PCPU(CURPCB),%ecx
	movl	$0,PCB_ONFAULT(%ecx)
	movl	20(%esp),%ecx
	subl	%edx,%ecx
	movl	24(%esp),%edx
	testl	%edx,%edx
	jz	1f
	movl	%ecx,(%edx)
1:
	popl	%edi
	popl	%esi
	ret

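/*
 * Illustrative usage (not from this file; "upath" is a hypothetical
 * user pointer): fetching a NUL-terminated pathname from user space:
 *
 *	char path[MAXPATHLEN];
 *	int done, error;
 *
 *	error = copyinstr(upath, path, sizeof(path), &done);
 *	if (error != 0)
 *		return (error);		(EFAULT or ENAMETOOLONG)
 */
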

/*
 * copystr(from, to, maxlen, int *lencopied) - MP SAFE
 */
ENTRY(copystr)
	pushl	%esi
	pushl	%edi

	movl	12(%esp),%esi			/* %esi = from */
	movl	16(%esp),%edi			/* %edi = to */
	movl	20(%esp),%edx			/* %edx = maxlen */
	incl	%edx
	cld
1:
	decl	%edx
	jz	4f
	lodsb
	stosb
	orb	%al,%al
	jnz	1b

	/* Success -- 0 byte reached */
	decl	%edx
	xorl	%eax,%eax
	jmp	6f
4:
	/* edx is zero -- return ENAMETOOLONG */
	movl	$ENAMETOOLONG,%eax

6:
	/* set *lencopied and return %eax */
	movl	20(%esp),%ecx
	subl	%edx,%ecx
	movl	24(%esp),%edx
	testl	%edx,%edx
	jz	7f
	movl	%ecx,(%edx)
7:
	popl	%edi
	popl	%esi
	ret

ENTRY(bcmp)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi
	movl	16(%esp),%esi
	movl	20(%esp),%edx
	xorl	%eax,%eax

	movl	%edx,%ecx
	shrl	$2,%ecx
	cld					/* compare forwards */
	repe
	cmpsl
	jne	1f

	movl	%edx,%ecx
	andl	$3,%ecx
	repe
	cmpsb
	je	2f
1:
	incl	%eax
2:
	popl	%esi
	popl	%edi
	ret

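/*
 * Illustrative sketch (not kernel code): bcmp only reports equality,
 * returning 0 when the regions match and 1 otherwise; unlike memcmp
 * it does not order its operands:
 *
 *	int
 *	bcmp(const void *b1, const void *b2, size_t len)
 *	{
 *		const u_char *p1 = b1, *p2 = b2;
 *
 *		while (len-- != 0)
 *			if (*p1++ != *p2++)
 *				return (1);
 *		return (0);
 *	}
 */
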

/*
 * Handling of special 386 registers and descriptor tables etc
 */
/* void lgdt(struct region_descriptor *rdp); */
ENTRY(lgdt)
	/* reload the descriptor table */
	movl	4(%esp),%eax
	lgdt	(%eax)

	/* flush the prefetch q */
	jmp	1f
	nop
1:
	/* reload "stale" selectors */
	movl	$KDSEL,%eax
	mov	%ax,%ds
	mov	%ax,%es
	mov	%ax,%gs
	mov	%ax,%ss
	movl	$KPSEL,%eax
	mov	%ax,%fs

	/* reload code selector by turning return into intersegmental return */
	movl	(%esp),%eax
	pushl	%eax
	movl	$KCSEL,4(%esp)
	lret

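/*
 * Illustrative note (not kernel code): the lret trick above turns a
 * near return into a far return by rewriting the stack.  On entry to
 * the sequence the stack holds only the return address; after the
 * pushl and movl it holds
 *
 *	(%esp)  -> return address	(copied down)
 *	4(%esp) -> KCSEL		(overwrote the original slot)
 *
 * so the lret pops both and reloads %cs with KCSEL on the way back.
 */
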
/*
 * void lidt(struct region_descriptor *rdp);
 */
ENTRY(lidt)
	movl	4(%esp),%eax
	lidt	(%eax)
	ret

/*
 * void lldt(u_short sel)
 */
ENTRY(lldt)
	lldt	4(%esp)
	ret

/*
 * void ltr(u_short sel)
 */
ENTRY(ltr)
	ltr	4(%esp)
	ret

/* ssdtosd(*ssdp,*sdp) */
ENTRY(ssdtosd)
	pushl	%ebx
	movl	8(%esp),%ecx
	movl	8(%ecx),%ebx
	shll	$16,%ebx
	movl	(%ecx),%edx
	roll	$16,%edx
	movb	%dh,%bl
	movb	%dl,%bh
	rorl	$8,%ebx
	movl	4(%ecx),%eax
	movw	%ax,%dx
	andl	$0xf0000,%eax
	orl	%eax,%ebx
	movl	12(%esp),%ecx
	movl	%edx,(%ecx)
	movl	%ebx,4(%ecx)
	popl	%ebx
	ret

/* load_cr0(cr0) */
ENTRY(load_cr0)
	movl	4(%esp),%eax
	movl	%eax,%cr0
	ret

/* rcr0() */
ENTRY(rcr0)
	movl	%cr0,%eax
	ret

/* rcr3() */
ENTRY(rcr3)
	movl	%cr3,%eax
	ret

/* void load_cr3(caddr_t cr3) */
ENTRY(load_cr3)
#if defined(SWTCH_OPTIM_STATS)
	incl	_tlb_flush_count
#endif
	movl	4(%esp),%eax
	movl	%eax,%cr3
	ret

/* rcr4() */
ENTRY(rcr4)
	movl	%cr4,%eax
	ret

/* void load_cr4(caddr_t cr4) */
ENTRY(load_cr4)
	movl	4(%esp),%eax
	movl	%eax,%cr4
	ret

/* void load_dr6(u_int dr6) */
ENTRY(load_dr6)
	movl    4(%esp),%eax
	movl    %eax,%dr6
	ret

/* void reset_dbregs() */
ENTRY(reset_dbregs)
	movl    $0,%eax
	movl    %eax,%dr7     /* disable all breakpoints first */
	movl    %eax,%dr0
	movl    %eax,%dr1
	movl    %eax,%dr2
	movl    %eax,%dr3
	movl    %eax,%dr6
	ret

/*****************************************************************************/
/* setjump, longjump                                                         */
/*****************************************************************************/

ENTRY(setjmp)
	movl	4(%esp),%eax
	movl	%ebx,(%eax)			/* save ebx */
	movl	%esp,4(%eax)			/* save esp */
	movl	%ebp,8(%eax)			/* save ebp */
	movl	%esi,12(%eax)			/* save esi */
	movl	%edi,16(%eax)			/* save edi */
	movl	(%esp),%edx			/* get rta */
	movl	%edx,20(%eax)			/* save eip */
	xorl	%eax,%eax			/* return(0); */
	ret

ENTRY(longjmp)
	movl	4(%esp),%eax
	movl	(%eax),%ebx			/* restore ebx */
	movl	4(%eax),%esp			/* restore esp */
	movl	8(%eax),%ebp			/* restore ebp */
	movl	12(%eax),%esi			/* restore esi */
	movl	16(%eax),%edi			/* restore edi */
	movl	20(%eax),%edx			/* get rta */
	movl	%edx,(%esp)			/* put in return frame */
	xorl	%eax,%eax			/* return(1); */
	incl	%eax
	ret

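/*
 * Illustrative usage (not from this file): this in-kernel pair behaves
 * like the userland one, with longjmp always returning 1 at the
 * setjmp site:
 *
 *	static jmp_buf env;		(hypothetical)
 *
 *	if (setjmp(env) == 0) {
 *		... first pass; something may longjmp(env) later ...
 *	} else {
 *		... resumed here after the longjmp ...
 *	}
 */
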
/*
 * Support for BB-profiling (gcc -a).  The kernbb program will extract
 * the data from the kernel.
 */

	.data
	ALIGN_DATA
	.globl bbhead
bbhead:
	.long 0

	.text
NON_GPROF_ENTRY(__bb_init_func)
	movl	4(%esp),%eax
	movl	$1,(%eax)
	movl	bbhead,%edx
	movl	%edx,16(%eax)
	movl	%eax,bbhead
	NON_GPROF_RET