/*-
 * Copyright (c) 1993 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/i386/i386/support.s 71257 2001-01-19 13:19:02Z peter $
 */

#include "opt_npx.h"

#include <machine/asmacros.h>
#include <machine/cputypes.h>
#include <machine/pmap.h>
#include <machine/specialreg.h>

#include "assym.s"

#define IDXSHIFT	10

	.data
	.globl	_bcopy_vector
_bcopy_vector:
	.long	_generic_bcopy
	.globl	_bzero
_bzero:
	.long	_generic_bzero
	.globl	_copyin_vector
_copyin_vector:
	.long	_generic_copyin
	.globl	_copyout_vector
_copyout_vector:
	.long	_generic_copyout
	.globl	_ovbcopy_vector
_ovbcopy_vector:
	.long	_generic_bcopy
#if defined(I586_CPU) && defined(DEV_NPX)
kernel_fpu_lock:
	.byte	0xfe
	.space	3
#endif
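
/*
 * Explanatory notes on the data above:
 *
 * The *_vector words are function pointers; they start out aimed at the
 * generic routines and are repointed at startup to CPU-specific versions
 * (e.g. i586_bcopy) once the CPU type is known.
 *
 * kernel_fpu_lock is a one-byte guard against nested kernel FPU use.
 * "sarb $1,kernel_fpu_lock" shifts the sign bit down: the first taker
 * sees carry clear and leaves 0xff behind, so a nested attempt (e.g.
 * from an interrupt handler) sees carry set and falls back to the
 * integer code.  "movb $0xfe,kernel_fpu_lock" releases it.
 */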

	.text

/*
 * bcopy family
 * void bzero(void *buf, u_int len)
 */

ENTRY(generic_bzero)
	pushl	%edi
	movl	8(%esp),%edi
	movl	12(%esp),%ecx
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx
	rep
	stosb
	popl	%edi
	ret
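
/*
 * Illustrative C sketch of the above (the word stores run as rep stosl,
 * the byte tail as rep stosb):
 *
 *	void
 *	generic_bzero(void *buf, u_int len)
 *	{
 *		u_int *wp = buf;
 *		u_char *bp;
 *		u_int n;
 *
 *		for (n = len >> 2; n != 0; n--)
 *			*wp++ = 0;
 *		for (bp = (u_char *)wp, n = len & 3; n != 0; n--)
 *			*bp++ = 0;
 *	}
 */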

#ifdef I486_CPU
ENTRY(i486_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx
	xorl	%eax,%eax
/*
 * do 64 byte chunks first
 *
 * XXX this is probably over-unrolled at least for DX2's
 */
2:
	cmpl	$64,%ecx
	jb	3f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	movl	%eax,16(%edx)
	movl	%eax,20(%edx)
	movl	%eax,24(%edx)
	movl	%eax,28(%edx)
	movl	%eax,32(%edx)
	movl	%eax,36(%edx)
	movl	%eax,40(%edx)
	movl	%eax,44(%edx)
	movl	%eax,48(%edx)
	movl	%eax,52(%edx)
	movl	%eax,56(%edx)
	movl	%eax,60(%edx)
	addl	$64,%edx
	subl	$64,%ecx
	jnz	2b
	ret

/*
 * do 16 byte chunks
 */
	SUPERALIGN_TEXT
3:
	cmpl	$16,%ecx
	jb	4f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	addl	$16,%edx
	subl	$16,%ecx
	jnz	3b
	ret

/*
 * do 4 byte chunks
 */
	SUPERALIGN_TEXT
4:
	cmpl	$4,%ecx
	jb	5f
	movl	%eax,(%edx)
	addl	$4,%edx
	subl	$4,%ecx
	jnz	4b
	ret

/*
 * do 1 byte chunks
 * a jump table seems to be faster than a loop or more range reductions
 *
 * XXX need a const section for non-text
 */
	.data
jtab:
	.long	do0
	.long	do1
	.long	do2
	.long	do3

	.text
	SUPERALIGN_TEXT
5:
	jmp	*jtab(,%ecx,4)

	SUPERALIGN_TEXT
do3:
	movw	%ax,(%edx)
	movb	%al,2(%edx)
	ret

	SUPERALIGN_TEXT
do2:
	movw	%ax,(%edx)
	ret

	SUPERALIGN_TEXT
do1:
	movb	%al,(%edx)
	ret

	SUPERALIGN_TEXT
do0:
	ret
#endif

#if defined(I586_CPU) && defined(DEV_NPX)
ENTRY(i586_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx

	/*
	 * The FPU register method is twice as fast as the integer register
	 * method unless the target is in the L1 cache and we pre-allocate a
	 * cache line for it (then the integer register method is 4-5 times
	 * faster).  However, we never pre-allocate cache lines, since that
	 * would make the integer method 25% or more slower for the common
	 * case when the target isn't in either the L1 cache or the L2 cache.
	 * Thus we normally use the FPU register method unless the overhead
	 * would be too large.
	 */
	cmpl	$256,%ecx	/* empirical; clts, fninit, smsw cost a lot */
	jb	intreg_i586_bzero

	/*
	 * The FPU registers may belong to an application or to fastmove()
	 * or to another invocation of bcopy() or ourself in a higher level
	 * interrupt or trap handler.  Preserving the registers is
	 * complicated since we avoid it if possible at all levels.  We
	 * want to localize the complications even when that increases them.
	 * Here the extra work involves preserving CR0_TS in TS.
	 * `npxproc != NULL' is supposed to be the condition that all the
	 * FPU resources belong to an application, but npxproc and CR0_TS
	 * aren't set atomically enough for this condition to work in
	 * interrupt handlers.
	 *
	 * Case 1: FPU registers belong to the application: we must preserve
	 * the registers if we use them, so we only use the FPU register
	 * method if the target size is large enough to amortize the extra
	 * overhead for preserving them.  CR0_TS must be preserved although
	 * it is very likely to end up as set.
	 *
	 * Case 2: FPU registers belong to fastmove(): fastmove() currently
	 * makes the registers look like they belong to an application so
	 * that cpu_switch() and savectx() don't have to know about it, so
	 * this case reduces to case 1.
	 *
	 * Case 3: FPU registers belong to the kernel: don't use the FPU
	 * register method.  This case is unlikely, and supporting it would
	 * be more complicated and might take too much stack.
	 *
	 * Case 4: FPU registers don't belong to anyone: the FPU registers
	 * don't need to be preserved, so we always use the FPU register
	 * method.  CR0_TS must be preserved although it is very likely to
	 * always end up as clear.
	 */
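	/*
	 * Net dispatch, in outline:
	 *	len < 256		-> intreg_i586_bzero
	 *	npxproc == NULL		-> FPU method, fninit only
	 *	len < 256 + 184		-> intreg_i586_bzero
	 *	otherwise		-> FPU method around fnsave/frstor
	 * and either FPU path falls back to the integer method if
	 * kernel_fpu_lock is already held.
	 */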
	cmpl	$0,PCPU(NPXPROC)
	je	i586_bz1
	cmpl	$256+184,%ecx		/* empirical; not quite 2*108 more */
	jb	intreg_i586_bzero
	sarb	$1,kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	subl	$108,%esp
	fnsave	0(%esp)
	jmp	i586_bz2

i586_bz1:
	sarb	$1,kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	fninit				/* XXX should avoid needing this */
i586_bz2:
	fldz

	/*
	 * Align to an 8 byte boundary (misalignment in the main loop would
	 * cost a factor of >= 2).  Avoid jumps (at little cost if it is
	 * already aligned) by always zeroing 8 bytes and using the part up
	 * to the _next_ alignment position.
	 */
	fstl	0(%edx)
	addl	%edx,%ecx		/* part of %ecx -= new_%edx - %edx */
	addl	$8,%edx
	andl	$~7,%edx
	subl	%edx,%ecx
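
	/*
	 * Worked example of the adjustment above (illustrative): with
	 * %edx = 0x1005 and %ecx = 0x20, fstl zeroes 0x1005-0x100c,
	 * %edx rounds up to 0x1008, and %ecx becomes
	 * 0x20 + 0x1005 - 0x1008 = 0x1d, the count remaining from the
	 * new, aligned %edx.
	 */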

	/*
	 * Similarly align `len' to a multiple of 8.
	 */
	fstl	-8(%edx,%ecx)
	decl	%ecx
	andl	$~7,%ecx

	/*
	 * This wouldn't be any faster if it were unrolled, since the loop
	 * control instructions are much faster than the fstl and/or done
	 * in parallel with it so their overhead is insignificant.
	 */
fpureg_i586_bzero_loop:
	fstl	0(%edx)
	addl	$8,%edx
	subl	$8,%ecx
	cmpl	$8,%ecx
	jae	fpureg_i586_bzero_loop

	cmpl	$0,PCPU(NPXPROC)
	je	i586_bz3
	frstor	0(%esp)
	addl	$108,%esp
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock
	ret

i586_bz3:
	fstp	%st(0)
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock
	ret

intreg_i586_bzero:
	/*
	 * `rep stos' seems to be the best method in practice for small
	 * counts.  Fancy methods usually take too long to start up due
	 * to cache and BTB misses.
	 */
	pushl	%edi
	movl	%edx,%edi
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx
	jne	1f
	popl	%edi
	ret

1:
	rep
	stosb
	popl	%edi
	ret
#endif /* I586_CPU && defined(DEV_NPX) */

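/*
 * i686_pagezero(void *page): zero a 4K page, first scanning with
 * repe scasl and storing only from the first nonzero dword onward,
 * so a page that is already zero is only read, never dirtied.
 */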
ENTRY(i686_pagezero)
	pushl	%edi
	pushl	%ebx

	movl	12(%esp), %edi
	movl	$1024, %ecx
	cld

	ALIGN_TEXT
1:
	xorl	%eax, %eax
	repe
	scasl
	jnz	2f

	popl	%ebx
	popl	%edi
	ret

	ALIGN_TEXT

2:
	incl	%ecx
	subl	$4, %edi

	movl	%ecx, %edx
	cmpl	$16, %ecx

	jge	3f

	movl	%edi, %ebx
	andl	$0x3f, %ebx
	shrl	%ebx
	shrl	%ebx
	movl	$16, %ecx
	subl	%ebx, %ecx

3:
	subl	%ecx, %edx
	rep
	stosl

	movl	%edx, %ecx
	testl	%edx, %edx
	jnz	1b

	popl	%ebx
	popl	%edi
	ret

/* fillw(pat, base, cnt) */
ENTRY(fillw)
	pushl	%edi
	movl	8(%esp),%eax
	movl	12(%esp),%edi
	movl	16(%esp),%ecx
	cld
	rep
	stosw
	popl	%edi
	ret

ENTRY(bcopyb)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx
	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f
	cld					/* nope, copy forwards */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards. */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	std
	rep
	movsb
	popl	%edi
	popl	%esi
	cld
	ret

ENTRY(bcopy)
	MEXITCOUNT
	jmp	*_bcopy_vector

ENTRY(ovbcopy)
	MEXITCOUNT
	jmp	*_ovbcopy_vector

/*
 * generic_bcopy(src, dst, cnt)
 *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
 */
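/*
 * Note on the overlap test used here and in bcopyb above: the unsigned
 * comparison (dst - src) < cnt is true exactly when src < dst < src + cnt
 * (and, harmlessly, when dst == src), i.e. when the destination starts
 * inside the source, which is the only case requiring a backwards copy.
 * When dst < src the subtraction wraps to a large unsigned value and the
 * forward copy is used.  For example, src = 0x1000, dst = 0x1008,
 * cnt = 0x10 gives dst - src = 8 < 0x10, so the copy runs backwards.
 */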
ENTRY(generic_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* nope, copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx				/* any fractional bytes? */
	std
	rep
	movsb
	movl	20(%esp),%ecx			/* copy remainder by 32-bit words */
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret

#if defined(I586_CPU) && defined(DEV_NPX)
ENTRY(i586_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	cmpl	$1024,%ecx
	jb	small_i586_bcopy

	sarb	$1,kernel_fpu_lock
	jc	small_i586_bcopy
	cmpl	$0,PCPU(NPXPROC)
	je	i586_bc1
	smsw	%dx
	clts
	subl	$108,%esp
	fnsave	0(%esp)
	jmp	4f

i586_bc1:
	smsw	%dx
	clts
	fninit				/* XXX should avoid needing this */

	ALIGN_TEXT
4:
	pushl	%ecx
#define	DCACHE_SIZE	8192
	cmpl	$(DCACHE_SIZE-512)/2,%ecx
	jbe	2f
	movl	$(DCACHE_SIZE-512)/2,%ecx
2:
	subl	%ecx,0(%esp)
	cmpl	$256,%ecx
	jb	5f			/* XXX should prefetch if %ecx >= 32 */
	pushl	%esi
	pushl	%ecx
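	/*
	 * The read loop below is a cache primer, not a copy: it touches
	 * one dword in each 32-byte line of the chunk so that the source
	 * is resident in the cache before the FPU loop streams through it.
	 */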
	ALIGN_TEXT
3:
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	popl	%ecx
	popl	%esi
5:
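	/*
	 * The loop below uses the eight FPU registers as a 64-byte
	 * buffer: fildq/fistpq move 64-bit integers through the FPU,
	 * and every 64-bit integer is exactly representable in the
	 * 80-bit format, so the bit patterns survive unchanged.
	 */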
	ALIGN_TEXT
large_i586_bcopy_loop:
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$64,%esi
	addl	$64,%edi
	subl	$64,%ecx
	cmpl	$64,%ecx
	jae	large_i586_bcopy_loop
	popl	%eax
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b

	cmpl	$0,PCPU(NPXPROC)
	je	i586_bc2
	frstor	0(%esp)
	addl	$108,%esp
i586_bc2:
	lmsw	%dx
	movb	$0xfe,kernel_fpu_lock

/*
 * This is a duplicate of the main part of generic_bcopy.  See the comments
 * there.  Jumping into generic_bcopy would cost a whole 0-1 cycles and
 * would mess up high resolution profiling.
 */
	ALIGN_TEXT
small_i586_bcopy:
	shrl	$2,%ecx
	cld
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx
	std
	rep
	movsb
	movl	20(%esp),%ecx
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret
#endif /* I586_CPU && defined(DEV_NPX) */

/*
 * Note: memcpy does not support overlapping copies
 */
ENTRY(memcpy)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi
	movl	16(%esp),%esi
	movl	20(%esp),%ecx
	movl	%edi,%eax			/* return dst */
	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%esi
	popl	%edi
	ret


/*****************************************************************************/
/* copyout and fubyte family                                                 */
/*****************************************************************************/
/*
 * Access user memory from inside the kernel. These routines and possibly
 * the math- and DOS emulators should be the only places that do this.
 *
 * We have to access the memory with user's permissions, so use a segment
 * selector with RPL 3. For writes to user space we have to additionally
 * check the PTE for write permission, because the 386 does not check
 * write permissions when we are executing with EPL 0. The 486 does check
 * this if the WP bit is set in CR0, so we can use a simpler version here.
 *
 * These routines set curpcb->onfault for the time they execute. When a
 * protection violation occurs inside the functions, the trap handler
 * returns to *curpcb->onfault instead of the function.
 */
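
/*
 * Illustrative sketch of the pcb_onfault protocol these routines
 * implement (pseudocode, fault handling shown schematically):
 *
 *	curpcb->pcb_onfault = fault_handler;
 *	... touch user addresses; a protection fault makes trap()
 *	    resume at fault_handler instead of retrying ...
 *	curpcb->pcb_onfault = NULL;
 *	return (0);
 *
 * where fault_handler unwinds the stack, clears pcb_onfault and
 * returns EFAULT.
 */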

/*
 * copyout(from_kernel, to_user, len)  - MP SAFE (if not I386_CPU)
 */
ENTRY(copyout)
	MEXITCOUNT
	jmp	*_copyout_vector

ENTRY(generic_copyout)
	movl	PCPU(CURPCB),%eax
	movl	$copyout_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ebx
	testl	%ebx,%ebx			/* anything to do? */
	jz	done_copyout

	/*
	 * Check explicitly for non-user addresses.  If 486 write protection
	 * is being used, this check is essential because we are in kernel
	 * mode so the h/w does not provide any protection against writing
	 * kernel addresses.
	 */

	/*
	 * First, prevent address wrapping.
	 */
	movl	%edi,%eax
	addl	%ebx,%eax
	jc	copyout_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
	cmpl	$VM_MAXUSER_ADDRESS,%eax
	ja	copyout_fault

#ifdef I386_CPU

/*
 * We have to check each PTE for user write permission.
 * The checking may cause a page fault, so it is important to set
 * up everything for return via copyout_fault before here.
 */
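/*
 * Note on the index arithmetic below (with 4K pages and 4-byte PTEs):
 * the PTE for va lives at _PTmap + (va >> PAGE_SHIFT) * 4, which is
 * _PTmap + ((va >> IDXSHIFT) & ~3) with IDXSHIFT = 10.  Applying the
 * same shift to the PTE's own address finds, via the recursive map,
 * the PTE of the page-table page itself for the PG_V check.
 */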
	/* compute number of pages */
	movl	%edi,%ecx
	andl	$PAGE_MASK,%ecx
	addl	%ebx,%ecx
	decl	%ecx
	shrl	$IDXSHIFT+2,%ecx
	incl	%ecx

	/* compute PTE offset for start address */
	movl	%edi,%edx
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl

1:
	/* check PTE for each page */
	leal	_PTmap(%edx),%eax
	shrl	$IDXSHIFT,%eax
	andb	$0xfc,%al
	testb	$PG_V,_PTmap(%eax)		/* PTE page must be valid */
	je	4f
	movb	_PTmap(%edx),%al
	andb	$PG_V|PG_RW|PG_U,%al		/* page must be valid and user writable */
	cmpb	$PG_V|PG_RW|PG_U,%al
	je	2f

4:
	/* simulate a trap */
	pushl	%edx
	pushl	%ecx
	shll	$IDXSHIFT,%edx
	pushl	%edx
	call	_trapwrite			/* trapwrite(addr) */
	popl	%edx
	popl	%ecx
	popl	%edx

	testl	%eax,%eax			/* if not ok, return EFAULT */
	jnz	copyout_fault

2:
	addl	$4,%edx
	decl	%ecx
	jnz	1b				/* check next page */
#endif /* I386_CPU */

	/* bcopy(%esi, %edi, %ebx) */
	movl	%ebx,%ecx

#if defined(I586_CPU) && defined(DEV_NPX)
	ALIGN_TEXT
slow_copyout:
#endif
	shrl	$2,%ecx
	cld
	rep
	movsl
	movb	%bl,%cl
	andb	$3,%cl
	rep
	movsb

done_copyout:
	popl	%ebx
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%edx
	movl	%eax,PCB_ONFAULT(%edx)
	ret

	ALIGN_TEXT
copyout_fault:
	popl	%ebx
	popl	%edi
	popl	%esi
	movl	PCPU(CURPCB),%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret

#if defined(I586_CPU) && defined(DEV_NPX)
ENTRY(i586_copyout)
	/*
	 * Duplicated from generic_copyout.  Could be done a bit better.
	 */
	movl	PCPU(CURPCB),%eax
	movl	$copyout_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ebx
	testl	%ebx,%ebx			/* anything to do? */
	jz	done_copyout

	/*
	 * Check explicitly for non-user addresses.  If 486 write protection
	 * is being used, this check is essential because we are in kernel
	 * mode so the h/w does not provide any protection against writing
	 * kernel addresses.
	 */

	/*
	 * First, prevent address wrapping.
	 */
	movl	%edi,%eax
	addl	%ebx,%eax
	jc	copyout_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
	cmpl	$VM_MAXUSER_ADDRESS,%eax
	ja	copyout_fault

	/* bcopy(%esi, %edi, %ebx) */
3:
	movl	%ebx,%ecx
	/*
	 * End of duplicated code.
	 */

	cmpl	$1024,%ecx
	jb	slow_copyout

	pushl	%ecx
	call	_fastmove
	addl	$4,%esp
	jmp	done_copyout
#endif /* I586_CPU && defined(DEV_NPX) */

/*
 * copyin(from_user, to_kernel, len) - MP SAFE
 */
ENTRY(copyin)
	MEXITCOUNT
	jmp	*_copyin_vector

ENTRY(generic_copyin)
	movl	PCPU(CURPCB),%eax
	movl	$copyin_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi			/* caddr_t from */
	movl	16(%esp),%edi			/* caddr_t to */
	movl	20(%esp),%ecx			/* size_t  len */

	/*
	 * make sure address is valid
	 */
	movl	%esi,%edx
	addl	%ecx,%edx
	jc	copyin_fault
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	copyin_fault

#if defined(I586_CPU) && defined(DEV_NPX)
	ALIGN_TEXT
slow_copyin:
#endif
	movb	%cl,%al
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

#if defined(I586_CPU) && defined(DEV_NPX)
	ALIGN_TEXT
done_copyin:
#endif
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%edx
	movl	%eax,PCB_ONFAULT(%edx)
	ret

	ALIGN_TEXT
copyin_fault:
	popl	%edi
	popl	%esi
	movl	PCPU(CURPCB),%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret

#if defined(I586_CPU) && defined(DEV_NPX)
ENTRY(i586_copyin)
	/*
	 * Duplicated from generic_copyin.  Could be done a bit better.
	 */
	movl	PCPU(CURPCB),%eax
	movl	$copyin_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi			/* caddr_t from */
	movl	16(%esp),%edi			/* caddr_t to */
	movl	20(%esp),%ecx			/* size_t  len */

	/*
	 * make sure address is valid
	 */
	movl	%esi,%edx
	addl	%ecx,%edx
	jc	copyin_fault
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	copyin_fault
	/*
	 * End of duplicated code.
	 */

	cmpl	$1024,%ecx
	jb	slow_copyin

	pushl	%ebx			/* XXX prepare for fastmove_fault */
	pushl	%ecx
	call	_fastmove
	addl	$8,%esp
	jmp	done_copyin
#endif /* I586_CPU && defined(DEV_NPX) */

#if defined(I586_CPU) && defined(DEV_NPX)
/* fastmove(src, dst, len)
	src in %esi
	dst in %edi
	len in %ecx		XXX changed to on stack for profiling
	uses %eax and %edx for tmp. storage
 */
/* XXX use ENTRY() to get profiling.  fastmove() is actually a non-entry. */
ENTRY(fastmove)
	pushl	%ebp
	movl	%esp,%ebp
	subl	$PCB_SAVEFPU_SIZE+3*4,%esp

	movl	8(%ebp),%ecx
	cmpl	$63,%ecx
	jbe	fastmove_tail

	testl	$7,%esi	/* check if src addr is multiple of 8 */
	jnz	fastmove_tail

	testl	$7,%edi	/* check if dst addr is multiple of 8 */
	jnz	fastmove_tail

/* if (npxproc != NULL) { */
	cmpl	$0,PCPU(NPXPROC)
	je	6f
/*    fnsave(&curpcb->pcb_savefpu); */
	movl	PCPU(CURPCB),%eax
	fnsave	PCB_SAVEFPU(%eax)
/*   npxproc = NULL; */
	movl	$0,PCPU(NPXPROC)
/* } */
6:
/* now we own the FPU. */

/*
 * The process' FP state is saved in the pcb, but if we get
 * switched, the cpu_switch() will store our FP state in the
 * pcb.  It should be possible to avoid all the copying for
 * this, e.g., by setting a flag to tell cpu_switch() to
 * save the state somewhere else.
 */
/* tmp = curpcb->pcb_savefpu; */
	movl	%ecx,-12(%ebp)
	movl	%esi,-8(%ebp)
	movl	%edi,-4(%ebp)
	movl	%esp,%edi
	movl	PCPU(CURPCB),%esi
	addl	$PCB_SAVEFPU,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	movl	-12(%ebp),%ecx
	movl	-8(%ebp),%esi
	movl	-4(%ebp),%edi
/* stop_emulating(); */
	clts
/* npxproc = curproc; */
	movl	PCPU(CURPROC),%eax
	movl	%eax,PCPU(NPXPROC)
	movl	PCPU(CURPCB),%eax
	movl	$fastmove_fault,PCB_ONFAULT(%eax)
4:
	movl	%ecx,-12(%ebp)
	cmpl	$1792,%ecx
	jbe	2f
	movl	$1792,%ecx
2:
	subl	%ecx,-12(%ebp)
	cmpl	$256,%ecx
	jb	5f
	movl	%ecx,-8(%ebp)
	movl	%esi,-4(%ebp)
	ALIGN_TEXT
3:
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	movl	-8(%ebp),%ecx
	movl	-4(%ebp),%esi
5:
	ALIGN_TEXT
fastmove_loop:
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$-64,%ecx
	addl	$64,%esi
	addl	$64,%edi
	cmpl	$63,%ecx
	ja	fastmove_loop
	movl	-12(%ebp),%eax
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b

/* curpcb->pcb_savefpu = tmp; */
	movl	%ecx,-12(%ebp)
	movl	%esi,-8(%ebp)
	movl	%edi,-4(%ebp)
	movl	PCPU(CURPCB),%edi
	addl	$PCB_SAVEFPU,%edi
	movl	%esp,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	movl	-12(%ebp),%ecx
	movl	-8(%ebp),%esi
	movl	-4(%ebp),%edi

/* start_emulating(); */
	smsw	%ax
	orb	$CR0_TS,%al
	lmsw	%ax
/* npxproc = NULL; */
	movl	$0,PCPU(NPXPROC)

	ALIGN_TEXT
fastmove_tail:
	movl	PCPU(CURPCB),%eax
	movl	$fastmove_tail_fault,PCB_ONFAULT(%eax)

	movb	%cl,%al
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

	movl	%ebp,%esp
	popl	%ebp
	ret

	ALIGN_TEXT
fastmove_fault:
	movl	PCPU(CURPCB),%edi
	addl	$PCB_SAVEFPU,%edi
	movl	%esp,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl

	smsw	%ax
	orb	$CR0_TS,%al
	lmsw	%ax
	movl	$0,PCPU(NPXPROC)

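/*
 * Note on the unwind below: the pops assume the frame set up by
 * i586_copyin/i586_copyout, which saved %esi, %edi and %ebx (copyin
 * pushes a dummy %ebx for this purpose) before pushing fastmove's
 * length argument.  "addl $8" discards fastmove's return address and
 * that argument, so after the pops we return EFAULT directly to
 * copyin/copyout's caller.
 */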
fastmove_tail_fault:
	movl	%ebp,%esp
	popl	%ebp
	addl	$8,%esp
	popl	%ebx
	popl	%edi
	popl	%esi
	movl	PCPU(CURPCB),%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret
#endif /* I586_CPU && defined(DEV_NPX) */

/*
 * fu{byte,sword,word} - MP SAFE
 *
 *	Fetch a byte (sword, word) from user memory
 */
ENTRY(fuword)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx			/* from */

	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address is valid */
	ja	fusufault

	movl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret
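
/*
 * Return convention note: the fu*() routines return the fetched value,
 * or -1 after a fault (which makes -1 ambiguous with a stored -1); the
 * su*() routines below return 0 on success and -1 on fault, both via
 * the shared fusufault handler.
 */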

/*
 * These two routines are called from the profiling code, potentially
 * at interrupt time. If they fail, that's okay, good things will
 * happen later. Fail all the time for now - until the trap code is
 * able to deal with this.
 */
ALTENTRY(suswintr)
ENTRY(fuswintr)
	movl	$-1,%eax
	ret

/*
 * fusword - MP SAFE
 */
ENTRY(fusword)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-2,%edx
	ja	fusufault

	movzwl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

/*
 * fubyte - MP SAFE
 */
ENTRY(fubyte)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-1,%edx
	ja	fusufault

	movzbl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

	ALIGN_TEXT
fusufault:
	movl	PCPU(CURPCB),%ecx
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)
	decl	%eax
	ret

/*
 * su{byte,sword,word} - MP SAFE (if not I386_CPU)
 *
 *	Write a byte (word, longword) to user memory
 */
ENTRY(suword)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#ifdef I386_CPU

	/* XXX - page boundary crossing is still not handled */
	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl

	leal	_PTmap(%edx),%ecx
	shrl	$IDXSHIFT,%ecx
	andb	$0xfc,%cl
	testb	$PG_V,_PTmap(%ecx)		/* PTE page must be valid */
	je	4f
	movb	_PTmap(%edx),%dl
	andb	$PG_V|PG_RW|PG_U,%dl		/* page must be valid and user writable */
	cmpb	$PG_V|PG_RW|PG_U,%dl
	je	1f

4:
	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address validity */
	ja	fusufault

	movl	8(%esp),%eax
	movl	%eax,(%edx)
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%ecx
	movl	%eax,PCB_ONFAULT(%ecx)
	ret

/*
 * susword - MP SAFE (if not I386_CPU)
 */
ENTRY(susword)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#ifdef I386_CPU

	/* XXX - page boundary crossing is still not handled */
	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl

	leal	_PTmap(%edx),%ecx
	shrl	$IDXSHIFT,%ecx
	andb	$0xfc,%cl
	testb	$PG_V,_PTmap(%ecx)		/* PTE page must be valid */
	je	4f
	movb	_PTmap(%edx),%dl
	andb	$PG_V|PG_RW|PG_U,%dl		/* page must be valid and user writable */
	cmpb	$PG_V|PG_RW|PG_U,%dl
	je	1f

4:
	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

	cmpl	$VM_MAXUSER_ADDRESS-2,%edx	/* verify address validity */
	ja	fusufault

	movw	8(%esp),%ax
	movw	%ax,(%edx)
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%ecx		/* restore trashed register */
	movl	%eax,PCB_ONFAULT(%ecx)
	ret

/*
 * su[i]byte - MP SAFE (if not I386_CPU)
 */
ALTENTRY(suibyte)
ENTRY(subyte)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#ifdef I386_CPU

	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl

	leal	_PTmap(%edx),%ecx
	shrl	$IDXSHIFT,%ecx
	andb	$0xfc,%cl
	testb	$PG_V,_PTmap(%ecx)		/* PTE page must be valid */
	je	4f
	movb	_PTmap(%edx),%dl
	andb	$PG_V|PG_RW|PG_U,%dl		/* page must be valid and user writable */
	cmpb	$PG_V|PG_RW|PG_U,%dl
	je	1f

4:
	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

	cmpl	$VM_MAXUSER_ADDRESS-1,%edx	/* verify address validity */
	ja	fusufault

	movb	8(%esp),%al
	movb	%al,(%edx)
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%ecx		/* restore trashed register */
	movl	%eax,PCB_ONFAULT(%ecx)
	ret

/*
 * copyinstr(from, to, maxlen, int *lencopied) - MP SAFE
 *
 *	copy a string from 'from' to 'to', stopping when a 0 character
 *	is reached.  return ENAMETOOLONG if the string is longer than
 *	maxlen, and EFAULT on protection violations.  If lencopied is
 *	non-zero, return the actual length in *lencopied.
 */
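/*
 * Illustrative C equivalent of the loop below (fault handling and the
 * clamp of maxlen to user space omitted; note that the count includes
 * the terminating 0 byte and is stored on all return paths):
 *
 *	error = ENAMETOOLONG;
 *	for (n = 0; n < maxlen; ) {
 *		n++;
 *		if ((*to++ = *from++) == 0) {
 *			error = 0;
 *			break;
 *		}
 *	}
 *	if (lencopied)
 *		*lencopied = n;
 *	return (error);
 */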
ENTRY(copyinstr)
	pushl	%esi
	pushl	%edi
	movl	PCPU(CURPCB),%ecx
	movl	$cpystrflt,PCB_ONFAULT(%ecx)

	movl	12(%esp),%esi			/* %esi = from */
	movl	16(%esp),%edi			/* %edi = to */
	movl	20(%esp),%edx			/* %edx = maxlen */

	movl	$VM_MAXUSER_ADDRESS,%eax

	/* make sure 'from' is within bounds */
	subl	%esi,%eax
	jbe	cpystrflt

	/* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */
	cmpl	%edx,%eax
	jae	1f
	movl	%eax,%edx
	movl	%eax,20(%esp)
1:
	incl	%edx
	cld

2:
	decl	%edx
	jz	3f

	lodsb
	stosb
	orb	%al,%al
	jnz	2b

	/* Success -- 0 byte reached */
	decl	%edx
	xorl	%eax,%eax
	jmp	cpystrflt_x
3:
	/*
	 * %edx is zero: either the caller's maxlen ran out
	 * (ENAMETOOLONG), or maxlen was clamped above and we ran off
	 * the end of user space (EFAULT).
	 */
	cmpl	$VM_MAXUSER_ADDRESS,%esi
	jae	cpystrflt
4:
	movl	$ENAMETOOLONG,%eax
	jmp	cpystrflt_x

cpystrflt:
	movl	$EFAULT,%eax

cpystrflt_x:
	/* set *lencopied and return %eax */
	movl	PCPU(CURPCB),%ecx
	movl	$0,PCB_ONFAULT(%ecx)
	movl	20(%esp),%ecx
	subl	%edx,%ecx
	movl	24(%esp),%edx
	testl	%edx,%edx
	jz	1f
	movl	%ecx,(%edx)
1:
	popl	%edi
	popl	%esi
	ret


/*
 * copystr(from, to, maxlen, int *lencopied) - MP SAFE
 */
ENTRY(copystr)
	pushl	%esi
	pushl	%edi

	movl	12(%esp),%esi			/* %esi = from */
	movl	16(%esp),%edi			/* %edi = to */
	movl	20(%esp),%edx			/* %edx = maxlen */
	incl	%edx
	cld
1:
	decl	%edx
	jz	4f
	lodsb
	stosb
	orb	%al,%al
	jnz	1b

	/* Success -- 0 byte reached */
	decl	%edx
	xorl	%eax,%eax
	jmp	6f
4:
	/* edx is zero -- return ENAMETOOLONG */
	movl	$ENAMETOOLONG,%eax

6:
	/* set *lencopied and return %eax */
	movl	20(%esp),%ecx
	subl	%edx,%ecx
	movl	24(%esp),%edx
	testl	%edx,%edx
	jz	7f
	movl	%ecx,(%edx)
7:
	popl	%edi
	popl	%esi
	ret

/*
 * bcmp(b1, b2, length): return zero iff the two byte strings are identical
 */
ENTRY(bcmp)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi
	movl	16(%esp),%esi
	movl	20(%esp),%edx
	xorl	%eax,%eax

	movl	%edx,%ecx
	shrl	$2,%ecx
	cld					/* compare forwards */
	repe
	cmpsl
	jne	1f

	movl	%edx,%ecx
	andl	$3,%ecx
	repe
	cmpsb
	je	2f
1:
	incl	%eax
2:
	popl	%esi
	popl	%edi
	ret


/*
 * Handling of special 386 registers and descriptor tables etc
 */
/* void lgdt(struct region_descriptor *rdp); */
ENTRY(lgdt)
	/* reload the descriptor table */
	movl	4(%esp),%eax
	lgdt	(%eax)

	/* flush the prefetch q */
	jmp	1f
	nop
1:
	/* reload "stale" selectors */
	movl	$KDSEL,%eax
	mov	%ax,%ds
	mov	%ax,%es
	mov	%ax,%gs
	mov	%ax,%ss
	movl	$KPSEL,%eax
	mov	%ax,%fs

	/* reload code selector by turning return into intersegmental return */
	movl	(%esp),%eax
	pushl	%eax
	movl	$KCSEL,4(%esp)
	lret

/*
 * void lidt(struct region_descriptor *rdp);
 */
ENTRY(lidt)
	movl	4(%esp),%eax
	lidt	(%eax)
	ret

/*
 * void lldt(u_short sel)
 */
ENTRY(lldt)
	lldt	4(%esp)
	ret

/*
 * void ltr(u_short sel)
 */
ENTRY(ltr)
	ltr	4(%esp)
	ret

/* ssdtosd(*ssdp,*sdp) */
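/*
 * In outline: repack the flat fields of the machine-independent
 * soft_segment_descriptor (base, limit, access/flag bits) into the
 * hardware segment descriptor, whose base and limit are split across
 * the two output words (limit 15:0 and base 15:0 in the first; base
 * 23:16, access bits, limit 19:16, flags and base 31:24 in the second).
 */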
ENTRY(ssdtosd)
	pushl	%ebx
	movl	8(%esp),%ecx
	movl	8(%ecx),%ebx
	shll	$16,%ebx
	movl	(%ecx),%edx
	roll	$16,%edx
	movb	%dh,%bl
	movb	%dl,%bh
	rorl	$8,%ebx
	movl	4(%ecx),%eax
	movw	%ax,%dx
	andl	$0xf0000,%eax
	orl	%eax,%ebx
	movl	12(%esp),%ecx
	movl	%edx,(%ecx)
	movl	%ebx,4(%ecx)
	popl	%ebx
	ret

/* load_cr0(cr0) */
ENTRY(load_cr0)
	movl	4(%esp),%eax
	movl	%eax,%cr0
	ret

/* rcr0() */
ENTRY(rcr0)
	movl	%cr0,%eax
	ret

/* rcr3() */
ENTRY(rcr3)
	movl	%cr3,%eax
	ret

/* void load_cr3(caddr_t cr3) */
ENTRY(load_cr3)
#ifdef SWTCH_OPTIM_STATS
	incl	_tlb_flush_count
#endif
	movl	4(%esp),%eax
	movl	%eax,%cr3
	ret

/* rcr4() */
ENTRY(rcr4)
	movl	%cr4,%eax
	ret

/* void load_cr4(caddr_t cr4) */
ENTRY(load_cr4)
	movl	4(%esp),%eax
	movl	%eax,%cr4
	ret

/* void load_dr6(u_int dr6) */
ENTRY(load_dr6)
	movl    4(%esp),%eax
	movl    %eax,%dr6
	ret

/* void reset_dbregs() */
ENTRY(reset_dbregs)
	movl    $0,%eax
	movl    %eax,%dr7     /* disable all breakpoints first */
	movl    %eax,%dr0
	movl    %eax,%dr1
	movl    %eax,%dr2
	movl    %eax,%dr3
	movl    %eax,%dr6
	ret

/*****************************************************************************/
/* setjmp, longjmp                                                           */
/*****************************************************************************/
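
/*
 * These save and restore only the callee-saved C registers
 * (%ebx/%esp/%ebp/%esi/%edi) and the return address; unlike the
 * userland versions there is no signal mask or FPU state involved.
 */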

ENTRY(setjmp)
	movl	4(%esp),%eax
	movl	%ebx,(%eax)			/* save ebx */
	movl	%esp,4(%eax)			/* save esp */
	movl	%ebp,8(%eax)			/* save ebp */
	movl	%esi,12(%eax)			/* save esi */
	movl	%edi,16(%eax)			/* save edi */
	movl	(%esp),%edx			/* get return address */
	movl	%edx,20(%eax)			/* save eip */
	xorl	%eax,%eax			/* return(0); */
	ret

ENTRY(longjmp)
	movl	4(%esp),%eax
	movl	(%eax),%ebx			/* restore ebx */
	movl	4(%eax),%esp			/* restore esp */
	movl	8(%eax),%ebp			/* restore ebp */
	movl	12(%eax),%esi			/* restore esi */
	movl	16(%eax),%edi			/* restore edi */
	movl	20(%eax),%edx			/* get return address */
	movl	%edx,(%esp)			/* put in return frame */
	xorl	%eax,%eax			/* return(1); */
	incl	%eax
	ret

/*
 * Support for BB-profiling (gcc -a).  The kernbb program will extract
 * the data from the kernel.
 */

	.data
	ALIGN_DATA
	.globl bbhead
bbhead:
	.long 0

	.text
NON_GPROF_ENTRY(__bb_init_func)
	movl	4(%esp),%eax
	movl	$1,(%eax)
	movl	bbhead,%edx
	movl	%edx,16(%eax)
	movl	%eax,bbhead
	NON_GPROF_RET