/* support.s revision 19653 */
1/*-
2 * Copyright (c) 1993 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	$Id: support.s,v 1.42 1996/11/08 02:38:44 asami Exp $
34 */
35
36#include "opt_cpu.h"
37#include "opt_temporary.h"			/* for I586_*_B* */
38
39#include <machine/asmacros.h>
40#include <machine/cputypes.h>
41#include <machine/specialreg.h>
42
43#include "assym.s"
44
45#define KDSEL		0x10			/* kernel data selector */
46#define IDXSHIFT	10
47
	.data
/*
 * Runtime-dispatch vectors for the bcopy family.  Each word holds the
 * address of the routine to use; they default to the generic versions.
 * Presumably they are re-pointed to the i586_* variants at CPU
 * identification time -- TODO confirm against the startup/CPU-probe code.
 * Note that _bzero itself is one of these data words (callers evidently
 * call indirectly through it).
 */
	.globl	_bcopy_vector
_bcopy_vector:
	.long	_generic_bcopy
	.globl	_bzero
_bzero:
	.long	_generic_bzero
	.globl	_copyin_vector
_copyin_vector:
	.long	_generic_copyin
	.globl	_copyout_vector
_copyout_vector:
	.long	_generic_copyout
	.globl	_ovbcopy_vector
_ovbcopy_vector:
	.long	_generic_bcopy
/*
 * Try-lock byte protecting kernel use of the FPU.  Acquire with
 * `sarb $1,kernel_fpu_lock': 0xfe -> 0xff with CF clear (lock taken),
 * while an already-held 0xff shifts to 0xff with CF set (busy).
 * Release by storing 0xfe back (see the movb $0xfe,... below).
 */
kernel_fpu_lock:
	.byte	0xfe
	.space	3
67
68	.text
69
/*
 * bcopy family
 * void bzero(void *buf, u_int len)
 */

/*
 * generic_bzero(buf, len): zero len bytes at buf using `rep stos'.
 * Longwords first, then the 0-3 leftover bytes.  Clobbers %eax, %ecx.
 */
ENTRY(generic_bzero)
	pushl	%edi
	movl	8(%esp),%edi		/* buf (arg offsets include saved %edi) */
	movl	12(%esp),%ecx		/* len */
	xorl	%eax,%eax		/* fill pattern: zero */
	shrl	$2,%ecx			/* longword count */
	cld
	rep
	stosl
	movl	12(%esp),%ecx		/* reload len */
	andl	$3,%ecx			/* remaining 0-3 bytes */
	rep
	stosb
	popl	%edi
	ret
90
#if defined(I486_CPU)
/*
 * void i486_bzero(void *buf, u_int len)
 *
 * 486-tuned bzero: straight-line stores in 64-, 16- and 4-byte chunks,
 * then a jump table for the final 0-3 bytes.  Avoids the `rep stos'
 * startup cost.  Clobbers %eax, %ecx, %edx; does not touch %edi or the
 * direction flag.
 */
ENTRY(i486_bzero)
	movl	4(%esp),%edx		/* buf */
	movl	8(%esp),%ecx		/* len */
	xorl	%eax,%eax		/* fill pattern: zero */
/*
 * do 64 byte chunks first
 *
 * XXX this is probably over-unrolled at least for DX2's
 */
2:
	cmpl	$64,%ecx
	jb	3f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	movl	%eax,16(%edx)
	movl	%eax,20(%edx)
	movl	%eax,24(%edx)
	movl	%eax,28(%edx)
	movl	%eax,32(%edx)
	movl	%eax,36(%edx)
	movl	%eax,40(%edx)
	movl	%eax,44(%edx)
	movl	%eax,48(%edx)
	movl	%eax,52(%edx)
	movl	%eax,56(%edx)
	movl	%eax,60(%edx)
	addl	$64,%edx
	subl	$64,%ecx
	jnz	2b			/* count hit exactly 0: all done */
	ret

/*
 * do 16 byte chunks
 */
	SUPERALIGN_TEXT
3:
	cmpl	$16,%ecx
	jb	4f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	addl	$16,%edx
	subl	$16,%ecx
	jnz	3b
	ret

/*
 * do 4 byte chunks
 */
	SUPERALIGN_TEXT
4:
	cmpl	$4,%ecx
	jb	5f
	movl	%eax,(%edx)
	addl	$4,%edx
	subl	$4,%ecx
	jnz	4b
	ret

/*
 * do 1 byte chunks
 * a jump table seems to be faster than a loop or more range reductions
 *
 * XXX need a const section for non-text
 */
	.data
jtab:
	.long	do0
	.long	do1
	.long	do2
	.long	do3

	.text
	SUPERALIGN_TEXT
5:
	jmp	jtab(,%ecx,4)		/* %ecx is 0-3 here */

	SUPERALIGN_TEXT
do3:
	movw	%ax,(%edx)		/* 3 bytes: word + byte */
	movb	%al,2(%edx)
	ret

	SUPERALIGN_TEXT
do2:
	movw	%ax,(%edx)
	ret

	SUPERALIGN_TEXT
do1:
	movb	%al,(%edx)
	ret

	SUPERALIGN_TEXT
do0:
	ret
#endif
192
#ifdef I586_CPU
/*
 * void i586_bzero(void *buf, u_int len)
 *
 * Pentium-tuned bzero: large buffers are zeroed 8 bytes per store with
 * the FPU (fldz/fstl); small ones fall back to `rep stos'.  Any FPU
 * state disturbed must be saved and restored -- see the case analysis
 * below.
 */
ENTRY(i586_bzero)
	movl	4(%esp),%edx		/* buf */
	movl	8(%esp),%ecx		/* len */

	/*
	 * The FPU register method is twice as fast as the integer register
	 * method unless the target is in the L1 cache and we pre-allocate a
	 * cache line for it (then the integer register method is 4-5 times
	 * faster).  However, we never pre-allocate cache lines, since that
	 * would make the integer method 25% or more slower for the common
	 * case when the target isn't in either the L1 cache or the L2 cache.
	 * Thus we normally use the FPU register method unless the overhead
	 * would be too large.
	 */
	cmpl	$256,%ecx	/* empirical; clts, fninit, smsw cost a lot */
	jb	intreg_i586_bzero

	/*
	 * The FPU registers may belong to an application or to fastmove()
	 * or to another invocation of bcopy() or ourself in a higher level
	 * interrupt or trap handler.  Preserving the registers is
	 * complicated since we avoid it if possible at all levels.  We
	 * want to localize the complications even when that increases them.
	 * Here the extra work involves preserving CR0_TS in TS.
	 * `npxproc != NULL' is supposed to be the condition that all the
	 * FPU resources belong to an application, but npxproc and CR0_TS
	 * aren't set atomically enough for this condition to work in
	 * interrupt handlers.
	 *
	 * Case 1: FPU registers belong to the application: we must preserve
	 * the registers if we use them, so we only use the FPU register
	 * method if the target size is large enough to amortize the extra
	 * overhead for preserving them.  CR0_TS must be preserved although
	 * it is very likely to end up as set.
	 *
	 * Case 2: FPU registers belong to fastmove(): fastmove() currently
	 * makes the registers look like they belong to an application so
	 * that cpu_switch() and savectx() don't have to know about it, so
	 * this case reduces to case 1.
	 *
	 * Case 3: FPU registers belong to the kernel: don't use the FPU
	 * register method.  This case is unlikely, and supporting it would
	 * be more complicated and might take too much stack.
	 *
	 * Case 4: FPU registers don't belong to anyone: the FPU registers
	 * don't need to be preserved, so we always use the FPU register
	 * method.  CR0_TS must be preserved although it is very likely to
	 * always end up as clear.
	 */
	cmpl	$0,_npxproc
	je	i586_bz1
	cmpl	$256+184,%ecx		/* empirical; not quite 2*108 more */
	jb	intreg_i586_bzero
	sarb	$1,kernel_fpu_lock	/* try-lock; CF set means already held */
	jc	intreg_i586_bzero
	smsw	%ax			/* save low CR0 so CR0_TS can be restored */
	clts
	subl	$108,%esp		/* room for the fnsave image */
	fnsave	0(%esp)
	jmp	i586_bz2

i586_bz1:
	sarb	$1,kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	fninit				/* XXX should avoid needing this */
i586_bz2:
	fldz				/* st(0) = 0.0: the 8-byte store pattern */

	/*
	 * Align to an 8 byte boundary (misalignment in the main loop would
	 * cost a factor of >= 2).  Avoid jumps (at little cost if it is
	 * already aligned) by always zeroing 8 bytes and using the part up
	 * to the _next_ alignment position.
	 */
	fstl	0(%edx)
	addl	%edx,%ecx		/* part of %ecx -= new_%edx - %edx */
	addl	$8,%edx
	andl	$~7,%edx
	subl	%edx,%ecx

	/*
	 * Similarly align `len' to a multiple of 8.
	 */
	fstl	-8(%edx,%ecx)
	decl	%ecx
	andl	$~7,%ecx

	/*
	 * This wouldn't be any faster if it were unrolled, since the loop
	 * control instructions are much faster than the fstl and/or done
	 * in parallel with it so their overhead is insignificant.
	 */
fpureg_i586_bzero_loop:
	fstl	0(%edx)
	addl	$8,%edx
	subl	$8,%ecx
	cmpl	$8,%ecx
	jae	fpureg_i586_bzero_loop

	cmpl	$0,_npxproc		/* did we fnsave application state above? */
	je	i586_bz3
	frstor	0(%esp)
	addl	$108,%esp
	lmsw	%ax			/* restore CR0_TS */
	movb	$0xfe,kernel_fpu_lock	/* release the FPU lock */
	ret

i586_bz3:
	fstpl	%st(0)			/* pop our zero; the regs belonged to no one */
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock
	ret

intreg_i586_bzero:
	/*
	 * `rep stos' seems to be the best method in practice for small
	 * counts.  Fancy methods usually take too long to start up due
	 * to cache and BTB misses.
	 */
	pushl	%edi
	movl	%edx,%edi
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx		/* reload len (stack moved by push) */
	andl	$3,%ecx
	jne	1f
	popl	%edi
	ret

1:
	rep
	stosb
	popl	%edi
	ret
#endif /* I586_CPU */
334
/* fillw(pat, base, cnt) -- store the 16-bit pattern `pat' cnt times at base */
ENTRY(fillw)
	pushl	%edi
	movl	8(%esp),%eax		/* pat (only %ax is stored) */
	movl	12(%esp),%edi		/* base */
	movl	16(%esp),%ecx		/* cnt, in words */
	cld
	rep
	stosw
	popl	%edi
	ret
346
/*
 * bcopyb(src, dst, cnt) -- byte-at-a-time bcopy.  Copies backwards when
 * the regions overlap with src < dst, so overlapping moves are safe.
 */
ENTRY(bcopyb)
bcopyb:
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx
	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f
	cld					/* nope, copy forwards */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards. */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	std
	rep
	movsb
	popl	%edi
	popl	%esi
	cld					/* leave direction flag clear */
	ret
378
/* bcopy(src, dst, cnt): tail-jump through the runtime-selected vector */
ENTRY(bcopy)
	MEXITCOUNT
	jmp	*_bcopy_vector
382
/* ovbcopy(src, dst, cnt): overlap-safe bcopy via its own vector */
ENTRY(ovbcopy)
	MEXITCOUNT
	jmp	*_ovbcopy_vector
386
/*
 * generic_bcopy(src, dst, cnt)
 *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
 *
 * Longword copy with a backwards path for overlapping regions where
 * src < dst.  The backwards path does the 0-3 odd bytes first (from the
 * top), then the longwords.
 */
ENTRY(generic_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* nope, copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx				/* any fractional bytes? */
	std
	rep
	movsb
	movl	20(%esp),%ecx			/* copy remainder by 32-bit words */
	shrl	$2,%ecx
	subl	$3,%esi				/* movsl needs ptrs at the low byte */
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret
435
#ifdef I586_CPU
/*
 * i586_bcopy(src, dst, cnt)
 *
 * Pentium-tuned bcopy: forward copies of >= 1024 bytes go 64 bytes per
 * iteration through the FPU (fildq/fistpq), in batches sized to about
 * half the data cache; each batch's source is first touched one read
 * per 32 bytes to pull it into the cache.  Smaller or overlapping
 * copies use the generic `rep movs' paths duplicated at the bottom.
 * FPU state handling mirrors i586_bzero (see the comments there);
 * the saved machine status word lives in %dx here.
 */
ENTRY(i586_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	cmpl	$1024,%ecx
	jb	small_i586_bcopy

	sarb	$1,kernel_fpu_lock		/* try-lock the FPU */
	jc	small_i586_bcopy
	cmpl	$0,_npxproc
	je	i586_bc1
	smsw	%dx				/* save low CR0 (CR0_TS) in %dx */
	clts
	subl	$108,%esp			/* room for the fnsave image */
	fnsave	0(%esp)
	jmp	4f

i586_bc1:
	smsw	%dx
	clts
	fninit				/* XXX should avoid needing this */

	ALIGN_TEXT
4:
	/* outer loop: one cache-sized batch per iteration; total on stack */
	pushl	%ecx
#define	DCACHE_SIZE	8192
	cmpl	$(DCACHE_SIZE-512)/2,%ecx
	jbe	2f
	movl	$(DCACHE_SIZE-512)/2,%ecx
2:
	subl	%ecx,0(%esp)		/* 0(%esp) = bytes left after this batch */
	cmpl	$256,%ecx
	jb	5f			/* XXX should prefetch if %ecx >= 32 */
	pushl	%esi
	pushl	%ecx
	ALIGN_TEXT
3:
	/* touch one longword per 32 bytes to pre-load the source */
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	popl	%ecx
	popl	%esi
5:
	ALIGN_TEXT
large_i586_bcopy_loop:
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$64,%esi
	addl	$64,%edi
	subl	$64,%ecx
	cmpl	$64,%ecx
	jae	large_i586_bcopy_loop
	popl	%eax			/* bytes left beyond this batch */
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b

	cmpl	$0,_npxproc
	je	i586_bc2
	frstor	0(%esp)
	addl	$108,%esp
i586_bc2:
	lmsw	%dx			/* restore CR0_TS */
	movb	$0xfe,kernel_fpu_lock	/* release the FPU lock */

/*
 * This is a duplicate of the main part of generic_bcopy.  See the comments
 * there.  Jumping into generic_bcopy would cost a whole 0-1 cycles and
 * would mess up high resolution profiling.
 */
	ALIGN_TEXT
small_i586_bcopy:
	shrl	$2,%ecx
	cld
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi		/* overlapping: copy backwards */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx
	std
	rep
	movsb
	movl	20(%esp),%ecx
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret
#endif /* I586_CPU */
573
/*
 * Note: memcpy does not support overlapping copies
 *
 * void *memcpy(void *dst, const void *src, size_t cnt)
 * Returns dst, per the C library contract.
 */
ENTRY(memcpy)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi		/* dst */
	movl	16(%esp),%esi		/* src */
	movl	20(%esp),%ecx		/* cnt */
	movl	%edi,%eax		/* return value = dst */
	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* always copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%esi
	popl	%edi
	ret
595
596
597/*****************************************************************************/
598/* copyout and fubyte family                                                 */
599/*****************************************************************************/
600/*
601 * Access user memory from inside the kernel. These routines and possibly
602 * the math- and DOS emulators should be the only places that do this.
603 *
604 * We have to access the memory with user's permissions, so use a segment
605 * selector with RPL 3. For writes to user space we have to additionally
606 * check the PTE for write permission, because the 386 does not check
607 * write permissions when we are executing with EPL 0. The 486 does check
608 * this if the WP bit is set in CR0, so we can use a simpler version here.
609 *
610 * These routines set curpcb->onfault for the time they execute. When a
611 * protection violation occurs inside the functions, the trap handler
612 * returns to *curpcb->onfault instead of the function.
613 */
614
/* copyout(from_kernel, to_user, len): dispatch through the selected vector */
ENTRY(copyout)
	MEXITCOUNT
	jmp	*_copyout_vector
619
/*
 * generic_copyout(from_kernel, to_user, len) -> 0 or EFAULT
 *
 * Faults during the copy recover through copyout_fault via
 * curpcb->pcb_onfault.  On a plain 386 the PTEs are checked by hand
 * (calling trapwrite() to simulate the missing write-protect fault);
 * 486+ relies on CR0_WP, so only the address-range check is needed.
 * copyout_fault and slow_copyout below are also used by i586_copyout.
 */
ENTRY(generic_copyout)
	movl	_curpcb,%eax
	movl	$copyout_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ebx
	testl	%ebx,%ebx			/* anything to do? */
	jz	done_copyout

	/*
	 * Check explicitly for non-user addresses.  If 486 write protection
	 * is being used, this check is essential because we are in kernel
	 * mode so the h/w does not provide any protection against writing
	 * kernel addresses.
	 */

	/*
	 * First, prevent address wrapping.
	 */
	movl	%edi,%eax
	addl	%ebx,%eax
	jc	copyout_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
	cmpl	$VM_MAXUSER_ADDRESS,%eax
	ja	copyout_fault

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	3f
#endif
/*
 * We have to check each PTE for user write permission.
 * The checking may cause a page fault, so it is important to set
 * up everything for return via copyout_fault before here.
 */
	/* compute number of pages */
	movl	%edi,%ecx
	andl	$PAGE_MASK,%ecx
	addl	%ebx,%ecx
	decl	%ecx
	shrl	$IDXSHIFT+2,%ecx
	incl	%ecx

	/* compute PTE offset for start address */
	movl	%edi,%edx
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl

1:	/* check PTE for each page */
	movb	_PTmap(%edx),%al
	andb	$0x07,%al			/* Pages must be VALID + USERACC + WRITABLE */
	cmpb	$0x07,%al
	je	2f

	/* simulate a trap */
	pushl	%edx
	pushl	%ecx
	shll	$IDXSHIFT,%edx
	pushl	%edx
	call	_trapwrite			/* trapwrite(addr) */
	popl	%edx
	popl	%ecx
	popl	%edx

	testl	%eax,%eax			/* if not ok, return EFAULT */
	jnz	copyout_fault

2:
	addl	$4,%edx
	decl	%ecx
	jnz	1b				/* check next page */
#endif /* I386_CPU */

	/* bcopy(%esi, %edi, %ebx) */
3:
	movl	%ebx,%ecx

#ifdef I586_CPU
	ALIGN_TEXT
slow_copyout:
#endif
	shrl	$2,%ecx
	cld
	rep
	movsl
	movb	%bl,%cl
	andb	$3,%cl
	rep
	movsb

done_copyout:
	popl	%ebx
	popl	%edi
	popl	%esi
	xorl	%eax,%eax			/* success */
	movl	_curpcb,%edx
	movl	%eax,PCB_ONFAULT(%edx)
	ret

	ALIGN_TEXT
copyout_fault:
	popl	%ebx
	popl	%edi
	popl	%esi
	movl	_curpcb,%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret
738
#ifdef I586_CPU
/*
 * i586_copyout(from_kernel, to_user, len) -> 0 or EFAULT
 *
 * Same argument checks as generic_copyout, then uses fastmove() for
 * copies >= 1024 bytes; smaller copies jump into generic_copyout's
 * slow_copyout path.  Shares done_copyout/copyout_fault above.
 */
ENTRY(i586_copyout)
	/*
	 * Duplicated from generic_copyout.  Could be done a bit better.
	 */
	movl	_curpcb,%eax
	movl	$copyout_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ebx
	testl	%ebx,%ebx			/* anything to do? */
	jz	done_copyout

	/*
	 * Check explicitly for non-user addresses.  If 486 write protection
	 * is being used, this check is essential because we are in kernel
	 * mode so the h/w does not provide any protection against writing
	 * kernel addresses.
	 */

	/*
	 * First, prevent address wrapping.
	 */
	movl	%edi,%eax
	addl	%ebx,%eax
	jc	copyout_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
	cmpl	$VM_MAXUSER_ADDRESS,%eax
	ja	copyout_fault

	/* bcopy(%esi, %edi, %ebx) */
3:
	movl	%ebx,%ecx
	/*
	 * End of duplicated code.
	 */

	cmpl	$1024,%ecx
	jb	slow_copyout

	pushl	%ecx
	call	_fastmove
	addl	$4,%esp			/* pop fastmove's len argument */
	jmp	done_copyout
#endif /* I586_CPU */
792
/* copyin(from_user, to_kernel, len): dispatch through the selected vector */
ENTRY(copyin)
	MEXITCOUNT
	jmp	*_copyin_vector
797
/*
 * generic_copyin(from_user, to_kernel, len) -> 0 or EFAULT
 *
 * Faults recover through copyin_fault via curpcb->pcb_onfault.  Only a
 * range check is needed for reads (no PTE walk, unlike copyout).
 * slow_copyin/done_copyin/copyin_fault are also used by i586_copyin.
 */
ENTRY(generic_copyin)
	movl	_curpcb,%eax
	movl	$copyin_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi			/* caddr_t from */
	movl	16(%esp),%edi			/* caddr_t to */
	movl	20(%esp),%ecx			/* size_t  len */

	/*
	 * make sure address is valid
	 */
	movl	%esi,%edx
	addl	%ecx,%edx
	jc	copyin_fault			/* address wrapped */
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	copyin_fault

#ifdef I586_CPU
	ALIGN_TEXT
slow_copyin:
#endif
	movb	%cl,%al				/* stash low len bits */
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

#if defined(I586_CPU)
	ALIGN_TEXT
done_copyin:
#endif /* I586_CPU */
	popl	%edi
	popl	%esi
	xorl	%eax,%eax			/* success */
	movl	_curpcb,%edx
	movl	%eax,PCB_ONFAULT(%edx)
	ret

	ALIGN_TEXT
copyin_fault:
	popl	%edi
	popl	%esi
	movl	_curpcb,%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret
849
#ifdef I586_CPU
/*
 * i586_copyin(from_user, to_kernel, len) -> 0 or EFAULT
 *
 * Same checks as generic_copyin, then fastmove() for copies >= 1024
 * bytes; smaller copies jump into generic_copyin's slow_copyin path.
 * Shares done_copyin/copyin_fault above.
 */
ENTRY(i586_copyin)
	/*
	 * Duplicated from generic_copyin.  Could be done a bit better.
	 */
	movl	_curpcb,%eax
	movl	$copyin_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi			/* caddr_t from */
	movl	16(%esp),%edi			/* caddr_t to */
	movl	20(%esp),%ecx			/* size_t  len */

	/*
	 * make sure address is valid
	 */
	movl	%esi,%edx
	addl	%ecx,%edx
	jc	copyin_fault
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	copyin_fault
	/*
	 * End of duplicated code.
	 */

	cmpl	$1024,%ecx
	jb	slow_copyin

	pushl	%ecx
	call	_fastmove
	addl	$4,%esp			/* pop fastmove's len argument */
	jmp	done_copyin
#endif /* I586_CPU */
883
#if defined(I586_CPU)
/* fastmove(src, dst, len)
	src in %esi
	dst in %edi
	len in %ecx		XXX changed to on stack for profiling
	uses %eax and %edx for tmp. storage
 */
/* XXX use ENTRY() to get profiling.  fastmove() is actually a non-entry. */
/*
 * Pentium FPU-based forward copy used by i586_copyin/i586_copyout.
 * Requires both pointers 8-byte aligned and len >= 64 to take the fast
 * path; otherwise falls through to a `rep movs' tail.  The application's
 * FPU state, if any, is saved into curpcb->pcb_savefpu, copied aside on
 * the stack, and put back afterwards so cpu_switch()/savectx() never
 * see our temporary use of the FPU.
 */
ENTRY(fastmove)
	movl	4(%esp),%ecx
	cmpl	$63,%ecx
	jbe	fastmove_tail

	testl	$7,%esi	/* check if src addr is multiple of 8 */
	jnz	fastmove_tail

	testl	$7,%edi	/* check if dst addr is multiple of 8 */
	jnz	fastmove_tail

	pushl	%ebp
	movl	%esp,%ebp
	subl	$PCB_SAVEFPU_SIZE,%esp		/* frame for the FPU state copy */

/* if (npxproc != NULL) { */
	cmpl	$0,_npxproc
	je	6f
/*    fnsave(&curpcb->pcb_savefpu); */
	movl	_curpcb,%eax
	fnsave	PCB_SAVEFPU(%eax)
/*   npxproc = NULL; */
	movl	$0,_npxproc
/* } */
6:
/* now we own the FPU. */

/*
 * The process' FP state is saved in the pcb, but if we get
 * switched, the cpu_switch() will store our FP state in the
 * pcb.  It should be possible to avoid all the copying for
 * this, e.g., by setting a flag to tell cpu_switch() to
 * save the state somewhere else.
 */
/* tmp = curpcb->pcb_savefpu; */
	pushl	%edi
	pushl	%esi
	pushl	%ecx
	leal	-PCB_SAVEFPU_SIZE(%ebp),%edi
	movl	_curpcb,%esi
	addl	$PCB_SAVEFPU,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	popl	%ecx
	popl	%esi
	popl	%edi
/* stop_emulating(); */
	clts
/* npxproc = curproc; */
	movl	_curproc,%eax
	movl	%eax,_npxproc
4:
	/* outer loop: one <=1792-byte batch per iteration; total on stack */
	pushl	%ecx
	cmpl	$1792,%ecx
	jbe	2f
	movl	$1792,%ecx
2:
	subl	%ecx,0(%esp)		/* 0(%esp) = bytes left after this batch */
	cmpl	$256,%ecx
	jb	5f
	pushl	%esi
	pushl	%ecx
	ALIGN_TEXT
3:
	/* touch one longword per 32 bytes to pre-load the source */
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	popl	%ecx
	popl	%esi
5:
	ALIGN_TEXT
fastmove_loop:
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$-64,%ecx
	addl	$64,%esi
	addl	$64,%edi
	cmpl	$63,%ecx
	ja	fastmove_loop
	popl	%eax			/* bytes left beyond this batch */
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b

/* curpcb->pcb_savefpu = tmp; */
	pushl	%edi
	pushl	%esi
	pushl	%ecx
	movl	_curpcb,%edi
	addl	$PCB_SAVEFPU,%edi
	leal	-PCB_SAVEFPU_SIZE(%ebp),%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	popl	%ecx
	popl	%esi
	popl	%edi

/* start_emulating(); */
	smsw	%ax
	orb	$CR0_TS,%al
	lmsw	%ax
/* npxproc = NULL; */
	movl	$0,_npxproc
	movl	%ebp,%esp
	popl	%ebp
	/* fall through to copy the remaining < 64 bytes */

	ALIGN_TEXT
fastmove_tail:
	movb	%cl,%al			/* stash low len bits */
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

	ret
#endif /* I586_CPU */
1039
/*
 * fu{byte,sword,word} : fetch a byte (sword, word) from user memory
 *
 * fuword(addr) -> 32-bit value, or -1 (via fusufault) on a bad address.
 */
ENTRY(fuword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx			/* from */

	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address is valid */
	ja	fusufault

	movl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret
1054
/*
 * These two routines are called from the profiling code, potentially
 * at interrupt time. If they fail, that's okay, good things will
 * happen later. Fail all the time for now - until the trap code is
 * able to deal with this.
 */
ALTENTRY(suswintr)
ENTRY(fuswintr)
	movl	$-1,%eax			/* unconditional failure */
	ret
1065
/* fusword(addr) -> zero-extended 16-bit value, or -1 on fault */
ENTRY(fusword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-2,%edx
	ja	fusufault

	movzwl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret
1077
/* fubyte(addr) -> zero-extended byte, or -1 on fault */
ENTRY(fubyte)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-1,%edx
	ja	fusufault

	movzbl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

	ALIGN_TEXT
/* common fault recovery for the fu*/su* family: clear onfault, return -1 */
fusufault:
	movl	_curpcb,%ecx
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)
	decl	%eax				/* %eax = -1 */
	ret
1097
/*
 * su{byte,sword,word}: write a byte (word, longword) to user memory
 *
 * suword(addr, value) -> 0, or -1 (via fusufault) on a bad address.
 * On a plain 386 the PTE is checked by hand (trapwrite() simulates the
 * missing kernel-mode write-protect fault); 486+ relies on CR0_WP.
 */
ENTRY(suword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	2f				/* we only have to set the right segment selector */
#endif /* I486_CPU || I586_CPU || I686_CPU */

	/* XXX - page boundary crossing is still not handled */
	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl
	movb	_PTmap(%edx),%dl
	andb	$0x7,%dl			/* must be VALID + USERACC + WRITE */
	cmpb	$0x7,%dl
	je	1f

	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	movl	_curpcb,%ecx			/* restore trashed register */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

2:
	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address validity */
	ja	fusufault

	movl	8(%esp),%eax
	movl	%eax,(%edx)
	xorl	%eax,%eax			/* success */
	movl	%eax,PCB_ONFAULT(%ecx)
	ret
1142
/* susword(addr, value) -> 0, or -1 on fault; 16-bit store variant of suword */
ENTRY(susword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	2f
#endif /* I486_CPU || I586_CPU || I686_CPU */

	/* XXX - page boundary crossing is still not handled */
	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl
	movb	_PTmap(%edx),%dl
	andb	$0x7,%dl			/* must be VALID + USERACC + WRITE */
	cmpb	$0x7,%dl
	je	1f

	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	movl	_curpcb,%ecx			/* restore trashed register */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

2:
	cmpl	$VM_MAXUSER_ADDRESS-2,%edx	/* verify address validity */
	ja	fusufault

	movw	8(%esp),%ax
	movw	%ax,(%edx)
	xorl	%eax,%eax			/* success */
	movl	%eax,PCB_ONFAULT(%ecx)
	ret
1184
/* subyte/suibyte(addr, value) -> 0, or -1 on fault; byte variant of suword */
ALTENTRY(suibyte)
ENTRY(subyte)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	2f
#endif /* I486_CPU || I586_CPU || I686_CPU */

	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl
	movb	_PTmap(%edx),%dl
	andb	$0x7,%dl			/* must be VALID + USERACC + WRITE */
	cmpb	$0x7,%dl
	je	1f

	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	movl	_curpcb,%ecx			/* restore trashed register */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

2:
	cmpl	$VM_MAXUSER_ADDRESS-1,%edx	/* verify address validity */
	ja	fusufault

	movb	8(%esp),%al
	movb	%al,(%edx)
	xorl	%eax,%eax			/* success */
	movl	%eax,PCB_ONFAULT(%ecx)
	ret
1226
/*
 * copyinstr(from, to, maxlen, int *lencopied)
 *	copy a string from from to to, stop when a 0 character is reached.
 *	return ENAMETOOLONG if string is longer than maxlen, and
 *	EFAULT on protection violations. If lencopied is non-zero,
 *	return the actual length in *lencopied.
 */
ENTRY(copyinstr)
	pushl	%esi
	pushl	%edi
	movl	_curpcb,%ecx
	movl	$cpystrflt,PCB_ONFAULT(%ecx)	/* fault -> EFAULT return */

	movl	12(%esp),%esi			/* %esi = from */
	movl	16(%esp),%edi			/* %edi = to */
	movl	20(%esp),%edx			/* %edx = maxlen */

	movl	$VM_MAXUSER_ADDRESS,%eax

	/* make sure 'from' is within bounds */
	subl	%esi,%eax
	jbe	cpystrflt

	/* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */
	cmpl	%edx,%eax
	jae	1f
	movl	%eax,%edx
	movl	%eax,20(%esp)			/* clamp the saved maxlen too */
1:
	incl	%edx
	cld

2:
	decl	%edx
	jz	3f				/* out of room */

	lodsb
	stosb
	orb	%al,%al				/* NUL terminator? */
	jnz	2b

	/* Success -- 0 byte reached */
	decl	%edx
	xorl	%eax,%eax
	jmp	cpystrflt_x
3:
	/* edx is zero - return ENAMETOOLONG or EFAULT */
	cmpl	$VM_MAXUSER_ADDRESS,%esi	/* stopped at the user limit? */
	jae	cpystrflt
4:
	movl	$ENAMETOOLONG,%eax
	jmp	cpystrflt_x

cpystrflt:
	movl	$EFAULT,%eax

cpystrflt_x:
	/* set *lencopied and return %eax */
	movl	_curpcb,%ecx
	movl	$0,PCB_ONFAULT(%ecx)
	movl	20(%esp),%ecx
	subl	%edx,%ecx			/* bytes copied = maxlen - remaining */
	movl	24(%esp),%edx
	testl	%edx,%edx
	jz	1f
	movl	%ecx,(%edx)
1:
	popl	%edi
	popl	%esi
	ret
1297
1298
/*
 * copystr(from, to, maxlen, int *lencopied)
 *
 * Kernel-to-kernel string copy: like copyinstr but with no user-address
 * checks and no fault recovery.  Returns 0 or ENAMETOOLONG.
 */
ENTRY(copystr)
	pushl	%esi
	pushl	%edi

	movl	12(%esp),%esi			/* %esi = from */
	movl	16(%esp),%edi			/* %edi = to */
	movl	20(%esp),%edx			/* %edx = maxlen */
	incl	%edx
	cld
1:
	decl	%edx
	jz	4f				/* out of room */
	lodsb
	stosb
	orb	%al,%al				/* NUL terminator? */
	jnz	1b

	/* Success -- 0 byte reached */
	decl	%edx
	xorl	%eax,%eax
	jmp	6f
4:
	/* edx is zero -- return ENAMETOOLONG */
	movl	$ENAMETOOLONG,%eax

6:
	/* set *lencopied and return %eax */
	movl	20(%esp),%ecx
	subl	%edx,%ecx			/* bytes copied = maxlen - remaining */
	movl	24(%esp),%edx
	testl	%edx,%edx
	jz	7f
	movl	%ecx,(%edx)
7:
	popl	%edi
	popl	%esi
	ret
1339
/*
 * bcmp(b1, b2, len) -> 0 if the two buffers are identical, 1 otherwise.
 * Compares longword-wise then the 0-3 leftover bytes.
 */
ENTRY(bcmp)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi
	movl	16(%esp),%esi
	movl	20(%esp),%edx
	xorl	%eax,%eax		/* assume equal */

	movl	%edx,%ecx
	shrl	$2,%ecx
	cld					/* compare forwards */
	repe
	cmpsl
	jne	1f

	movl	%edx,%ecx
	andl	$3,%ecx
	repe
	cmpsb
	je	2f
1:
	incl	%eax			/* mismatch: return 1 */
2:
	popl	%esi
	popl	%edi
	ret
1366
1367
/*
 * Handling of special 386 registers and descriptor tables etc
 */
/* void lgdt(struct region_descriptor *rdp); */
ENTRY(lgdt)
	/* reload the descriptor table */
	movl	4(%esp),%eax
	lgdt	(%eax)

	/* flush the prefetch q */
	jmp	1f
	nop
1:
	/* reload "stale" selectors */
	movl	$KDSEL,%eax
	movl	%ax,%ds
	movl	%ax,%es
	movl	%ax,%ss

	/* reload code selector by turning return into intersegmental return */
	movl	(%esp),%eax
	pushl	%eax
#	movl	$KCSEL,4(%esp)
	movl	$8,4(%esp)		/* XXX hard-coded KCSEL (see commented line) */
	lret
1393
/*
 * void lidt(struct region_descriptor *rdp);
 * Load the interrupt descriptor table register.
 */
ENTRY(lidt)
	movl	4(%esp),%eax
	lidt	(%eax)
	ret
1401
/*
 * void lldt(u_short sel)
 * Load the local descriptor table register from the selector argument.
 */
ENTRY(lldt)
	lldt	4(%esp)
	ret
1408
/*
 * void ltr(u_short sel)
 * Load the task register from the selector argument.
 */
ENTRY(ltr)
	ltr	4(%esp)
	ret
1415
/* ssdtosd(*ssdp,*sdp)
 *
 * Pack a software segment descriptor (base/limit/attributes in separate
 * words at 0/4/8(%ecx)) into the hardware i386 8-byte descriptor layout
 * at *sdp.  Bit shuffling follows the 386 descriptor format -- see the
 * processor manual; presumably ssdp is a struct soft_segment_descriptor
 * (TODO confirm against machine/segments.h).
 */
ENTRY(ssdtosd)
	pushl	%ebx
	movl	8(%esp),%ecx		/* %ecx = ssdp */
	movl	8(%ecx),%ebx
	shll	$16,%ebx
	movl	(%ecx),%edx
	roll	$16,%edx
	movb	%dh,%bl
	movb	%dl,%bh
	rorl	$8,%ebx
	movl	4(%ecx),%eax
	movw	%ax,%dx
	andl	$0xf0000,%eax
	orl	%eax,%ebx
	movl	12(%esp),%ecx		/* %ecx = sdp */
	movl	%edx,(%ecx)		/* low descriptor word */
	movl	%ebx,4(%ecx)		/* high descriptor word */
	popl	%ebx
	ret
1436
/* load_cr0(cr0) -- write the argument into control register 0 */
ENTRY(load_cr0)
	movl	4(%esp),%eax
	movl	%eax,%cr0
	ret
1442
/* rcr0() -- return the current value of control register 0 */
ENTRY(rcr0)
	movl	%cr0,%eax
	ret
1447
/* rcr3() -- return CR3 (the page directory base register) */
ENTRY(rcr3)
	movl	%cr3,%eax
	ret
1452
/* void load_cr3(caddr_t cr3) -- set the page directory base (flushes TLB) */
ENTRY(load_cr3)
	movl	4(%esp),%eax
	movl	%eax,%cr3
	ret
1458
1459
1460/*****************************************************************************/
1461/* setjump, longjump                                                         */
1462/*****************************************************************************/
1463
1464ENTRY(setjmp)
1465	movl	4(%esp),%eax
1466	movl	%ebx,(%eax)			/* save ebx */
1467	movl	%esp,4(%eax)			/* save esp */
1468	movl	%ebp,8(%eax)			/* save ebp */
1469	movl	%esi,12(%eax)			/* save esi */
1470	movl	%edi,16(%eax)			/* save edi */
1471	movl	(%esp),%edx			/* get rta */
1472	movl	%edx,20(%eax)			/* save eip */
1473	xorl	%eax,%eax			/* return(0); */
1474	ret
1475
/*
 * longjmp(buf): restore the context saved by setjmp(buf) and return 1
 * from the corresponding setjmp call.
 */
ENTRY(longjmp)
	movl	4(%esp),%eax
	movl	(%eax),%ebx			/* restore ebx */
	movl	4(%eax),%esp			/* restore esp */
	movl	8(%eax),%ebp			/* restore ebp */
	movl	12(%eax),%esi			/* restore esi */
	movl	16(%eax),%edi			/* restore edi */
	movl	20(%eax),%edx			/* get rta */
	movl	%edx,(%esp)			/* put in return frame */
	xorl	%eax,%eax			/* return(1); */
	incl	%eax
	ret
1488
/*
 * Here for doing BB-profiling (gcc -a).
 * We rely on the "bbset" instead, but need a dummy function.
 * Sets *(int *)arg = 1 and returns.
 */
NON_GPROF_ENTRY(__bb_init_func)
	movl	4(%esp),%eax
	movl	$1,(%eax)
	.byte	0xc3				/* avoid macro for `ret' */
1497