support.s revision 18842
/*-
 * Copyright (c) 1993 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	$Id: support.s,v 1.40 1996/10/09 18:16:17 bde Exp $
 */

#include "opt_cpu.h"
#include "opt_temporary.h"			/* for I586_*_B* */

#include <machine/asmacros.h>
#include <machine/cputypes.h>
#include <machine/specialreg.h>

#include "assym.s"

#define KDSEL		0x10			/* kernel data selector */
#define IDXSHIFT	10

	.data
	.globl	_bcopy_vector
_bcopy_vector:
	.long	_generic_bcopy
	.globl	_bzero
_bzero:
	.long	_generic_bzero
	.globl	_ovbcopy_vector
_ovbcopy_vector:
	.long	_generic_bcopy
kernel_fpu_lock:
	.byte	0xfe
	.space	3

	.text

/*
 * bcopy family
 * void bzero(void *buf, u_int len)
 */
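/*
 * All of the bzero variants below are plug-compatible with this
 * illustrative C version:
 *
 *	void
 *	bzero(void *buf, u_int len)
 *	{
 *		char *p = buf;
 *
 *		while (len-- != 0)
 *			*p++ = 0;
 *	}
 *
 * generic_bzero clears a longword at a time with `rep stosl' and then
 * finishes the remaining 0-3 bytes with `rep stosb'.
 */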

ENTRY(generic_bzero)
	pushl	%edi
	movl	8(%esp),%edi
	movl	12(%esp),%ecx
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx
	rep
	stosb
	popl	%edi
	ret

#if defined(I486_CPU)
ENTRY(i486_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx
	xorl	%eax,%eax
/*
 * do 64 byte chunks first
 *
 * XXX this is probably over-unrolled at least for DX2's
 */
2:
	cmpl	$64,%ecx
	jb	3f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	movl	%eax,16(%edx)
	movl	%eax,20(%edx)
	movl	%eax,24(%edx)
	movl	%eax,28(%edx)
	movl	%eax,32(%edx)
	movl	%eax,36(%edx)
	movl	%eax,40(%edx)
	movl	%eax,44(%edx)
	movl	%eax,48(%edx)
	movl	%eax,52(%edx)
	movl	%eax,56(%edx)
	movl	%eax,60(%edx)
	addl	$64,%edx
	subl	$64,%ecx
	jnz	2b
	ret

/*
 * do 16 byte chunks
 */
	SUPERALIGN_TEXT
3:
	cmpl	$16,%ecx
	jb	4f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	addl	$16,%edx
	subl	$16,%ecx
	jnz	3b
	ret

/*
 * do 4 byte chunks
 */
	SUPERALIGN_TEXT
4:
	cmpl	$4,%ecx
	jb	5f
	movl	%eax,(%edx)
	addl	$4,%edx
	subl	$4,%ecx
	jnz	4b
	ret

/*
 * do 1 byte chunks
 * a jump table seems to be faster than a loop or more range reductions
 *
 * XXX need a const section for non-text
 */
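/*
 * With 0 <= %ecx <= 3 bytes left, the indirect `jmp *jtab(,%ecx,4)'
 * below loads the handler address from jtab[%ecx] and dispatches in a
 * single jump; e.g. %ecx == 3 lands on do3, which stores a word
 * followed by a byte.
 */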
	.data
jtab:
	.long	do0
	.long	do1
	.long	do2
	.long	do3

	.text
	SUPERALIGN_TEXT
5:
	jmp	*jtab(,%ecx,4)

	SUPERALIGN_TEXT
do3:
	movw	%ax,(%edx)
	movb	%al,2(%edx)
	ret

	SUPERALIGN_TEXT
do2:
	movw	%ax,(%edx)
	ret

	SUPERALIGN_TEXT
do1:
	movb	%al,(%edx)
	ret

	SUPERALIGN_TEXT
do0:
	ret
#endif

#if defined(I586_CPU) || defined(I686_CPU)
ENTRY(i586_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx

	/*
	 * The FPU register method is twice as fast as the integer register
	 * method unless the target is in the L1 cache and we pre-allocate a
	 * cache line for it (then the integer register method is 4-5 times
	 * faster).  However, we never pre-allocate cache lines, since that
	 * would make the integer method 25% or more slower for the common
	 * case when the target isn't in either the L1 cache or the L2 cache.
	 * Thus we normally use the FPU register method unless the overhead
	 * would be too large.
	 */
	cmpl	$256,%ecx	/* empirical; clts, fninit, smsw cost a lot */
	jb	intreg_i586_bzero

	/*
	 * The FPU registers may belong to an application or to fastmove()
	 * or to another invocation of bcopy() or ourself in a higher level
	 * interrupt or trap handler.  Preserving the registers is
	 * complicated since we avoid it if possible at all levels.  We
	 * want to localize the complications even when that increases them.
	 * Here the extra work involves preserving CR0_TS in TS.
	 * `npxproc != NULL' is supposed to be the condition that all the
	 * FPU resources belong to an application, but npxproc and CR0_TS
	 * aren't set atomically enough for this condition to work in
	 * interrupt handlers.
	 *
	 * Case 1: FPU registers belong to the application: we must preserve
	 * the registers if we use them, so we only use the FPU register
	 * method if the target size is large enough to amortize the extra
	 * overhead for preserving them.  CR0_TS must be preserved although
	 * it is very likely to end up as set.
	 *
	 * Case 2: FPU registers belong to fastmove(): fastmove() currently
	 * makes the registers look like they belong to an application so
	 * that cpu_switch() and savectx() don't have to know about it, so
	 * this case reduces to case 1.
	 *
	 * Case 3: FPU registers belong to the kernel: don't use the FPU
	 * register method.  This case is unlikely, and supporting it would
	 * be more complicated and might take too much stack.
	 *
	 * Case 4: FPU registers don't belong to anyone: the FPU registers
	 * don't need to be preserved, so we always use the FPU register
	 * method.  CR0_TS must be preserved although it is very likely to
	 * always end up as clear.
	 */
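	/*
	 * In outline, the dispatch below is roughly:
	 *
	 *	if (npxproc != NULL) {		(cases 1 and 2)
	 *		if (len < 256 + 184 || !try_lock(&kernel_fpu_lock))
	 *			goto intreg_i586_bzero;
	 *		save CR0_TS (smsw), clts, fnsave onto the stack;
	 *	} else {			(case 4)
	 *		if (!try_lock(&kernel_fpu_lock))
	 *			goto intreg_i586_bzero;
	 *		save CR0_TS (smsw), clts, fninit;
	 *	}
	 *
	 * where try_lock is the `sarb $1,kernel_fpu_lock' / `jc' pair.
	 */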
	cmpl	$0,_npxproc
	je	i586_bz1
	cmpl	$256+184,%ecx		/* empirical; not quite 2*108 more */
	jb	intreg_i586_bzero
	sarb	$1,kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	subl	$108,%esp
	fnsave	0(%esp)
	jmp	i586_bz2

i586_bz1:
	sarb	$1,kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	fninit				/* XXX should avoid needing this */
i586_bz2:
	fldz

	/*
	 * Align to an 8 byte boundary (misalignment in the main loop would
	 * cost a factor of >= 2).  Avoid jumps (at little cost if it is
	 * already aligned) by always zeroing 8 bytes and using the part up
	 * to the _next_ alignment position.
	 */
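	/*
	 * Example: for %edx == 0x1005 and %ecx == 0x20, the fstl below
	 * zeroes 0x1005-0x100c, %edx is then rounded up to 0x1008, and
	 * %ecx drops by 3 to account for the bytes already covered by the
	 * unaligned store.
	 */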
	fstl	0(%edx)
	addl	%edx,%ecx		/* part of %ecx -= new_%edx - %edx */
	addl	$8,%edx
	andl	$~7,%edx
	subl	%edx,%ecx		/* rest of %ecx -= new_%edx */

	/*
	 * Similarly align `len' to a multiple of 8.
	 */
	fstl	-8(%edx,%ecx)
	decl	%ecx
	andl	$~7,%ecx

	/*
	 * This wouldn't be any faster if it were unrolled, since the loop
	 * control instructions are much faster than the fstl and/or done
	 * in parallel with it so their overhead is insignificant.
	 */
fpureg_i586_bzero_loop:
	fstl	0(%edx)
	addl	$8,%edx
	subl	$8,%ecx
	cmpl	$8,%ecx
	jae	fpureg_i586_bzero_loop

	cmpl	$0,_npxproc
	je	i586_bz3
	frstor	0(%esp)
	addl	$108,%esp
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock
	ret

i586_bz3:
	fstpl	%st(0)
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock
	ret

intreg_i586_bzero:
	/*
	 * `rep stos' seems to be the best method in practice for small
	 * counts.  Fancy methods usually take too long to start up due
	 * to cache and BTB misses.
	 */
	pushl	%edi
	movl	%edx,%edi
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx
	jne	1f
	popl	%edi
	ret

1:
	rep
	stosb
	popl	%edi
	ret
#endif /* I586_CPU || I686_CPU */

/* fillw(pat, base, cnt) */
ENTRY(fillw)
	pushl	%edi
	movl	8(%esp),%eax
	movl	12(%esp),%edi
	movl	16(%esp),%ecx
	cld
	rep
	stosw
	popl	%edi
	ret

ENTRY(bcopyb)
bcopyb:
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx
	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f
	cld					/* nope, copy forwards */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards. */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	std
	rep
	movsb
	popl	%edi
	popl	%esi
	cld
	ret

ENTRY(bcopy)
	MEXITCOUNT
	jmp	*_bcopy_vector

ENTRY(ovbcopy)
	MEXITCOUNT
	jmp	*_ovbcopy_vector

/*
 * generic_bcopy(src, dst, cnt)
 *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
 */
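/*
 * The single unsigned comparison below implements the overlap test:
 * (u_int)(dst - src) < len holds exactly when src <= dst < src + len,
 * i.e. when a forward copy would clobber source bytes that have not
 * been copied yet, so only that case takes the backwards path.
 */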
ENTRY(generic_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* nope, copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx				/* any fractional bytes? */
	std
	rep
	movsb
	movl	20(%esp),%ecx			/* copy remainder by 32-bit words */
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret

ENTRY(i586_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	cmpl	$1024,%ecx
	jb	small_i586_bcopy

	sarb	$1,kernel_fpu_lock
	jc	small_i586_bcopy
	cmpl	$0,_npxproc
	je	i586_bc1
	smsw	%dx
	clts
	subl	$108,%esp
	fnsave	0(%esp)
	jmp	4f

i586_bc1:
	smsw	%dx
	clts
	fninit				/* XXX should avoid needing this */

	ALIGN_TEXT
4:
	pushl	%ecx
#define	DCACHE_SIZE	8192
	cmpl	$(DCACHE_SIZE-512)/2,%ecx
	jbe	2f
	movl	$(DCACHE_SIZE-512)/2,%ecx
2:
	subl	%ecx,0(%esp)
	cmpl	$256,%ecx
	jb	5f			/* XXX should prefetch if %ecx >= 32 */
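	/*
	 * The dummy loads below touch one longword in each 32-byte cache
	 * line of the next chunk, pulling the source into the cache
	 * before the FPU loop streams through it.
	 */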
	pushl	%esi
	pushl	%ecx
	ALIGN_TEXT
3:
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	popl	%ecx
	popl	%esi
5:
	ALIGN_TEXT
large_i586_bcopy_loop:
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$64,%esi
	addl	$64,%edi
	subl	$64,%ecx
	cmpl	$64,%ecx
	jae	large_i586_bcopy_loop
	popl	%eax
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b

	cmpl	$0,_npxproc
	je	i586_bc2
	frstor	0(%esp)
	addl	$108,%esp
i586_bc2:
	lmsw	%dx
	movb	$0xfe,kernel_fpu_lock

/*
 * This is a duplicate of the main part of generic_bcopy.  See the comments
 * there.  Jumping into generic_bcopy would cost a whole 0-1 cycles and
 * would mess up high resolution profiling.
 */
	ALIGN_TEXT
small_i586_bcopy:
	shrl	$2,%ecx
	cld
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx
	std
	rep
	movsb
	movl	20(%esp),%ecx
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret

/*
 * Note: memcpy does not support overlapping copies
 */
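/*
 * Also note the libc-style interface: the argument order is
 * (dst, src, len), the reverse of bcopy, and the destination pointer
 * is returned in %eax.
 */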
ENTRY(memcpy)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi
	movl	16(%esp),%esi
	movl	20(%esp),%ecx
	movl	%edi,%eax
	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%esi
	popl	%edi
	ret


/*****************************************************************************/
/* copyout and fubyte family                                                 */
/*****************************************************************************/
/*
 * Access user memory from inside the kernel. These routines and possibly
 * the math- and DOS emulators should be the only places that do this.
 *
 * We have to access the memory with user's permissions, so use a segment
 * selector with RPL 3. For writes to user space we have to additionally
 * check the PTE for write permission, because the 386 does not check
 * write permissions when we are executing with EPL 0. The 486 does check
 * this if the WP bit is set in CR0, so we can use a simpler version here.
 *
 * These routines set curpcb->onfault for the time they execute. When a
 * protection violation occurs inside the functions, the trap handler
 * returns to *curpcb->onfault instead of the function.
 */
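/*
 * The fault protocol, in rough C:
 *
 *	curpcb->pcb_onfault = copyout_fault;
 *	... touch user memory ...
 *	curpcb->pcb_onfault = NULL;
 *	return (0);
 * copyout_fault:
 *	curpcb->pcb_onfault = NULL;
 *	return (EFAULT);
 */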


ENTRY(copyout)					/* copyout(from_kernel, to_user, len) */
	movl	_curpcb,%eax
	movl	$copyout_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ebx
	testl	%ebx,%ebx			/* anything to do? */
	jz	done_copyout

	/*
	 * Check explicitly for non-user addresses.  If 486 write protection
	 * is being used, this check is essential because we are in kernel
	 * mode so the h/w does not provide any protection against writing
	 * kernel addresses.
	 */

	/*
	 * First, prevent address wrapping.
	 */
	movl	%edi,%eax
	addl	%ebx,%eax
	jc	copyout_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
	cmpl	$VM_MAXUSER_ADDRESS,%eax
	ja	copyout_fault
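	/*
	 * %eax now holds the end address dst + len with no carry, so a
	 * single unsigned compare against VM_MAXUSER_ADDRESS (itself an
	 * end address, as the XXX comment above notes) covers the whole
	 * destination range.
	 */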

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	3f
#endif
/*
 * We have to check each PTE for user write permission.
 * The checking may cause a page fault, so it is important to set
 * up everything for return via copyout_fault before here.
 */
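/*
 * With IDXSHIFT == 10, `shrl $IDXSHIFT+2' divides by the 4K page
 * size, and `(va >> IDXSHIFT) & ~3' equals (va >> PAGE_SHIFT) * 4,
 * the byte offset of the 4-byte PTE within the recursive map at
 * _PTmap.
 */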
	/* compute number of pages */
	movl	%edi,%ecx
	andl	$PAGE_MASK,%ecx
	addl	%ebx,%ecx
	decl	%ecx
	shrl	$IDXSHIFT+2,%ecx
	incl	%ecx

	/* compute PTE offset for start address */
	movl	%edi,%edx
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl

1:	/* check PTE for each page */
	movb	_PTmap(%edx),%al
	andb	$0x07,%al			/* Pages must be VALID + USERACC + WRITABLE */
	cmpb	$0x07,%al
	je	2f

	/* simulate a trap */
	pushl	%edx
	pushl	%ecx
	shll	$IDXSHIFT,%edx
	pushl	%edx
	call	_trapwrite			/* trapwrite(addr) */
	popl	%edx
	popl	%ecx
	popl	%edx

	testl	%eax,%eax			/* if not ok, return EFAULT */
	jnz	copyout_fault

2:
	addl	$4,%edx
	decl	%ecx
	jnz	1b				/* check next page */
#endif /* I386_CPU */

	/* bcopy(%esi, %edi, %ebx) */
3:
	movl	%ebx,%ecx
#if defined(I586_CPU) && defined(I586_FAST_BCOPY)
	cmpl	$1024,%ecx
	jb	slow_copyout

#if defined(I386_CPU) || defined(I486_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_586,_cpu_class
	jne	slow_copyout
#endif /* I386_CPU || I486_CPU || I686_CPU */

	pushl	%ecx
	call	_fastmove
	addl	$4,%esp
	jmp	done_copyout

	ALIGN_TEXT
slow_copyout:
#endif /* I586_CPU && I586_FAST_BCOPY */
	shrl	$2,%ecx
	cld
	rep
	movsl
	movb	%bl,%cl
	andb	$3,%cl
	rep
	movsb

done_copyout:
	popl	%ebx
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	movl	_curpcb,%edx
	movl	%eax,PCB_ONFAULT(%edx)
	ret

	ALIGN_TEXT
copyout_fault:
	popl	%ebx
	popl	%edi
	popl	%esi
	movl	_curpcb,%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret

/* copyin(from_user, to_kernel, len) */
ENTRY(copyin)
	movl	_curpcb,%eax
	movl	$copyin_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi			/* caddr_t from */
	movl	16(%esp),%edi			/* caddr_t to */
	movl	20(%esp),%ecx			/* size_t  len */

	/*
	 * make sure address is valid
	 */
	movl	%esi,%edx
	addl	%ecx,%edx
	jc	copyin_fault
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	copyin_fault

#if defined(I586_CPU) && defined(I586_FAST_BCOPY)
	cmpl	$1024,%ecx
	jb	slow_copyin

#if defined(I386_CPU) || defined(I486_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_586,_cpu_class
	jne	slow_copyin
#endif /* I386_CPU || I486_CPU || I686_CPU */

	pushl	%ecx
	call	_fastmove
	addl	$4,%esp
	jmp	done_copyin

	ALIGN_TEXT
slow_copyin:
#endif /* I586_CPU && I586_FAST_BCOPY */
	movb	%cl,%al
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

#if defined(I586_CPU) && defined(I586_FAST_BCOPY)
	ALIGN_TEXT
done_copyin:
#endif /* I586_CPU && I586_FAST_BCOPY */
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	movl	_curpcb,%edx
	movl	%eax,PCB_ONFAULT(%edx)
	ret

	ALIGN_TEXT
copyin_fault:
	popl	%edi
	popl	%esi
	movl	_curpcb,%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret

#if defined(I586_CPU) && defined(I586_FAST_BCOPY)
/* fastmove(src, dst, len)
	src in %esi
	dst in %edi
	len in %ecx		XXX changed to on stack for profiling
	uses %eax and %edx for tmp. storage
 */
/* XXX use ENTRY() to get profiling.  fastmove() is actually a non-entry. */
ENTRY(fastmove)
	movl	4(%esp),%ecx
	cmpl	$63,%ecx
	jbe	fastmove_tail

	testl	$7,%esi	/* check if src addr is multiple of 8 */
	jnz	fastmove_tail

	testl	$7,%edi	/* check if dst addr is multiple of 8 */
	jnz	fastmove_tail

	pushl	%ebp
	movl	%esp,%ebp
	subl	$PCB_SAVEFPU_SIZE,%esp

/* if (npxproc != NULL) { */
	cmpl	$0,_npxproc
	je	6f
/*    fnsave(&curpcb->pcb_savefpu); */
	movl	_curpcb,%eax
	fnsave	PCB_SAVEFPU(%eax)
/*   npxproc = NULL; */
	movl	$0,_npxproc
/* } */
6:
/* now we own the FPU. */

/*
 * The process' FP state is saved in the pcb, but if we get
 * switched, the cpu_switch() will store our FP state in the
 * pcb.  It should be possible to avoid all the copying for
 * this, e.g., by setting a flag to tell cpu_switch() to
 * save the state somewhere else.
 */
/* tmp = curpcb->pcb_savefpu; */
	pushl	%edi
	pushl	%esi
	pushl	%ecx
	leal	-PCB_SAVEFPU_SIZE(%ebp),%edi
	movl	_curpcb,%esi
	addl	$PCB_SAVEFPU,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	popl	%ecx
	popl	%esi
	popl	%edi
/* stop_emulating(); */
	clts
/* npxproc = curproc; */
	movl	_curproc,%eax
	movl	%eax,_npxproc
4:
	pushl	%ecx
	cmpl	$1792,%ecx
	jbe	2f
	movl	$1792,%ecx
2:
	subl	%ecx,0(%esp)
	cmpl	$256,%ecx
	jb	5f
	pushl	%esi
	pushl	%ecx
	ALIGN_TEXT
3:
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	popl	%ecx
	popl	%esi
5:
	ALIGN_TEXT
fastmove_loop:
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$-64,%ecx
	addl	$64,%esi
	addl	$64,%edi
	cmpl	$63,%ecx
	ja	fastmove_loop
	popl	%eax
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b

/* curpcb->pcb_savefpu = tmp; */
	pushl	%edi
	pushl	%esi
	pushl	%ecx
	movl	_curpcb,%edi
	addl	$PCB_SAVEFPU,%edi
	leal	-PCB_SAVEFPU_SIZE(%ebp),%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	popl	%ecx
	popl	%esi
	popl	%edi

/* start_emulating(); */
	smsw	%ax
	orb	$CR0_TS,%al
	lmsw	%ax
/* npxproc = NULL; */
	movl	$0,_npxproc
	movl	%ebp,%esp
	popl	%ebp

	ALIGN_TEXT
fastmove_tail:
	movb	%cl,%al
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

	ret
#endif /* I586_CPU && I586_FAST_BCOPY */

/*
 * fu{byte,sword,word} : fetch a byte (sword, word) from user memory
 */
ENTRY(fuword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx			/* from */

	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address is valid */
	ja	fusufault

	movl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

/*
 * These two routines are called from the profiling code, potentially
 * at interrupt time. If they fail, that's okay, good things will
 * happen later. Fail all the time for now - until the trap code is
 * able to deal with this.
 */
ALTENTRY(suswintr)
ENTRY(fuswintr)
	movl	$-1,%eax
	ret

ENTRY(fusword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-2,%edx
	ja	fusufault

	movzwl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

ENTRY(fubyte)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-1,%edx
	ja	fusufault

	movzbl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

	ALIGN_TEXT
fusufault:
	movl	_curpcb,%ecx
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)
	decl	%eax
	ret

/*
 * su{byte,sword,word}: write a byte (word, longword) to user memory
 */
ENTRY(suword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	2f				/* we only have to set the right segment selector */
#endif /* I486_CPU || I586_CPU || I686_CPU */

	/* XXX - page boundary crossing is still not handled */
	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl
	movb	_PTmap(%edx),%dl
	andb	$0x7,%dl			/* must be VALID + USERACC + WRITE */
	cmpb	$0x7,%dl
	je	1f

	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	movl	_curpcb,%ecx			/* restore trashed register */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

2:
	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address validity */
	ja	fusufault

	movl	8(%esp),%eax
	movl	%eax,(%edx)
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)
	ret

ENTRY(susword)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	2f
#endif /* I486_CPU || I586_CPU || I686_CPU */

	/* XXX - page boundary crossing is still not handled */
	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl
	movb	_PTmap(%edx),%dl
	andb	$0x7,%dl			/* must be VALID + USERACC + WRITE */
	cmpb	$0x7,%dl
	je	1f

	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	movl	_curpcb,%ecx			/* restore trashed register */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

2:
	cmpl	$VM_MAXUSER_ADDRESS-2,%edx	/* verify address validity */
	ja	fusufault

	movw	8(%esp),%ax
	movw	%ax,(%edx)
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)
	ret

ALTENTRY(suibyte)
ENTRY(subyte)
	movl	_curpcb,%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#if defined(I386_CPU)

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
	cmpl	$CPUCLASS_386,_cpu_class
	jne	2f
#endif /* I486_CPU || I586_CPU || I686_CPU */

	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl
	movb	_PTmap(%edx),%dl
	andb	$0x7,%dl			/* must be VALID + USERACC + WRITE */
	cmpb	$0x7,%dl
	je	1f

	/* simulate a trap */
	pushl	%eax
	call	_trapwrite
	popl	%edx				/* remove junk parameter from stack */
	movl	_curpcb,%ecx			/* restore trashed register */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

2:
	cmpl	$VM_MAXUSER_ADDRESS-1,%edx	/* verify address validity */
	ja	fusufault

	movb	8(%esp),%al
	movb	%al,(%edx)
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)
	ret

/*
 * copyinstr(from, to, maxlen, int *lencopied)
 *	copy a string from from to to, stop when a 0 character is reached.
 *	return ENAMETOOLONG if string is longer than maxlen, and
 *	EFAULT on protection violations. If lencopied is non-zero,
 *	return the actual length in *lencopied.
 */
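/*
 * In outline: clamp maxlen so the copy cannot run past
 * VM_MAXUSER_ADDRESS, copy bytes until a NUL or the count runs out,
 * then distinguish ENAMETOOLONG (count genuinely exhausted) from
 * EFAULT (the clamp is what stopped us at the end of user space).
 */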
ENTRY(copyinstr)
	pushl	%esi
	pushl	%edi
	movl	_curpcb,%ecx
	movl	$cpystrflt,PCB_ONFAULT(%ecx)

	movl	12(%esp),%esi			/* %esi = from */
	movl	16(%esp),%edi			/* %edi = to */
	movl	20(%esp),%edx			/* %edx = maxlen */

	movl	$VM_MAXUSER_ADDRESS,%eax

	/* make sure 'from' is within bounds */
	subl	%esi,%eax
	jbe	cpystrflt

	/* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */
	cmpl	%edx,%eax
	jae	1f
	movl	%eax,%edx
	movl	%eax,20(%esp)
1:
	incl	%edx
	cld

2:
	decl	%edx
	jz	3f

	lodsb
	stosb
	orb	%al,%al
	jnz	2b

	/* Success -- 0 byte reached */
	decl	%edx
	xorl	%eax,%eax
	jmp	cpystrflt_x
3:
	/* edx is zero - return ENAMETOOLONG or EFAULT */
	cmpl	$VM_MAXUSER_ADDRESS,%esi
	jae	cpystrflt
4:
	movl	$ENAMETOOLONG,%eax
	jmp	cpystrflt_x

cpystrflt:
	movl	$EFAULT,%eax

cpystrflt_x:
	/* set *lencopied and return %eax */
	movl	_curpcb,%ecx
	movl	$0,PCB_ONFAULT(%ecx)
	movl	20(%esp),%ecx
	subl	%edx,%ecx
	movl	24(%esp),%edx
	testl	%edx,%edx
	jz	1f
	movl	%ecx,(%edx)
1:
	popl	%edi
	popl	%esi
	ret


/*
 * copystr(from, to, maxlen, int *lencopied)
 */
ENTRY(copystr)
	pushl	%esi
	pushl	%edi

	movl	12(%esp),%esi			/* %esi = from */
	movl	16(%esp),%edi			/* %edi = to */
	movl	20(%esp),%edx			/* %edx = maxlen */
	incl	%edx
	cld
1:
	decl	%edx
	jz	4f
	lodsb
	stosb
	orb	%al,%al
	jnz	1b

	/* Success -- 0 byte reached */
	decl	%edx
	xorl	%eax,%eax
	jmp	6f
4:
	/* edx is zero -- return ENAMETOOLONG */
	movl	$ENAMETOOLONG,%eax

6:
	/* set *lencopied and return %eax */
	movl	20(%esp),%ecx
	subl	%edx,%ecx
	movl	24(%esp),%edx
	testl	%edx,%edx
	jz	7f
	movl	%ecx,(%edx)
7:
	popl	%edi
	popl	%esi
	ret

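/* bcmp(b1, b2, len): returns zero iff the two buffers are identical */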
ENTRY(bcmp)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi
	movl	16(%esp),%esi
	movl	20(%esp),%edx
	xorl	%eax,%eax

	movl	%edx,%ecx
	shrl	$2,%ecx
	cld					/* compare forwards */
	repe
	cmpsl
	jne	1f

	movl	%edx,%ecx
	andl	$3,%ecx
	repe
	cmpsb
	je	2f
1:
	incl	%eax
2:
	popl	%esi
	popl	%edi
	ret


/*
 * Handling of special 386 registers and descriptor tables etc
 */
/* void lgdt(struct region_descriptor *rdp); */
ENTRY(lgdt)
	/* reload the descriptor table */
	movl	4(%esp),%eax
	lgdt	(%eax)

	/* flush the prefetch q */
	jmp	1f
	nop
1:
	/* reload "stale" selectors */
	movl	$KDSEL,%eax
	movl	%ax,%ds
	movl	%ax,%es
	movl	%ax,%ss

	/* reload code selector by turning return into intersegmental return */
	movl	(%esp),%eax
	pushl	%eax
#	movl	$KCSEL,4(%esp)
	movl	$8,4(%esp)
	lret

/*
 * void lidt(struct region_descriptor *rdp);
 */
ENTRY(lidt)
	movl	4(%esp),%eax
	lidt	(%eax)
	ret

/*
 * void lldt(u_short sel)
 */
ENTRY(lldt)
	lldt	4(%esp)
	ret

/*
 * void ltr(u_short sel)
 */
ENTRY(ltr)
	ltr	4(%esp)
	ret

/* ssdtosd(*ssdp,*sdp) */
ENTRY(ssdtosd)
	pushl	%ebx
	movl	8(%esp),%ecx
	movl	8(%ecx),%ebx
	shll	$16,%ebx
	movl	(%ecx),%edx
	roll	$16,%edx
	movb	%dh,%bl
	movb	%dl,%bh
	rorl	$8,%ebx
	movl	4(%ecx),%eax
	movw	%ax,%dx
	andl	$0xf0000,%eax
	orl	%eax,%ebx
	movl	12(%esp),%ecx
	movl	%edx,(%ecx)
	movl	%ebx,4(%ecx)
	popl	%ebx
	ret

/* load_cr0(cr0) */
ENTRY(load_cr0)
	movl	4(%esp),%eax
	movl	%eax,%cr0
	ret

/* rcr0() */
ENTRY(rcr0)
	movl	%cr0,%eax
	ret

/* rcr3() */
ENTRY(rcr3)
	movl	%cr3,%eax
	ret

/* void load_cr3(caddr_t cr3) */
ENTRY(load_cr3)
	movl	4(%esp),%eax
	movl	%eax,%cr3
	ret


/*****************************************************************************/
/* setjump, longjump                                                         */
/*****************************************************************************/

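/*
 * setjmp saves the callee-saved registers, the stack pointer and the
 * caller's return address in the jmp_buf; longjmp restores them and
 * resumes at the saved address with a return value of 1.
 */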
ENTRY(setjmp)
	movl	4(%esp),%eax
	movl	%ebx,(%eax)			/* save ebx */
	movl	%esp,4(%eax)			/* save esp */
	movl	%ebp,8(%eax)			/* save ebp */
	movl	%esi,12(%eax)			/* save esi */
	movl	%edi,16(%eax)			/* save edi */
	movl	(%esp),%edx			/* get rta */
	movl	%edx,20(%eax)			/* save eip */
	xorl	%eax,%eax			/* return(0); */
	ret

ENTRY(longjmp)
	movl	4(%esp),%eax
	movl	(%eax),%ebx			/* restore ebx */
	movl	4(%eax),%esp			/* restore esp */
	movl	8(%eax),%ebp			/* restore ebp */
	movl	12(%eax),%esi			/* restore esi */
	movl	16(%eax),%edi			/* restore edi */
	movl	20(%eax),%edx			/* get rta */
	movl	%edx,(%esp)			/* put in return frame */
	xorl	%eax,%eax			/* return(1); */
	incl	%eax
	ret

/*
 * Here for doing BB-profiling (gcc -a).
 * We rely on the "bbset" instead, but need a dummy function.
 */
NON_GPROF_ENTRY(__bb_init_func)
	movl	4(%esp),%eax
	movl	$1,(%eax)
	.byte	0xc3				/* avoid macro for `ret' */