/*-
 * Copyright (c) 1993 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/i386/i386/support.s 112898 2003-04-01 00:18:55Z jeff $
 */

#include "opt_npx.h"

#include <machine/asmacros.h>
#include <machine/cputypes.h>
#include <machine/pmap.h>
#include <machine/specialreg.h>

#include "assym.s"

#define IDXSHIFT	10

	.data
	.globl	bcopy_vector
bcopy_vector:
	.long	generic_bcopy
	.globl	bzero
bzero:
	.long	generic_bzero
	.globl	copyin_vector
copyin_vector:
	.long	generic_copyin
	.globl	copyout_vector
copyout_vector:
	.long	generic_copyout
	.globl	ovbcopy_vector
ovbcopy_vector:
	.long	generic_bcopy
#if defined(I586_CPU) && defined(DEV_NPX)
kernel_fpu_lock:
	.byte	0xfe
	.space	3
#endif

	.text

/*
 * bcopy family
 * void bzero(void *buf, u_int len)
 */

ENTRY(generic_bzero)
	pushl	%edi
	movl	8(%esp),%edi
	movl	12(%esp),%ecx
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx
	rep
	stosb
	popl	%edi
	ret

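/*
 * Rough C equivalent of the strategy above, for reference only (the
 * assembly is authoritative; the function name is illustrative): zero
 * whole 32-bit words with `rep stosl', then mop up the 0-3 trailing
 * bytes with `rep stosb'.
 *
 *	void
 *	generic_bzero_c(void *buf, u_int len)
 *	{
 *		u_int *wp = buf;
 *		u_char *cp;
 *		u_int n;
 *
 *		for (n = len >> 2; n != 0; n--)		// rep stosl
 *			*wp++ = 0;
 *		cp = (u_char *)wp;
 *		for (n = len & 3; n != 0; n--)		// rep stosb
 *			*cp++ = 0;
 *	}
 */
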
#ifdef I486_CPU
ENTRY(i486_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx
	xorl	%eax,%eax
/*
 * do 64 byte chunks first
 *
 * XXX this is probably over-unrolled at least for DX2's
 */
2:
	cmpl	$64,%ecx
	jb	3f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	movl	%eax,16(%edx)
	movl	%eax,20(%edx)
	movl	%eax,24(%edx)
	movl	%eax,28(%edx)
	movl	%eax,32(%edx)
	movl	%eax,36(%edx)
	movl	%eax,40(%edx)
	movl	%eax,44(%edx)
	movl	%eax,48(%edx)
	movl	%eax,52(%edx)
	movl	%eax,56(%edx)
	movl	%eax,60(%edx)
	addl	$64,%edx
	subl	$64,%ecx
	jnz	2b
	ret

/*
 * do 16 byte chunks
 */
	SUPERALIGN_TEXT
3:
	cmpl	$16,%ecx
	jb	4f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	addl	$16,%edx
	subl	$16,%ecx
	jnz	3b
	ret

/*
 * do 4 byte chunks
 */
	SUPERALIGN_TEXT
4:
	cmpl	$4,%ecx
	jb	5f
	movl	%eax,(%edx)
	addl	$4,%edx
	subl	$4,%ecx
	jnz	4b
	ret

/*
 * do 1 byte chunks
 * a jump table seems to be faster than a loop or more range reductions
 *
 * XXX need a const section for non-text
 */
	.data
jtab:
	.long	do0
	.long	do1
	.long	do2
	.long	do3

	.text
	SUPERALIGN_TEXT
5:
	jmp	*jtab(,%ecx,4)

	SUPERALIGN_TEXT
do3:
	movw	%ax,(%edx)
	movb	%al,2(%edx)
	ret

	SUPERALIGN_TEXT
do2:
	movw	%ax,(%edx)
	ret

	SUPERALIGN_TEXT
do1:
	movb	%al,(%edx)
	ret

	SUPERALIGN_TEXT
do0:
	ret
#endif

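/*
 * The 1-3 byte tail dispatch above is a computed jump through `jtab',
 * indexed by the residual count.  In C terms (illustrative sketch only,
 * mirroring what do0-do3 store):
 *
 *	switch (len & 3) {			// jmp *jtab(,%ecx,4)
 *	case 3: *(u_short *)p = 0; ((u_char *)p)[2] = 0; break;
 *	case 2: *(u_short *)p = 0; break;
 *	case 1: *(u_char *)p = 0; break;
 *	case 0: break;
 *	}
 */
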
#if defined(I586_CPU) && defined(DEV_NPX)
ENTRY(i586_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx

	/*
	 * The FPU register method is twice as fast as the integer register
	 * method unless the target is in the L1 cache and we pre-allocate a
	 * cache line for it (then the integer register method is 4-5 times
	 * faster).  However, we never pre-allocate cache lines, since that
	 * would make the integer method 25% or more slower for the common
	 * case when the target isn't in either the L1 cache or the L2 cache.
	 * Thus we normally use the FPU register method unless the overhead
	 * would be too large.
	 */
	cmpl	$256,%ecx	/* empirical; clts, fninit, smsw cost a lot */
	jb	intreg_i586_bzero

	/*
	 * The FPU registers may belong to an application or to fastmove()
	 * or to another invocation of bcopy() or ourself in a higher level
	 * interrupt or trap handler.  Preserving the registers is
	 * complicated since we avoid it if possible at all levels.  We
	 * want to localize the complications even when that increases them.
	 * Here the extra work involves preserving CR0_TS in TS.
	 * `fpcurthread != NULL' is supposed to be the condition that all the
	 * FPU resources belong to an application, but fpcurthread and CR0_TS
	 * aren't set atomically enough for this condition to work in
	 * interrupt handlers.
	 *
	 * Case 1: FPU registers belong to the application: we must preserve
	 * the registers if we use them, so we only use the FPU register
	 * method if the target size is large enough to amortize the extra
	 * overhead for preserving them.  CR0_TS must be preserved although
	 * it is very likely to end up as set.
	 *
	 * Case 2: FPU registers belong to fastmove(): fastmove() currently
	 * makes the registers look like they belong to an application so
	 * that cpu_switch() and savectx() don't have to know about it, so
	 * this case reduces to case 1.
	 *
	 * Case 3: FPU registers belong to the kernel: don't use the FPU
	 * register method.  This case is unlikely, and supporting it would
	 * be more complicated and might take too much stack.
	 *
	 * Case 4: FPU registers don't belong to anyone: the FPU registers
	 * don't need to be preserved, so we always use the FPU register
	 * method.  CR0_TS must be preserved although it is very likely to
	 * always end up as clear.
	 */
	cmpl	$0,PCPU(FPCURTHREAD)
	je	i586_bz1

	/*
	 * XXX don't use the FPU for cases 1 and 2, since preemptive
	 * scheduling of ithreads broke these cases.  Note that we can
	 * no longer get here from an interrupt handler, since the
	 * context switch to the interrupt handler will have saved the
	 * FPU state.
	 */
	jmp	intreg_i586_bzero

	cmpl	$256+184,%ecx		/* empirical; not quite 2*108 more */
	jb	intreg_i586_bzero
	sarb	$1,kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	subl	$108,%esp
	fnsave	0(%esp)
	jmp	i586_bz2

i586_bz1:
	sarb	$1,kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	fninit				/* XXX should avoid needing this */
i586_bz2:
	fldz

	/*
	 * Align to an 8 byte boundary (misalignment in the main loop would
	 * cost a factor of >= 2).  Avoid jumps (at little cost if it is
	 * already aligned) by always zeroing 8 bytes and using the part up
	 * to the _next_ alignment position.
	 */
	fstl	0(%edx)
	addl	%edx,%ecx		/* part of %ecx -= new_%edx - %edx */
	addl	$8,%edx
	andl	$~7,%edx
	subl	%edx,%ecx

	/*
	 * Similarly align `len' to a multiple of 8.
	 */
	fstl	-8(%edx,%ecx)
	decl	%ecx
	andl	$~7,%ecx

	/*
	 * This wouldn't be any faster if it were unrolled, since the loop
	 * control instructions are much faster than the fstl and/or done
	 * in parallel with it so their overhead is insignificant.
	 */
fpureg_i586_bzero_loop:
	fstl	0(%edx)
	addl	$8,%edx
	subl	$8,%ecx
	cmpl	$8,%ecx
	jae	fpureg_i586_bzero_loop

	cmpl	$0,PCPU(FPCURTHREAD)
	je	i586_bz3

	/* XXX check that the condition for cases 1-2 stayed false. */
i586_bzero_oops:
	int	$3
	jmp	i586_bzero_oops

	frstor	0(%esp)
	addl	$108,%esp
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock
	ret

i586_bz3:
	fstp	%st(0)
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock
	ret

intreg_i586_bzero:
	/*
	 * `rep stos' seems to be the best method in practice for small
	 * counts.  Fancy methods usually take too long to start up due
	 * to cache and BTB misses.
	 */
	pushl	%edi
	movl	%edx,%edi
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx
	jne	1f
	popl	%edi
	ret

1:
	rep
	stosb
	popl	%edi
	ret
#endif /* I586_CPU && defined(DEV_NPX) */

ENTRY(i686_pagezero)
	pushl	%edi
	pushl	%ebx

	movl	12(%esp), %edi
	movl	$1024, %ecx
	cld

	ALIGN_TEXT
1:
	xorl	%eax, %eax
	repe
	scasl
	jnz	2f

	popl	%ebx
	popl	%edi
	ret

	ALIGN_TEXT

2:
	incl	%ecx
	subl	$4, %edi

	movl	%ecx, %edx
	cmpl	$16, %ecx

	jge	3f

	movl	%edi, %ebx
	andl	$0x3f, %ebx
	shrl	%ebx
	shrl	%ebx
	movl	$16, %ecx
	subl	%ebx, %ecx

3:
	subl	%ecx, %edx
	rep
	stosl

	movl	%edx, %ecx
	testl	%edx, %edx
	jnz	1b

	popl	%ebx
	popl	%edi
	ret

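/*
 * In outline (illustrative C only; the cache-line chunking above
 * differs in detail): scan the 1024 dwords of the page with `repe
 * scasl' and skip the ones that are already zero, so clean cache
 * lines of a mostly-zero page are not dirtied by redundant stores.
 *
 *	void
 *	i686_pagezero_c(int *p)
 *	{
 *		int *end = p + 1024;
 *
 *		while (p < end && *p == 0)	// repe scasl
 *			p++;
 *		while (p < end)			// rep stosl
 *			*p++ = 0;
 *	}
 */
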
/* fillw(pat, base, cnt) */
ENTRY(fillw)
	pushl	%edi
	movl	8(%esp),%eax
	movl	12(%esp),%edi
	movl	16(%esp),%ecx
	cld
	rep
	stosw
	popl	%edi
	ret

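/*
 * fillw() in C, for reference (illustrative only; the `rep stosw'
 * above is the real implementation): store `cnt' copies of the
 * low 16 bits of the pattern.
 *
 *	void
 *	fillw_c(int pat, void *base, size_t cnt)
 *	{
 *		u_short *p = base;
 *
 *		while (cnt-- != 0)		// rep stosw
 *			*p++ = (u_short)pat;
 *	}
 */
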
ENTRY(bcopyb)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx
	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f
	cld					/* nope, copy forwards */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards. */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	std
	rep
	movsb
	popl	%edi
	popl	%esi
	cld
	ret

ENTRY(bcopy)
	MEXITCOUNT
	jmp	*bcopy_vector

ENTRY(ovbcopy)
	MEXITCOUNT
	jmp	*ovbcopy_vector

/*
 * generic_bcopy(src, dst, cnt)
 *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
 */
ENTRY(generic_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* nope, copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx				/* any fractional bytes? */
	std
	rep
	movsb
	movl	20(%esp),%ecx			/* copy remainder by 32-bit words */
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret

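/*
 * The single unsigned comparison above is the usual overlap test:
 * copying forward is unsafe only when the destination starts inside
 * the source, i.e. src < dst < src + cnt, which collapses to
 * (unsigned)(dst - src) < cnt.  A byte-wise C rendering (illustrative
 * only; the assembly copies words, then the byte tail):
 *
 *	void
 *	generic_bcopy_c(const char *src, char *dst, size_t cnt)
 *	{
 *		size_t i;
 *
 *		if ((size_t)(dst - src) >= cnt) {
 *			for (i = 0; i < cnt; i++)	// forwards
 *				dst[i] = src[i];
 *		} else {
 *			for (i = cnt; i != 0; i--)	// backwards
 *				dst[i - 1] = src[i - 1];
 *		}
 *	}
 */
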
#if defined(I586_CPU) && defined(DEV_NPX)
ENTRY(i586_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	cmpl	$1024,%ecx
	jb	small_i586_bcopy

	sarb	$1,kernel_fpu_lock
	jc	small_i586_bcopy
	cmpl	$0,PCPU(FPCURTHREAD)
	je	i586_bc1

	/* XXX turn off handling of cases 1-2, as above. */
	movb	$0xfe,kernel_fpu_lock
	jmp	small_i586_bcopy

	smsw	%dx
	clts
	subl	$108,%esp
	fnsave	0(%esp)
	jmp	4f

i586_bc1:
	smsw	%dx
	clts
	fninit				/* XXX should avoid needing this */

	ALIGN_TEXT
4:
	pushl	%ecx
#define	DCACHE_SIZE	8192
	cmpl	$(DCACHE_SIZE-512)/2,%ecx
	jbe	2f
	movl	$(DCACHE_SIZE-512)/2,%ecx
2:
	subl	%ecx,0(%esp)
	cmpl	$256,%ecx
	jb	5f			/* XXX should prefetch if %ecx >= 32 */
	pushl	%esi
	pushl	%ecx
	ALIGN_TEXT
3:
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	popl	%ecx
	popl	%esi
5:
	ALIGN_TEXT
large_i586_bcopy_loop:
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$64,%esi
	addl	$64,%edi
	subl	$64,%ecx
	cmpl	$64,%ecx
	jae	large_i586_bcopy_loop
	popl	%eax
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b

	cmpl	$0,PCPU(FPCURTHREAD)
	je	i586_bc2

	/* XXX check that the condition for cases 1-2 stayed false. */
i586_bcopy_oops:
	int	$3
	jmp	i586_bcopy_oops

	frstor	0(%esp)
	addl	$108,%esp
i586_bc2:
	lmsw	%dx
	movb	$0xfe,kernel_fpu_lock

/*
 * This is a duplicate of the main part of generic_bcopy.  See the comments
 * there.  Jumping into generic_bcopy would cost a whole 0-1 cycles and
 * would mess up high resolution profiling.
 */
	ALIGN_TEXT
small_i586_bcopy:
	shrl	$2,%ecx
	cld
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx
	std
	rep
	movsb
	movl	20(%esp),%ecx
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret
#endif /* I586_CPU && defined(DEV_NPX) */

/*
 * Note: memcpy does not support overlapping copies
 */
ENTRY(memcpy)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi
	movl	16(%esp),%esi
	movl	20(%esp),%ecx
	movl	%edi,%eax
	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%esi
	popl	%edi
	ret


/*****************************************************************************/
/* copyout and fubyte family                                                 */
/*****************************************************************************/
/*
 * Access user memory from inside the kernel. These routines and possibly
 * the math- and DOS emulators should be the only places that do this.
 *
 * We have to access the memory with user's permissions, so use a segment
 * selector with RPL 3. For writes to user space we have to additionally
 * check the PTE for write permission, because the 386 does not check
 * write permissions when we are executing with EPL 0. The 486 does check
 * this if the WP bit is set in CR0, so we can use a simpler version here.
 *
 * These routines set curpcb->onfault for the time they execute. When a
 * protection violation occurs inside the functions, the trap handler
 * returns to *curpcb->onfault instead of the function.
 */

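/*
 * The onfault protocol, sketched in C (illustrative only; the real
 * handoff happens in trap(), and `&&fault' is the GCC computed-label
 * extension standing in for the assembly fault labels): each routine
 * publishes a recovery address before touching user memory and clears
 * it afterwards, so a fault in the window returns EFAULT instead of
 * panicking.
 *
 *	int
 *	copyout_like(const void *kaddr, void *uaddr, size_t len)
 *	{
 *		curpcb->pcb_onfault = &&fault;	// movl $..._fault,PCB_ONFAULT
 *		// ... bounds check, then copy ...
 *		curpcb->pcb_onfault = NULL;
 *		return (0);
 *	fault:					// reached via trap()
 *		curpcb->pcb_onfault = NULL;
 *		return (EFAULT);
 *	}
 */
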
/*
 * copyout(from_kernel, to_user, len)  - MP SAFE (if not I386_CPU)
 */
ENTRY(copyout)
	MEXITCOUNT
	jmp	*copyout_vector

ENTRY(generic_copyout)
	movl	PCPU(CURPCB),%eax
	movl	$copyout_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ebx
	testl	%ebx,%ebx			/* anything to do? */
	jz	done_copyout

	/*
	 * Check explicitly for non-user addresses.  If 486 write protection
	 * is being used, this check is essential because we are in kernel
	 * mode so the h/w does not provide any protection against writing
	 * kernel addresses.
	 */

	/*
	 * First, prevent address wrapping.
	 */
	movl	%edi,%eax
	addl	%ebx,%eax
	jc	copyout_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
	cmpl	$VM_MAXUSER_ADDRESS,%eax
	ja	copyout_fault

#ifdef I386_CPU

/*
 * We have to check each PTE for user write permission.
 * The checking may cause a page fault, so it is important to set
 * up everything for return via copyout_fault before here.
 */
	/* compute number of pages */
	movl	%edi,%ecx
	andl	$PAGE_MASK,%ecx
	addl	%ebx,%ecx
	decl	%ecx
	shrl	$IDXSHIFT+2,%ecx
	incl	%ecx

	/* compute PTE offset for start address */
	movl	%edi,%edx
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl

1:
	/* check PTE for each page */
	leal	PTmap(%edx),%eax
	shrl	$IDXSHIFT,%eax
	andb	$0xfc,%al
	testb	$PG_V,PTmap(%eax)		/* PTE page must be valid */
	je	4f
	movb	PTmap(%edx),%al
	andb	$PG_V|PG_RW|PG_U,%al		/* page must be valid and user writable */
	cmpb	$PG_V|PG_RW|PG_U,%al
	je	2f

4:
	/* simulate a trap */
	pushl	%edx
	pushl	%ecx
	shll	$IDXSHIFT,%edx
	pushl	%edx
	call	trapwrite			/* trapwrite(addr) */
	popl	%edx
	popl	%ecx
	popl	%edx

	testl	%eax,%eax			/* if not ok, return EFAULT */
	jnz	copyout_fault

2:
	addl	$4,%edx
	decl	%ecx
	jnz	1b				/* check next page */
#endif /* I386_CPU */

	/* bcopy(%esi, %edi, %ebx) */
	movl	%ebx,%ecx

#if defined(I586_CPU) && defined(DEV_NPX)
	ALIGN_TEXT
slow_copyout:
#endif
	shrl	$2,%ecx
	cld
	rep
	movsl
	movb	%bl,%cl
	andb	$3,%cl
	rep
	movsb

done_copyout:
	popl	%ebx
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%edx
	movl	%eax,PCB_ONFAULT(%edx)
	ret

	ALIGN_TEXT
copyout_fault:
	popl	%ebx
	popl	%edi
	popl	%esi
	movl	PCPU(CURPCB),%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret

#if defined(I586_CPU) && defined(DEV_NPX)
ENTRY(i586_copyout)
	/*
	 * Duplicated from generic_copyout.  Could be done a bit better.
	 */
	movl	PCPU(CURPCB),%eax
	movl	$copyout_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ebx
	testl	%ebx,%ebx			/* anything to do? */
	jz	done_copyout

	/*
	 * Check explicitly for non-user addresses.  If 486 write protection
	 * is being used, this check is essential because we are in kernel
	 * mode so the h/w does not provide any protection against writing
	 * kernel addresses.
	 */

	/*
	 * First, prevent address wrapping.
	 */
	movl	%edi,%eax
	addl	%ebx,%eax
	jc	copyout_fault
/*
 * XXX STOP USING VM_MAXUSER_ADDRESS.
 * It is an end address, not a max, so every time it is used correctly it
 * looks like there is an off by one error, and of course it caused an off
 * by one error in several places.
 */
	cmpl	$VM_MAXUSER_ADDRESS,%eax
	ja	copyout_fault

	/* bcopy(%esi, %edi, %ebx) */
3:
	movl	%ebx,%ecx
	/*
	 * End of duplicated code.
	 */

	cmpl	$1024,%ecx
	jb	slow_copyout

	pushl	%ecx
	call	fastmove
	addl	$4,%esp
	jmp	done_copyout
#endif /* I586_CPU && defined(DEV_NPX) */

/*
 * copyin(from_user, to_kernel, len) - MP SAFE
 */
ENTRY(copyin)
	MEXITCOUNT
	jmp	*copyin_vector

ENTRY(generic_copyin)
	movl	PCPU(CURPCB),%eax
	movl	$copyin_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi			/* caddr_t from */
	movl	16(%esp),%edi			/* caddr_t to */
	movl	20(%esp),%ecx			/* size_t  len */

	/*
	 * make sure address is valid
	 */
	movl	%esi,%edx
	addl	%ecx,%edx
	jc	copyin_fault
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	copyin_fault

#if defined(I586_CPU) && defined(DEV_NPX)
	ALIGN_TEXT
slow_copyin:
#endif
	movb	%cl,%al
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

#if defined(I586_CPU) && defined(DEV_NPX)
	ALIGN_TEXT
done_copyin:
#endif
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%edx
	movl	%eax,PCB_ONFAULT(%edx)
	ret

	ALIGN_TEXT
copyin_fault:
	popl	%edi
	popl	%esi
	movl	PCPU(CURPCB),%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret

#if defined(I586_CPU) && defined(DEV_NPX)
ENTRY(i586_copyin)
	/*
	 * Duplicated from generic_copyin.  Could be done a bit better.
	 */
	movl	PCPU(CURPCB),%eax
	movl	$copyin_fault,PCB_ONFAULT(%eax)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi			/* caddr_t from */
	movl	16(%esp),%edi			/* caddr_t to */
	movl	20(%esp),%ecx			/* size_t  len */

	/*
	 * make sure address is valid
	 */
	movl	%esi,%edx
	addl	%ecx,%edx
	jc	copyin_fault
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	copyin_fault
	/*
	 * End of duplicated code.
	 */

	cmpl	$1024,%ecx
	jb	slow_copyin

	pushl	%ebx			/* XXX prepare for fastmove_fault */
	pushl	%ecx
	call	fastmove
	addl	$8,%esp
	jmp	done_copyin
#endif /* I586_CPU && defined(DEV_NPX) */

#if defined(I586_CPU) && defined(DEV_NPX)
/* fastmove(src, dst, len)
	src in %esi
	dst in %edi
	len in %ecx		XXX changed to on stack for profiling
	uses %eax and %edx for tmp. storage
 */
/* XXX use ENTRY() to get profiling.  fastmove() is actually a non-entry. */
ENTRY(fastmove)
	pushl	%ebp
	movl	%esp,%ebp
	subl	$PCB_SAVEFPU_SIZE+3*4,%esp

	movl	8(%ebp),%ecx
	cmpl	$63,%ecx
	jbe	fastmove_tail

	testl	$7,%esi	/* check if src addr is multiple of 8 */
	jnz	fastmove_tail

	testl	$7,%edi	/* check if dst addr is multiple of 8 */
	jnz	fastmove_tail

	/* XXX grab FPU context atomically. */
	cli

/* if (fpcurthread != NULL) { */
	cmpl	$0,PCPU(FPCURTHREAD)
	je	6f
/*    fnsave(&curpcb->pcb_savefpu); */
	movl	PCPU(CURPCB),%eax
	fnsave	PCB_SAVEFPU(%eax)
/*   FPCURTHREAD = NULL; */
	movl	$0,PCPU(FPCURTHREAD)
/* } */
6:
/* now we own the FPU. */

/*
 * The process' FP state is saved in the pcb, but if we get
 * switched, the cpu_switch() will store our FP state in the
 * pcb.  It should be possible to avoid all the copying for
 * this, e.g., by setting a flag to tell cpu_switch() to
 * save the state somewhere else.
 */
/* tmp = curpcb->pcb_savefpu; */
	movl	%ecx,-12(%ebp)
	movl	%esi,-8(%ebp)
	movl	%edi,-4(%ebp)
	movl	%esp,%edi
	movl	PCPU(CURPCB),%esi
	addl	$PCB_SAVEFPU,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	movl	-12(%ebp),%ecx
	movl	-8(%ebp),%esi
	movl	-4(%ebp),%edi
/* stop_emulating(); */
	clts
/* fpcurthread = curthread; */
	movl	PCPU(CURTHREAD),%eax
	movl	%eax,PCPU(FPCURTHREAD)
	movl	PCPU(CURPCB),%eax

	/* XXX end of atomic FPU context grab. */
	sti

	movl	$fastmove_fault,PCB_ONFAULT(%eax)
4:
	movl	%ecx,-12(%ebp)
	cmpl	$1792,%ecx
	jbe	2f
	movl	$1792,%ecx
2:
	subl	%ecx,-12(%ebp)
	cmpl	$256,%ecx
	jb	5f
	movl	%ecx,-8(%ebp)
	movl	%esi,-4(%ebp)
	ALIGN_TEXT
3:
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	movl	-8(%ebp),%ecx
	movl	-4(%ebp),%esi
5:
	ALIGN_TEXT
fastmove_loop:
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	fistpq	56(%edi)
	fistpq	48(%edi)
	fistpq	40(%edi)
	fistpq	32(%edi)
	fistpq	24(%edi)
	fistpq	16(%edi)
	fistpq	8(%edi)
	fistpq	0(%edi)
	addl	$-64,%ecx
	addl	$64,%esi
	addl	$64,%edi
	cmpl	$63,%ecx
	ja	fastmove_loop
	movl	-12(%ebp),%eax
	addl	%eax,%ecx
	cmpl	$64,%ecx
	jae	4b

	/* XXX ungrab FPU context atomically. */
	cli

/* curpcb->pcb_savefpu = tmp; */
	movl	%ecx,-12(%ebp)
	movl	%esi,-8(%ebp)
	movl	%edi,-4(%ebp)
	movl	PCPU(CURPCB),%edi
	addl	$PCB_SAVEFPU,%edi
	movl	%esp,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl
	movl	-12(%ebp),%ecx
	movl	-8(%ebp),%esi
	movl	-4(%ebp),%edi

/* start_emulating(); */
	smsw	%ax
	orb	$CR0_TS,%al
	lmsw	%ax
/* fpcurthread = NULL; */
	movl	$0,PCPU(FPCURTHREAD)

	/* XXX end of atomic FPU context ungrab. */
	sti

	ALIGN_TEXT
fastmove_tail:
	movl	PCPU(CURPCB),%eax
	movl	$fastmove_tail_fault,PCB_ONFAULT(%eax)

	movb	%cl,%al
	shrl	$2,%ecx				/* copy longword-wise */
	cld
	rep
	movsl
	movb	%al,%cl
	andb	$3,%cl				/* copy remaining bytes */
	rep
	movsb

	movl	%ebp,%esp
	popl	%ebp
	ret

	ALIGN_TEXT
fastmove_fault:
	/* XXX ungrab FPU context atomically. */
	cli

	movl	PCPU(CURPCB),%edi
	addl	$PCB_SAVEFPU,%edi
	movl	%esp,%esi
	cld
	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
	rep
	movsl

	smsw	%ax
	orb	$CR0_TS,%al
	lmsw	%ax
	movl	$0,PCPU(FPCURTHREAD)

	/* XXX end of atomic FPU context ungrab. */
	sti

fastmove_tail_fault:
	movl	%ebp,%esp
	popl	%ebp
	addl	$8,%esp
	popl	%ebx
	popl	%edi
	popl	%esi
	movl	PCPU(CURPCB),%edx
	movl	$0,PCB_ONFAULT(%edx)
	movl	$EFAULT,%eax
	ret
#endif /* I586_CPU && defined(DEV_NPX) */

/*
 * casuptr.  Compare and set user pointer.  Returns -1 or the current value.
 */
ENTRY(casuptr)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx			/* dst */
	movl	8(%esp),%eax			/* old */
	movl	12(%esp),%ecx			/* new */

	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address is valid */
	ja	fusufault

#if defined(SMP)
	lock cmpxchgl %ecx, (%edx)		/* Compare and set. */
#else	/* !SMP */
	cmpxchgl %ecx, (%edx)
#endif	/* !SMP */

	/*
	 * We store the current value regardless of the success of the
	 * cmpxchg.  Calling code checks for new == return to determine
	 * success.
	 */
	movl	(%edx), %eax

	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	$0,PCB_ONFAULT(%ecx)
	ret

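/*
 * casuptr() semantics in C (illustrative sketch only; the real store
 * is the atomic cmpxchg above, and a fault returns -1 via fusufault):
 *
 *	intptr_t
 *	casuptr_c(intptr_t *p, intptr_t old, intptr_t new)
 *	{
 *		if ((uintptr_t)p > VM_MAXUSER_ADDRESS - 4)
 *			return (-1);
 *		atomic_cmpset_int((u_int *)p, old, new);  // lock cmpxchgl
 *		return (*p);	// caller compares against `new'
 *	}
 */
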
/*
 * fu{byte,sword,word} - MP SAFE
 *
 *	Fetch a byte (sword, word) from user memory
 */
ENTRY(fuword)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx			/* from */

	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address is valid */
	ja	fusufault

	movl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

ENTRY(fuword32)
	jmp	fuword

/*
 * These two routines are called from the profiling code, potentially
 * at interrupt time. If they fail, that's okay, good things will
 * happen later. Fail all the time for now - until the trap code is
 * able to deal with this.
 */
ALTENTRY(suswintr)
ENTRY(fuswintr)
	movl	$-1,%eax
	ret

/*
 * fuword16 - MP SAFE
 */
ENTRY(fuword16)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-2,%edx
	ja	fusufault

	movzwl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

/*
 * fubyte - MP SAFE
 */
ENTRY(fubyte)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

	cmpl	$VM_MAXUSER_ADDRESS-1,%edx
	ja	fusufault

	movzbl	(%edx),%eax
	movl	$0,PCB_ONFAULT(%ecx)
	ret

	ALIGN_TEXT
fusufault:
	movl	PCPU(CURPCB),%ecx
	xorl	%eax,%eax
	movl	%eax,PCB_ONFAULT(%ecx)
	decl	%eax
	ret

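/*
 * Illustrative C for the fuword()/fusufault pair (reference only;
 * `&&fault' is the GCC label-address extension standing in for the
 * assembly fault label).  Fetch one word from user space, or return
 * -1 if the address is out of range or the access faults.  Note the
 * ambiguity inherited from the interface: a stored value of -1 is
 * indistinguishable from an error.
 *
 *	long
 *	fuword_c(const void *uaddr)
 *	{
 *		long v;
 *
 *		if ((uintptr_t)uaddr > VM_MAXUSER_ADDRESS - 4)
 *			return (-1);
 *		curpcb->pcb_onfault = &&fault;
 *		v = *(const long *)uaddr;
 *		curpcb->pcb_onfault = NULL;
 *		return (v);
 *	fault:
 *		curpcb->pcb_onfault = NULL;
 *		return (-1);
 *	}
 */
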
/*
 * su{byte,sword,word} - MP SAFE (if not I386_CPU)
 *
 *	Write a byte (word, longword) to user memory
 */
ENTRY(suword)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#ifdef I386_CPU

	/* XXX - page boundary crossing is still not handled */
	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl

	leal	PTmap(%edx),%ecx
	shrl	$IDXSHIFT,%ecx
	andb	$0xfc,%cl
	testb	$PG_V,PTmap(%ecx)		/* PTE page must be valid */
	je	4f
	movb	PTmap(%edx),%dl
	andb	$PG_V|PG_RW|PG_U,%dl		/* page must be valid and user writable */
	cmpb	$PG_V|PG_RW|PG_U,%dl
	je	1f

4:
	/* simulate a trap */
	pushl	%eax
	call	trapwrite
	popl	%edx				/* remove junk parameter from stack */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

	cmpl	$VM_MAXUSER_ADDRESS-4,%edx	/* verify address validity */
	ja	fusufault

	movl	8(%esp),%eax
	movl	%eax,(%edx)
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%ecx
	movl	%eax,PCB_ONFAULT(%ecx)
	ret

ENTRY(suword32)
	jmp	suword

/*
 * suword16 - MP SAFE (if not I386_CPU)
 */
ENTRY(suword16)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#ifdef I386_CPU

	/* XXX - page boundary crossing is still not handled */
	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl

	leal	PTmap(%edx),%ecx
	shrl	$IDXSHIFT,%ecx
	andb	$0xfc,%cl
	testb	$PG_V,PTmap(%ecx)		/* PTE page must be valid */
	je	4f
	movb	PTmap(%edx),%dl
	andb	$PG_V|PG_RW|PG_U,%dl		/* page must be valid and user writable */
	cmpb	$PG_V|PG_RW|PG_U,%dl
	je	1f

4:
	/* simulate a trap */
	pushl	%eax
	call	trapwrite
	popl	%edx				/* remove junk parameter from stack */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

	cmpl	$VM_MAXUSER_ADDRESS-2,%edx	/* verify address validity */
	ja	fusufault

	movw	8(%esp),%ax
	movw	%ax,(%edx)
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%ecx		/* restore trashed register */
	movl	%eax,PCB_ONFAULT(%ecx)
	ret

/*
 * subyte - MP SAFE (if not I386_CPU)
 */
ENTRY(subyte)
	movl	PCPU(CURPCB),%ecx
	movl	$fusufault,PCB_ONFAULT(%ecx)
	movl	4(%esp),%edx

#ifdef I386_CPU

	movl	%edx,%eax
	shrl	$IDXSHIFT,%edx
	andb	$0xfc,%dl

	leal	PTmap(%edx),%ecx
	shrl	$IDXSHIFT,%ecx
	andb	$0xfc,%cl
	testb	$PG_V,PTmap(%ecx)		/* PTE page must be valid */
	je	4f
	movb	PTmap(%edx),%dl
	andb	$PG_V|PG_RW|PG_U,%dl		/* page must be valid and user writable */
	cmpb	$PG_V|PG_RW|PG_U,%dl
	je	1f

4:
	/* simulate a trap */
	pushl	%eax
	call	trapwrite
	popl	%edx				/* remove junk parameter from stack */
	testl	%eax,%eax
	jnz	fusufault
1:
	movl	4(%esp),%edx
#endif

	cmpl	$VM_MAXUSER_ADDRESS-1,%edx	/* verify address validity */
	ja	fusufault

	movb	8(%esp),%al
	movb	%al,(%edx)
	xorl	%eax,%eax
	movl	PCPU(CURPCB),%ecx		/* restore trashed register */
	movl	%eax,PCB_ONFAULT(%ecx)
	ret

/*
 * copyinstr(from, to, maxlen, int *lencopied) - MP SAFE
 *
 *	Copy a string from `from' to `to', stopping when a NUL character
 *	is reached.  Return ENAMETOOLONG if the string is longer than
 *	maxlen, and EFAULT on protection violations.  If lencopied is
 *	non-zero, return the actual length in *lencopied.
 */
ENTRY(copyinstr)
	pushl	%esi
	pushl	%edi
	movl	PCPU(CURPCB),%ecx
	movl	$cpystrflt,PCB_ONFAULT(%ecx)

	movl	12(%esp),%esi			/* %esi = from */
	movl	16(%esp),%edi			/* %edi = to */
	movl	20(%esp),%edx			/* %edx = maxlen */

	movl	$VM_MAXUSER_ADDRESS,%eax

	/* make sure 'from' is within bounds */
	subl	%esi,%eax
	jbe	cpystrflt

	/* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */
	cmpl	%edx,%eax
	jae	1f
	movl	%eax,%edx
	movl	%eax,20(%esp)
1:
	incl	%edx
	cld

2:
	decl	%edx
	jz	3f

	lodsb
	stosb
	orb	%al,%al
	jnz	2b

	/* Success -- 0 byte reached */
	decl	%edx
	xorl	%eax,%eax
	jmp	cpystrflt_x
3:
	/* edx is zero - return ENAMETOOLONG or EFAULT */
	cmpl	$VM_MAXUSER_ADDRESS,%esi
	jae	cpystrflt
4:
	movl	$ENAMETOOLONG,%eax
	jmp	cpystrflt_x

cpystrflt:
	movl	$EFAULT,%eax

cpystrflt_x:
	/* set *lencopied and return %eax */
	movl	PCPU(CURPCB),%ecx
	movl	$0,PCB_ONFAULT(%ecx)
	movl	20(%esp),%ecx
	subl	%edx,%ecx
	movl	24(%esp),%edx
	testl	%edx,%edx
	jz	1f
	movl	%ecx,(%edx)
1:
	popl	%edi
	popl	%esi
	ret


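/*
 * copyinstr() control flow in C (illustrative sketch only; the fault
 * handling via cpystrflt is elided to a comment):
 *
 *	int
 *	copyinstr_c(const char *from, char *to, size_t maxlen, size_t *done)
 *	{
 *		size_t i;
 *		int error = ENAMETOOLONG;
 *
 *		// maxlen is first clipped to VM_MAXUSER_ADDRESS - from;
 *		// a fault during the loop returns EFAULT instead.
 *		for (i = 0; i < maxlen; i++) {
 *			if ((to[i] = from[i]) == '\0') {
 *				i++;
 *				error = 0;
 *				break;
 *			}
 *		}
 *		if (done != NULL)
 *			*done = i;	// count includes the NUL
 *		return (error);
 *	}
 */
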
/*
 * copystr(from, to, maxlen, int *lencopied) - MP SAFE
 */
ENTRY(copystr)
	pushl	%esi
	pushl	%edi

	movl	12(%esp),%esi			/* %esi = from */
	movl	16(%esp),%edi			/* %edi = to */
	movl	20(%esp),%edx			/* %edx = maxlen */
	incl	%edx
	cld
1:
	decl	%edx
	jz	4f
	lodsb
	stosb
	orb	%al,%al
	jnz	1b

	/* Success -- 0 byte reached */
	decl	%edx
	xorl	%eax,%eax
	jmp	6f
4:
	/* edx is zero -- return ENAMETOOLONG */
	movl	$ENAMETOOLONG,%eax

6:
	/* set *lencopied and return %eax */
	movl	20(%esp),%ecx
	subl	%edx,%ecx
	movl	24(%esp),%edx
	testl	%edx,%edx
	jz	7f
	movl	%ecx,(%edx)
7:
	popl	%edi
	popl	%esi
	ret

ENTRY(bcmp)
	pushl	%edi
	pushl	%esi
	movl	12(%esp),%edi
	movl	16(%esp),%esi
	movl	20(%esp),%edx
	xorl	%eax,%eax

	movl	%edx,%ecx
	shrl	$2,%ecx
	cld					/* compare forwards */
	repe
	cmpsl
	jne	1f

	movl	%edx,%ecx
	andl	$3,%ecx
	repe
	cmpsb
	je	2f
1:
	incl	%eax
2:
	popl	%esi
	popl	%edi
	ret


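/*
 * bcmp() contract, in C (illustrative only): return 0 iff the two
 * buffers are byte-for-byte equal and nonzero otherwise; unlike
 * memcmp() it promises no ordering.  The assembly compares 32-bit
 * words first, then the 0-3 trailing bytes.
 *
 *	int
 *	bcmp_c(const void *b1, const void *b2, size_t len)
 *	{
 *		const char *p1 = b1, *p2 = b2;
 *
 *		while (len-- != 0)
 *			if (*p1++ != *p2++)
 *				return (1);
 *		return (0);
 *	}
 */
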
/*
 * Handling of special 386 registers and descriptor tables etc
 */
/* void lgdt(struct region_descriptor *rdp); */
ENTRY(lgdt)
	/* reload the descriptor table */
	movl	4(%esp),%eax
	lgdt	(%eax)

	/* flush the prefetch q */
	jmp	1f
	nop
1:
	/* reload "stale" selectors */
	movl	$KDSEL,%eax
	mov	%ax,%ds
	mov	%ax,%es
	mov	%ax,%gs
	mov	%ax,%ss
	movl	$KPSEL,%eax
	mov	%ax,%fs

	/* reload code selector by turning return into intersegmental return */
	movl	(%esp),%eax
	pushl	%eax
	movl	$KCSEL,4(%esp)
	lret

/* ssdtosd(*ssdp,*sdp) */
ENTRY(ssdtosd)
	pushl	%ebx
	movl	8(%esp),%ecx
	movl	8(%ecx),%ebx
	shll	$16,%ebx
	movl	(%ecx),%edx
	roll	$16,%edx
	movb	%dh,%bl
	movb	%dl,%bh
	rorl	$8,%ebx
	movl	4(%ecx),%eax
	movw	%ax,%dx
	andl	$0xf0000,%eax
	orl	%eax,%ebx
	movl	12(%esp),%ecx
	movl	%edx,(%ecx)
	movl	%ebx,4(%ecx)
	popl	%ebx
	ret

/* void reset_dbregs() */
ENTRY(reset_dbregs)
	movl    $0,%eax
	movl    %eax,%dr7     /* disable all breakpoints first */
	movl    %eax,%dr0
	movl    %eax,%dr1
	movl    %eax,%dr2
	movl    %eax,%dr3
	movl    %eax,%dr6
	ret

/*****************************************************************************/
/* setjump, longjump                                                         */
/*****************************************************************************/

ENTRY(setjmp)
	movl	4(%esp),%eax
	movl	%ebx,(%eax)			/* save ebx */
	movl	%esp,4(%eax)			/* save esp */
	movl	%ebp,8(%eax)			/* save ebp */
	movl	%esi,12(%eax)			/* save esi */
	movl	%edi,16(%eax)			/* save edi */
	movl	(%esp),%edx			/* get rta */
	movl	%edx,20(%eax)			/* save eip */
	xorl	%eax,%eax			/* return(0); */
	ret

ENTRY(longjmp)
	movl	4(%esp),%eax
	movl	(%eax),%ebx			/* restore ebx */
	movl	4(%eax),%esp			/* restore esp */
	movl	8(%eax),%ebp			/* restore ebp */
	movl	12(%eax),%esi			/* restore esi */
	movl	16(%eax),%edi			/* restore edi */
	movl	20(%eax),%edx			/* get rta */
	movl	%edx,(%esp)			/* put in return frame */
	xorl	%eax,%eax			/* return(1); */
	incl	%eax
	ret

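/*
 * Typical usage of the pair above (illustrative; the buffer type is a
 * stand-in for the six-word save area, and these are not the userland
 * setjmp()/longjmp()): setjmp() returns 0 when the context is saved
 * and 1 when re-entered via longjmp(), so the caller can distinguish
 * the two paths.  Only %ebx/%esp/%ebp/%esi/%edi/%eip are preserved.
 *
 *	static struct { int regs[6]; } jb;	// illustrative type
 *
 *	if (setjmp(&jb) == 0) {
 *		// normal path; an error path may call longjmp(&jb)
 *	} else {
 *		// control resumes here, setjmp() having returned 1
 *	}
 */
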
/*
 * Support for BB-profiling (gcc -a).  The kernbb program will extract
 * the data from the kernel.
 */

	.data
	ALIGN_DATA
	.globl bbhead
bbhead:
	.long 0

	.text
NON_GPROF_ENTRY(__bb_init_func)
	movl	4(%esp),%eax
	movl	$1,(%eax)
	movl	bbhead,%edx
	movl	%edx,16(%eax)
	movl	%eax,bbhead
	NON_GPROF_RET
1667