/*	$NetBSD: startprog64.S,v 1.5 2023/06/24 05:31:04 msaitoh Exp $	*/
/*	NetBSD: startprog.S,v 1.3 2003/02/01 14:48:18 dsl Exp	*/

/* starts program in protected mode / flat space
 with given stackframe
 needs global variables flatcodeseg and flatdataseg
 (gdt offsets)
  derived from: NetBSD:sys/arch/i386/boot/asm.S
 */

/*
 * Ported to boot 386BSD by Julian Elischer (julian@tfs.com) Sept 1992
 *
 * Mach Operating System
 * Copyright (c) 1992, 1991 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */

/*
  Copyright 1988, 1989, 1990, 1991, 1992
   by Intel Corporation, Santa Clara, California.

                All Rights Reserved

Permission to use, copy, modify, and distribute this software and
its documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies and that both the copyright notice and this permission notice
appear in supporting documentation, and that the name of Intel
not be used in advertising or publicity pertaining to distribution
of the software without specific, written prior permission.

INTEL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE
INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS,
IN NO EVENT SHALL INTEL BE LIABLE FOR ANY SPECIAL, INDIRECT, OR
CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN ACTION OF CONTRACT,
NEGLIGENCE, OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/

#include <machine/asm.h>
#include <machine/specialreg.h>

#define	CODE_SEGMENT	0x08
#define	DATA_SEGMENT	0x10

	.align	16
	.globl _C_LABEL(startprog64)
_C_LABEL(startprog64):
	.quad 0

	.globl _C_LABEL(startprog64_size)
_C_LABEL(startprog64_size):
	.long startprog64_end - _C_LABEL(startprog64_start)

	.text
	.p2align 4,,15

/*
 * startprog64(loaddr,entry,stack,kern_load,kern_start,kern_size)
 */
ENTRY(startprog64_start)
start:
	/*
	 * This function calls the loaded kernel's start() in 32-bit
	 * segment mode, dropping down from x64 (long) mode.
	 * %rdi: kernel start address
	 * %rsi: loaded kernel address
	 * %rdx: stack address
	 * %rcx: loaded kernel size
	 * %r8 : loaded start address
	 * %r9 : kernel entry address
	 */
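
	/*
	 * Illustrative only: assuming the SysV AMD64 calling convention
	 * (arguments in %rdi, %rsi, %rdx, %rcx, %r8, %r9), a hypothetical
	 * C-side view of this blob, with argument names chosen here to
	 * match the register comments above (not taken from the caller),
	 * would be roughly:
	 *
	 *	typedef void (*startprog64_t)(uint64_t kernaddr,
	 *	    uint64_t loadaddr, uint64_t stack, uint64_t kernsize,
	 *	    uint64_t self, uint64_t entry);
	 *
	 *	(*(startprog64_t)copied_blob)(dst, src, sp, size,
	 *	    copied_blob, kernentry);
	 */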

	cld		/* LynxOS depends on it */

	cli

	/* skip copy if source and destination are the same */
	cmpq	%rdi,%rsi
	jz	.Lcopy_done

	/* Copy kernel */
	mov	%rcx, %r12		/* original kernel size */
	movq	%rdi, %r11		/* for misalignment check */

#if !defined(NO_OVERLAP)
	movq	%rdi, %r13
	subq	%rsi, %r13
#endif

	shrq	$3, %rcx		/* count for copy by words */
	jz	8f			/* j if less than 8 bytes */

	lea	-8(%rdi, %r12), %r14	/* target address of last 8 */
	mov	-8(%rsi, %r12), %r15	/* get last word */
#if !defined(NO_OVERLAP)
	cmpq	%r12, %r13		/* overlapping? */
	jb	10f
#endif

/*
 * Non-overlapping, copy forwards.
 * Newer Intel CPUs (Nehalem) will do 16-byte read/write transfers
 * if %ecx is more than 76.
 * AMD might do something similar some day.
 */
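/*
 * Rough C sketch of this forward path (illustrative; dst/src/len
 * stand for %rdi, %rsi and the original byte count in %r12):
 *
 *	uint64_t last = ((const uint64_t *)(src + len))[-1];
 *	for (size_t i = 0; i < len / 8; i++)
 *		((uint64_t *)dst)[i] = ((const uint64_t *)src)[i];
 *	((uint64_t *)(dst + len))[-1] = last;	// covers the 1..7 tail bytes
 */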
	and	$7, %r11		/* destination misaligned ? */
	jnz	2f
	rep
	movsq
	mov	%r15, (%r14)		/* write last word */
	jmp	.Lcopy_done

/*
 * Destination misaligned
 * AMD says it is better to align the destination (not the source).
 * This will also re-align copies if the source and dest are both
 * misaligned by the same amount.
 * (I think Nehalem will use its accelerated copy if the source
 * and destination have the same alignment.)
 */
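/*
 * Worked example (illustrative): with the destination misaligned by 3
 * (%r11 == 3) and len bytes to move, the unaligned first-word store
 * covers dst[0..7]; src and dst then advance by 8 - 3 = 5, leaving dst
 * 8-byte aligned, and (%r11 + len - 9) >> 3 aligned words are copied
 * before the saved last word covers the tail.
 */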
2:
	lea	-9(%r11, %r12), %rcx	/* post re-alignment count */
	neg	%r11			/* now -1 .. -7 */
	mov	(%rsi), %r12		/* get first word */
	mov	%rdi, %r13		/* target for first word */
	lea	8(%rsi, %r11), %rsi
	lea	8(%rdi, %r11), %rdi
	shr	$3, %rcx
	rep
	movsq
	mov	%r12, (%r13)		/* write first word */
	mov	%r15, (%r14)		/* write last word */
	jmp	.Lcopy_done

#if !defined(NO_OVERLAP)
/* Must copy backwards.
 * A hand-coded reverse copy could probably beat 'rep movsq' here,
 * since that requires (IIRC) an extra clock every 3 iterations (AMD).
 * However I don't suppose anything cares that much!
 * The big cost is the std/cld pair - reputedly 50+ cycles on Netburst P4.
 * The copy is aligned with the buffer start (more likely to
 * be a multiple of 8 than the end).
 */
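/*
 * Rough C sketch of the backward path (illustrative; the ranges
 * overlap with dst above src, so high words must be written first):
 *
 *	for (size_t i = len / 8; i-- > 0; )
 *		((uint64_t *)dst)[i] = ((const uint64_t *)src)[i];
 *	((uint64_t *)(dst + len))[-1] = last;	// tail; saved before any write
 */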
10:
	lea	-8(%rsi, %rcx, 8), %rsi
	lea	-8(%rdi, %rcx, 8), %rdi
	std
	rep
	movsq
	cld
	mov	%r15, (%r14)	/* write last word */
	jmp	.Lcopy_done
#endif

/* Less than 8 bytes to copy, copy by bytes */
/* Intel Nehalem optimises 'rep movsb' for <= 7 bytes (9-15 clocks).
 * For longer transfers it is 50+!
 */
8:	mov	%r12, %rcx

#if !defined(NO_OVERLAP)
	cmpq	%r12, %r13	/* overlapping? */
	jb	81f
#endif

	/* nope, copy forwards. */
	rep
	movsb
	jmp	.Lcopy_done

#if !defined(NO_OVERLAP)
/* Must copy backwards */
81:
	lea	-1(%rsi, %rcx), %rsi
	lea	-1(%rdi, %rcx), %rdi
	std
	rep
	movsb
	cld
#endif
	/* End of kernel copy */
.Lcopy_done:

	mov	%r8, %rdi	/* %rdi: loaded start address */
	mov	%r9, %rsi	/* %rsi: kernel entry address */

	/* Prepare jump address */
	lea	(start32a - start)(%rdi), %rax
	movl	%eax, (start32r - start)(%rdi)

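	/*
	 * The store above patches the offset half of the far pointer at
	 * start32r ({32-bit offset, CODE_SEGMENT selector}); it must be
	 * computed at run time because this blob executes from wherever
	 * it was copied.  The indirect ljmp below consumes that pointer
	 * to reload %cs with the 32-bit code segment.
	 */
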
	/* Setup GDT */
	lea	(gdt - start)(%rdi), %rax
	mov	%rax, (gdtrr - start)(%rdi)
	lgdt	(gdtr - start)(%rdi)

	/* Jump to set %cs */
	ljmp	*(start32r - start)(%rdi)

	.align	4
	.code32
start32a:
	movl	$DATA_SEGMENT, %eax
	movw	%ax, %ds
	movw	%ax, %es
	movw	%ax, %fs
	movw	%ax, %gs
	movw	%ax, %ss

	movl	%edx, %esp

	/* Disable Paging in CR0 */
	movl	%cr0, %eax
	andl	$(~CR0_PG), %eax
	movl	%eax, %cr0

	/* Disable PAE in CR4 */
	movl	%cr4, %eax
	andl	$(~CR4_PAE), %eax
	movl	%eax, %cr4
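
	/*
	 * Clearing CR0.PG while EFER.LME is set deactivates long mode
	 * (EFER.LMA drops to 0), so the ljmp above plus the two CR
	 * writes leave the CPU in plain 32-bit protected mode with
	 * paging off.  EFER.LME itself stays set but has no effect
	 * until paging is re-enabled.
	 */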

	jmp	start32b

	.align	4
start32b:
	xor	%eax, %eax
	call	*%esi

	.align	16
start32r:
	.long	0
	.long	CODE_SEGMENT
	.align	16
gdt:
	.long	0, 0			/* null descriptor */
	.byte	0xff, 0xff, 0x00, 0x00, 0x00, 0x9f, 0xcf, 0x00	/* 32-bit code, base 0, limit 4GB */
	.byte	0xff, 0xff, 0x00, 0x00, 0x00, 0x93, 0xcf, 0x00	/* 32-bit data, base 0, limit 4GB */
gdtr:
	.word	gdtr - gdt		/* GDT limit */
gdtrr:
	.quad	0			/* GDT base, patched at run time */
start32end:
	/* Space for the stack */
	.align	16
	.space	8192
startprog64_end:
