// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <unittest/unittest.h>

#include "register-set.h"

namespace {

// Write a NaN double value to the given uint64_t (which is how most of the
// registers are stored in the structs).
void WriteNaNDouble(uint64_t* output) {
    double nan_value = nan("");
    memcpy(output, &nan_value, sizeof(double));
}

} // namespace
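
// Note: NaN never compares equal to itself as a double, but these structs
// store the values as raw uint64_t bit patterns, so the *_expect_eq()
// helpers below still match NaN test values against each other.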

void general_regs_fill_test_values(zx_thread_state_general_regs_t* regs) {
    for (uint32_t index = 0; index < sizeof(*regs); ++index) {
        ((uint8_t*)regs)[index] = static_cast<uint8_t>(index + 1);
    }
// Set various flag bits that will read back the same.
#if defined(__x86_64__)
    // Here we set all flag bits that are modifiable from user space or
    // that are not modifiable but are expected to read back as 1, with the
    // exception of the trap flag (bit 8, which would interfere with
    // execution if we set it).
    //
    // Note that setting the direction flag (bit 10) helps test whether the
    // kernel correctly handles taking an interrupt when that flag is set
    // (see ZX-998).
    regs->rflags =
        (1 << 0) |  // CF: carry flag
        (1 << 1) |  // Reserved, always 1
        (1 << 2) |  // PF: parity flag
        (1 << 4) |  // AF: adjust flag
        (1 << 6) |  // ZF: zero flag
        (1 << 7) |  // SF: sign flag
        (1 << 9) |  // IF: interrupt enable flag (set by kernel)
        (1 << 10) | // DF: direction flag
        (1 << 11) | // OF: overflow flag
        (1 << 14) | // NT: nested task flag
        (1 << 18) | // AC: alignment check flag
        (1 << 21);  // ID: used for testing for CPUID support
#elif defined(__aarch64__)
    // Only set the 4 flag bits that are readable and writable by the
    // instructions "msr nzcv, REG" and "mrs REG, nzcv".
    regs->cpsr = 0xf0000000;
#endif
}
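
// Illustrative usage (sketch, not part of this file's API): a test can write
// the fill values into a suspended thread and later read them back. The
// |thread| handle is assumed to be supplied by the test.
//
//   zx_thread_state_general_regs_t regs;
//   general_regs_fill_test_values(&regs);
//   zx_thread_write_state(thread, ZX_THREAD_STATE_GENERAL_REGS,
//                         &regs, sizeof(regs));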

void fp_regs_fill_test_values(zx_thread_state_fp_regs_t* regs) {
    memset(regs, 0, sizeof(zx_thread_state_fp_regs_t));
#if defined(__x86_64__)
    for (size_t i = 0; i < 7; i++)
        regs->st[i].low = i;

    // Write NaN to the last value.
    WriteNaNDouble(&regs->st[7].low);
#elif defined(__aarch64__)
// No FP struct on ARM (vector only).
#else
#error Unsupported architecture
#endif
}

void vector_regs_fill_test_values(zx_thread_state_vector_regs_t* regs) {
    memset(regs, 0, sizeof(zx_thread_state_vector_regs_t));
#if defined(__x86_64__)
    for (uint64_t i = 0; i < 16; i++) {
        // Only set the first two 64-bit values (the low 128 bits), since only
        // the xmm registers are guaranteed to be present.
        regs->zmm[i].v[0] = i;
        regs->zmm[i].v[1] = i << 8;
        regs->zmm[i].v[2] = 0;
        regs->zmm[i].v[3] = 0;
    }

    // Write NaN to the last value.
    WriteNaNDouble(&regs->zmm[15].v[0]);
#elif defined(__aarch64__)
    for (uint64_t i = 0; i < 32; i++) {
        regs->v[i].low = i;
        regs->v[i].high = i << 8;
    }

    // Write NaN to the last value.
    WriteNaNDouble(&regs->v[31].low);
#else
#error Unsupported architecture
#endif
}

bool general_regs_expect_eq(const zx_thread_state_general_regs_t& regs1,
                            const zx_thread_state_general_regs_t& regs2) {
    BEGIN_HELPER;
#define CHECK_REG(FIELD) EXPECT_EQ(regs1.FIELD, regs2.FIELD, "Reg " #FIELD)
#if defined(__x86_64__)
    CHECK_REG(rax);
    CHECK_REG(rbx);
    CHECK_REG(rcx);
    CHECK_REG(rdx);
    CHECK_REG(rsi);
    CHECK_REG(rdi);
    CHECK_REG(rbp);
    CHECK_REG(rsp);
    CHECK_REG(r8);
    CHECK_REG(r9);
    CHECK_REG(r10);
    CHECK_REG(r11);
    CHECK_REG(r12);
    CHECK_REG(r13);
    CHECK_REG(r14);
    CHECK_REG(r15);
    CHECK_REG(rip);
    CHECK_REG(rflags);
#elif defined(__aarch64__)
    for (int regnum = 0; regnum < 30; ++regnum) {
        char name[10];
        snprintf(name, sizeof(name), "Reg r[%d]", regnum);
        EXPECT_EQ(regs1.r[regnum], regs2.r[regnum], name);
    }
    CHECK_REG(lr);
    CHECK_REG(sp);
    CHECK_REG(pc);
    CHECK_REG(cpsr);
#else
#error Unsupported architecture
#endif
#undef CHECK_REG
    END_HELPER;
}

bool fp_regs_expect_eq(const zx_thread_state_fp_regs_t& regs1,
                       const zx_thread_state_fp_regs_t& regs2) {
#if defined(__x86_64__)
    BEGIN_HELPER;

    // This just tests the MMX registers (the low 64 bits of each st entry).
    for (int i = 0; i < 8; ++i) {
        char name[16];
        snprintf(name, sizeof(name), "Reg st[%d].low", i);
        EXPECT_EQ(regs1.st[i].low, regs2.st[i].low, name);
    }

    END_HELPER;
#elif defined(__aarch64__)
    // No FP regs on ARM (uses vector regs for FP).
    (void)regs1;
    (void)regs2;
    return true;
#else
#error Unsupported architecture
#endif
}

bool vector_regs_expect_eq(const zx_thread_state_vector_regs_t& regs1,
                           const zx_thread_state_vector_regs_t& regs2) {
    BEGIN_HELPER;
#if defined(__x86_64__)
    // Only check the first 16 registers (guaranteed to work).
    for (int reg = 0; reg < 16; reg++) {
        // Only check the low 128 bits (guaranteed to work).
        EXPECT_EQ(regs1.zmm[reg].v[0], regs2.zmm[reg].v[0]);
        EXPECT_EQ(regs1.zmm[reg].v[1], regs2.zmm[reg].v[1]);
    }
#elif defined(__aarch64__)
    for (int i = 0; i < 32; i++) {
        EXPECT_EQ(regs1.v[i].high, regs2.v[i].high);
        EXPECT_EQ(regs1.v[i].low, regs2.v[i].low);
    }
#else
#error Unsupported architecture
#endif
    END_HELPER;
}
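
// Illustrative usage (sketch): after a thread running one of the spin
// functions below has been suspended, a test can read the state back and
// compare it against the fill values (|thread| handle assumed):
//
//   zx_thread_state_vector_regs_t out;
//   zx_thread_read_state(thread, ZX_THREAD_STATE_VECTOR_REGS,
//                        &out, sizeof(out));
//   EXPECT_TRUE(vector_regs_expect_eq(expected, out));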

// spin_with_general_regs() function.
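//
// Takes a pointer to a zx_thread_state_general_regs_t as its first argument,
// loads every general register from it, and then spins forever at
// spin_with_general_regs_spin_address so a test can suspend the thread and
// read the registers back.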
#if defined(__x86_64__)
static_assert(offsetof(zx_thread_state_general_regs_t, rax) == 8 * 0, "");
static_assert(offsetof(zx_thread_state_general_regs_t, rbx) == 8 * 1, "");
static_assert(offsetof(zx_thread_state_general_regs_t, rcx) == 8 * 2, "");
static_assert(offsetof(zx_thread_state_general_regs_t, rdx) == 8 * 3, "");
static_assert(offsetof(zx_thread_state_general_regs_t, rsi) == 8 * 4, "");
static_assert(offsetof(zx_thread_state_general_regs_t, rdi) == 8 * 5, "");
static_assert(offsetof(zx_thread_state_general_regs_t, rbp) == 8 * 6, "");
static_assert(offsetof(zx_thread_state_general_regs_t, rsp) == 8 * 7, "");
static_assert(offsetof(zx_thread_state_general_regs_t, r8) == 8 * 8, "");
static_assert(offsetof(zx_thread_state_general_regs_t, r9) == 8 * 9, "");
static_assert(offsetof(zx_thread_state_general_regs_t, r10) == 8 * 10, "");
static_assert(offsetof(zx_thread_state_general_regs_t, r11) == 8 * 11, "");
static_assert(offsetof(zx_thread_state_general_regs_t, r12) == 8 * 12, "");
static_assert(offsetof(zx_thread_state_general_regs_t, r13) == 8 * 13, "");
static_assert(offsetof(zx_thread_state_general_regs_t, r14) == 8 * 14, "");
static_assert(offsetof(zx_thread_state_general_regs_t, r15) == 8 * 15, "");
static_assert(offsetof(zx_thread_state_general_regs_t, rip) == 8 * 16, "");
static_assert(offsetof(zx_thread_state_general_regs_t, rflags) == 8 * 17, "");
static_assert(sizeof(zx_thread_state_general_regs_t) == 8 * 18, "");
__asm__(".pushsection .text, \"ax\", @progbits\n"
        ".global spin_with_general_regs\n"
        "spin_with_general_regs:\n"
        // Set flags using POPF.  Note that we use POPF rather than SAHF
        // because POPF is able to set more flags than SAHF.
        "pushq 8*17(%rdi)\n"
        "popfq\n"
        // Load general purpose registers.
        "movq 8*0(%rdi), %rax\n"
        "movq 8*1(%rdi), %rbx\n"
        "movq 8*2(%rdi), %rcx\n"
        "movq 8*3(%rdi), %rdx\n"
        "movq 8*4(%rdi), %rsi\n"
        // Skip assigning rdi here, since it holds the pointer to the
        // register struct, and assign it last.
        "movq 8*6(%rdi), %rbp\n"
        "movq 8*7(%rdi), %rsp\n"
        "movq 8*8(%rdi), %r8\n"
        "movq 8*9(%rdi), %r9\n"
        "movq 8*10(%rdi), %r10\n"
        "movq 8*11(%rdi), %r11\n"
        "movq 8*12(%rdi), %r12\n"
        "movq 8*13(%rdi), %r13\n"
        "movq 8*14(%rdi), %r14\n"
        "movq 8*15(%rdi), %r15\n"
        "movq 8*5(%rdi), %rdi\n"
        ".global spin_with_general_regs_spin_address\n"
        "spin_with_general_regs_spin_address:\n"
        "jmp spin_with_general_regs_spin_address\n"
        ".popsection\n");
#elif defined(__aarch64__)
static_assert(offsetof(zx_thread_state_general_regs_t, r[0]) == 8 * 0, "");
static_assert(offsetof(zx_thread_state_general_regs_t, r[1]) == 8 * 1, "");
static_assert(offsetof(zx_thread_state_general_regs_t, lr) == 8 * 30, "");
static_assert(offsetof(zx_thread_state_general_regs_t, sp) == 8 * 31, "");
static_assert(offsetof(zx_thread_state_general_regs_t, pc) == 8 * 32, "");
static_assert(offsetof(zx_thread_state_general_regs_t, cpsr) == 8 * 33, "");
static_assert(sizeof(zx_thread_state_general_regs_t) == 8 * 34, "");
__asm__(".pushsection .text, \"ax\", %progbits\n"
        ".global spin_with_general_regs\n"
        "spin_with_general_regs:\n"
        // Load sp via a temporary register.
        "ldr x1, [x0, #8*31]\n"
        "mov sp, x1\n"
        // Load NZCV flags, a subset of the PSTATE/CPSR register.
        "ldr x1, [x0, #8*33]\n"
        "msr nzcv, x1\n"
        // Load general purpose registers.
        // Skip assigning x0 and x1 here and assign them last, since x0
        // holds the pointer to the register struct.
        "ldp x2, x3, [x0, #8*2]\n"
        "ldp x4, x5, [x0, #8*4]\n"
        "ldp x6, x7, [x0, #8*6]\n"
        "ldp x8, x9, [x0, #8*8]\n"
        "ldp x10, x11, [x0, #8*10]\n"
        "ldp x12, x13, [x0, #8*12]\n"
        "ldp x14, x15, [x0, #8*14]\n"
        "ldp x16, x17, [x0, #8*16]\n"
        "ldp x18, x19, [x0, #8*18]\n"
        "ldp x20, x21, [x0, #8*20]\n"
        "ldp x22, x23, [x0, #8*22]\n"
        "ldp x24, x25, [x0, #8*24]\n"
        "ldp x26, x27, [x0, #8*26]\n"
        "ldp x28, x29, [x0, #8*28]\n"
        "ldr x30, [x0, #8*30]\n"
        "ldp x0, x1, [x0]\n"
        ".global spin_with_general_regs_spin_address\n"
        "spin_with_general_regs_spin_address:\n"
        "b spin_with_general_regs_spin_address\n"
        ".popsection\n");
#else
#error Unsupported architecture
#endif

// spin_with_fp_regs() function.
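//
// Loads the MMX registers from the passed-in zx_thread_state_fp_regs_t and
// then spins at spin_with_fp_regs_spin_address so the state can be read back
// (on arm64 it only spins, since there is no separate FP state).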
#if defined(__x86_64__)
static_assert(offsetof(zx_thread_state_fp_regs_t, fcw) == 0, "");
static_assert(offsetof(zx_thread_state_fp_regs_t, fsw) == 2, "");
static_assert(offsetof(zx_thread_state_fp_regs_t, ftw) == 4, "");
static_assert(offsetof(zx_thread_state_fp_regs_t, fop) == 6, "");
static_assert(offsetof(zx_thread_state_fp_regs_t, fip) == 8, "");
static_assert(offsetof(zx_thread_state_fp_regs_t, fdp) == 16, "");
static_assert(offsetof(zx_thread_state_fp_regs_t, st) == 32, "");
__asm__(".pushsection .text, \"ax\", @progbits\n"
        ".global spin_with_fp_regs\n"
        "spin_with_fp_regs:\n"

        // rdi = &zx_thread_state_fp_regs_t.st[0]
        "lea 32(%rdi), %rdi\n"

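        // Seed xmm0 with an arbitrary marker value. The fp-regs checks only
        // compare the MMX (st) registers, so this value is never read back.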
        "movq $0x9999, %rax\n"
        "movq %rax, %xmm0\n"

        "movq 16*0(%rdi), %mm0\n"
        "movq 16*1(%rdi), %mm1\n"
        "movq 16*2(%rdi), %mm2\n"
        "movq 16*3(%rdi), %mm3\n"
        "movq 16*4(%rdi), %mm4\n"
        "movq 16*5(%rdi), %mm5\n"
        "movq 16*6(%rdi), %mm6\n"
        "movq 16*7(%rdi), %mm7\n"

        "spin_with_fp_regs_spin_address:\n"
        "jmp spin_with_fp_regs_spin_address\n"
        ".popsection\n");
#elif defined(__aarch64__)
// Just spins and does nothing. ARM64 doesn't define a separate FP state, but
// having this function lets the rest of the code stay platform-independent.
__asm__(".pushsection .text, \"ax\", %progbits\n"
        ".global spin_with_fp_regs\n"
        "spin_with_fp_regs:\n"

        // Do nothing.

        "spin_with_fp_regs_spin_address:\n"
        "b spin_with_fp_regs_spin_address\n"
        ".popsection\n");
#else
#error Unsupported architecture
#endif

// spin_with_vector_regs() function.
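//
// Loads the vector registers (xmm on x86-64; v0-v31 plus fpcr/fpsr on arm64)
// from the passed-in zx_thread_state_vector_regs_t, then spins at
// spin_with_vector_regs_spin_address.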
#if defined(__x86_64__)
__asm__(".pushsection .text, \"ax\", @progbits\n"
        ".global spin_with_vector_regs\n"
        "spin_with_vector_regs:\n"

        // rdi points to zmm[0] on entry. This only loads the xmm registers,
        // which are all that is guaranteed to exist.
        // Each zmm input is 512 bits = 64 bytes.
        "movdqu 64*0(%rdi), %xmm0\n"
        "movdqu 64*1(%rdi), %xmm1\n"
        "movdqu 64*2(%rdi), %xmm2\n"
        "movdqu 64*3(%rdi), %xmm3\n"
        "movdqu 64*4(%rdi), %xmm4\n"
        "movdqu 64*5(%rdi), %xmm5\n"
        "movdqu 64*6(%rdi), %xmm6\n"
        "movdqu 64*7(%rdi), %xmm7\n"
        "movdqu 64*8(%rdi), %xmm8\n"
        "movdqu 64*9(%rdi), %xmm9\n"
        "movdqu 64*10(%rdi), %xmm10\n"
        "movdqu 64*11(%rdi), %xmm11\n"
        "movdqu 64*12(%rdi), %xmm12\n"
        "movdqu 64*13(%rdi), %xmm13\n"
        "movdqu 64*14(%rdi), %xmm14\n"
        "movdqu 64*15(%rdi), %xmm15\n"

        "spin_with_vector_regs_spin_address:\n"
        "jmp spin_with_vector_regs_spin_address\n"
        ".popsection\n");
#elif defined(__aarch64__)
static_assert(offsetof(zx_thread_state_vector_regs_t, fpcr) == 0, "");
static_assert(offsetof(zx_thread_state_vector_regs_t, fpsr) == 4, "");
static_assert(offsetof(zx_thread_state_vector_regs_t, v) == 8, "");
__asm__(".pushsection .text, \"ax\", %progbits\n"
        ".global spin_with_vector_regs\n"
        "spin_with_vector_regs:\n"

        // FPCR and FPSR are first.
        "ldp w1, w2, [x0]\n"
        "msr fpcr, x1\n"
        "msr fpsr, x2\n"

        // Skip to the vector registers.
        "add x0, x0, 8\n"

        // Each register is 128 bits = 16 bytes, so each pair is 32 bytes.
        "ldp q0, q1, [x0, #(0 * 32)]\n"
        "ldp q2, q3, [x0, #(1 * 32)]\n"
        "ldp q4, q5, [x0, #(2 * 32)]\n"
        "ldp q6, q7, [x0, #(3 * 32)]\n"
        "ldp q8, q9, [x0, #(4 * 32)]\n"
        "ldp q10, q11, [x0, #(5 * 32)]\n"
        "ldp q12, q13, [x0, #(6 * 32)]\n"
        "ldp q14, q15, [x0, #(7 * 32)]\n"
        "ldp q16, q17, [x0, #(8 * 32)]\n"
        "ldp q18, q19, [x0, #(9 * 32)]\n"
        "ldp q20, q21, [x0, #(10 * 32)]\n"
        "ldp q22, q23, [x0, #(11 * 32)]\n"
        "ldp q24, q25, [x0, #(12 * 32)]\n"
        "ldp q26, q27, [x0, #(13 * 32)]\n"
        "ldp q28, q29, [x0, #(14 * 32)]\n"
        "ldp q30, q31, [x0, #(15 * 32)]\n"

        "spin_with_vector_regs_spin_address:\n"
        "b spin_with_vector_regs_spin_address\n"
        ".popsection\n");
#else
#error Unsupported architecture
#endif

// save_general_regs_and_exit_thread() function.
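//
// Expects a zx_thread_state_general_regs_t buffer at the top of the stack,
// stores all of the general registers into it, and then calls
// zx_thread_exit() so the test can read the saved values back out.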
#if defined(__x86_64__)
__asm__(".pushsection .text,\"ax\", @progbits\n"
        ".global save_general_regs_and_exit_thread\n"
        "save_general_regs_and_exit_thread:\n"
        "movq %rax, 8*0(%rsp)\n"
        "movq %rbx, 8*1(%rsp)\n"
        "movq %rcx, 8*2(%rsp)\n"
        "movq %rdx, 8*3(%rsp)\n"
        "movq %rsi, 8*4(%rsp)\n"
        "movq %rdi, 8*5(%rsp)\n"
        "movq %rbp, 8*6(%rsp)\n"
        "movq %rsp, 8*7(%rsp)\n"
        "movq %r8, 8*8(%rsp)\n"
        "movq %r9, 8*9(%rsp)\n"
        "movq %r10, 8*10(%rsp)\n"
        "movq %r11, 8*11(%rsp)\n"
        "movq %r12, 8*12(%rsp)\n"
        "movq %r13, 8*13(%rsp)\n"
        "movq %r14, 8*14(%rsp)\n"
        "movq %r15, 8*15(%rsp)\n"
        // Save the flags register.
        "pushfq\n"
        "popq %rax\n"
        "movq %rax, 8*17(%rsp)\n"
        // Fill out the rip field with a known value.
        "leaq save_general_regs_and_exit_thread(%rip), %rax\n"
        "movq %rax, 8*16(%rsp)\n"
        "call zx_thread_exit@PLT\n"
        "ud2\n"
        ".popsection\n");
#elif defined(__aarch64__)
__asm__(".pushsection .text, \"ax\", %progbits\n"
        ".global save_general_regs_and_exit_thread\n"
        "save_general_regs_and_exit_thread:\n"
        "stp x0, x1, [sp, #8*0]\n"
        "stp x2, x3, [sp, #8*2]\n"
        "stp x4, x5, [sp, #8*4]\n"
        "stp x6, x7, [sp, #8*6]\n"
        "stp x8, x9, [sp, #8*8]\n"
        "stp x10, x11, [sp, #8*10]\n"
        "stp x12, x13, [sp, #8*12]\n"
        "stp x14, x15, [sp, #8*14]\n"
        "stp x16, x17, [sp, #8*16]\n"
        "stp x18, x19, [sp, #8*18]\n"
        "stp x20, x21, [sp, #8*20]\n"
        "stp x22, x23, [sp, #8*22]\n"
        "stp x24, x25, [sp, #8*24]\n"
        "stp x26, x27, [sp, #8*26]\n"
        "stp x28, x29, [sp, #8*28]\n"
        "str x30, [sp, #8*30]\n"
        // Save the sp register.
        "mov x0, sp\n"
        "str x0, [sp, #8*31]\n"
        // Fill out the pc field with a known value.
        "adr x0, save_general_regs_and_exit_thread\n"
        "str x0, [sp, #8*32]\n"
        // Save NZCV flags, a subset of the PSTATE/CPSR register.
        "mrs x0, nzcv\n"
        "str x0, [sp, #8*33]\n"
        "bl zx_thread_exit\n"
        "brk 0\n"
        ".popsection\n");
#else
#error Unsupported architecture
#endif

// save_fp_regs_and_exit_thread() function.
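//
// Stores the MMX registers into the zx_thread_state_fp_regs_t buffer at the
// top of the stack (a no-op on arm64), then exits the thread.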
#if defined(__x86_64__)
static_assert(offsetof(zx_thread_state_fp_regs_t, st) == 32, "");
__asm__(".pushsection .text,\"ax\", @progbits\n"
        ".global save_fp_regs_and_exit_thread\n"
        "save_fp_regs_and_exit_thread:\n"

        // This only saves the low 64 bits of each st entry, which is the MMX
        // register. Each entry in the struct is 128 bits, so we advance 16
        // bytes per register. The 32-byte offset is where the st registers
        // start in the struct (see the static_assert above).
        "movq %mm0, 32 + 16*0(%rsp)\n"
        "movq %mm1, 32 + 16*1(%rsp)\n"
        "movq %mm2, 32 + 16*2(%rsp)\n"
        "movq %mm3, 32 + 16*3(%rsp)\n"
        "movq %mm4, 32 + 16*4(%rsp)\n"
        "movq %mm5, 32 + 16*5(%rsp)\n"
        "movq %mm6, 32 + 16*6(%rsp)\n"
        "movq %mm7, 32 + 16*7(%rsp)\n"

        "call zx_thread_exit@PLT\n"
        "ud2\n"
        ".popsection\n");
#elif defined(__aarch64__)
__asm__(".pushsection .text, \"ax\", %progbits\n"
        ".global save_fp_regs_and_exit_thread\n"
        "save_fp_regs_and_exit_thread:\n"

        // Does nothing (no FP values).

        "bl zx_thread_exit\n"
        "brk 0\n"
        ".popsection\n");
#else
#error Unsupported architecture
#endif

// save_vector_regs_and_exit_thread() function.
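//
// Stores the vector registers into the zx_thread_state_vector_regs_t buffer
// at the top of the stack, then exits the thread.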
#if defined(__x86_64__)
static_assert(offsetof(zx_thread_state_vector_regs_t, zmm) == 0, "");
__asm__(".pushsection .text,\"ax\", @progbits\n"
        ".global save_vector_regs_and_exit_thread\n"
        "save_vector_regs_and_exit_thread:\n"

        // Each vector slot is 512 bits (64 bytes). We only save the low 128
        // bits (the xmm registers).
        "movdqu %xmm0, 64*0(%rsp)\n"
        "movdqu %xmm1, 64*1(%rsp)\n"
        "movdqu %xmm2, 64*2(%rsp)\n"
        "movdqu %xmm3, 64*3(%rsp)\n"
        "movdqu %xmm4, 64*4(%rsp)\n"
        "movdqu %xmm5, 64*5(%rsp)\n"
        "movdqu %xmm6, 64*6(%rsp)\n"
        "movdqu %xmm7, 64*7(%rsp)\n"
        "movdqu %xmm8, 64*8(%rsp)\n"
        "movdqu %xmm9, 64*9(%rsp)\n"
        "movdqu %xmm10, 64*10(%rsp)\n"
        "movdqu %xmm11, 64*11(%rsp)\n"
        "movdqu %xmm12, 64*12(%rsp)\n"
        "movdqu %xmm13, 64*13(%rsp)\n"
        "movdqu %xmm14, 64*14(%rsp)\n"
        "movdqu %xmm15, 64*15(%rsp)\n"

        "call zx_thread_exit@PLT\n"
        "ud2\n"
        ".popsection\n");
#elif defined(__aarch64__)
__asm__(".pushsection .text, \"ax\", %progbits\n"
        ".global save_vector_regs_and_exit_thread\n"
        "save_vector_regs_and_exit_thread:\n"

        // The buffer to save into is at the top of the stack.
        "mov x0, sp\n"

        // FPCR and FPSR are stored first.
        "mrs x1, fpcr\n"
        "mrs x2, fpsr\n"
        "stp w1, w2, [x0]\n"

        // Skip to the vector registers.
        "add x0, x0, 8\n"

        // Each register is 128 bits = 16 bytes, so each pair is 32 bytes.
        "stp q0, q1, [x0, #(0 * 32)]\n"
        "stp q2, q3, [x0, #(1 * 32)]\n"
        "stp q4, q5, [x0, #(2 * 32)]\n"
        "stp q6, q7, [x0, #(3 * 32)]\n"
        "stp q8, q9, [x0, #(4 * 32)]\n"
        "stp q10, q11, [x0, #(5 * 32)]\n"
        "stp q12, q13, [x0, #(6 * 32)]\n"
        "stp q14, q15, [x0, #(7 * 32)]\n"
        "stp q16, q17, [x0, #(8 * 32)]\n"
        "stp q18, q19, [x0, #(9 * 32)]\n"
        "stp q20, q21, [x0, #(10 * 32)]\n"
        "stp q22, q23, [x0, #(11 * 32)]\n"
        "stp q24, q25, [x0, #(12 * 32)]\n"
        "stp q26, q27, [x0, #(13 * 32)]\n"
        "stp q28, q29, [x0, #(14 * 32)]\n"
        "stp q30, q31, [x0, #(15 * 32)]\n"

        "bl zx_thread_exit\n"
        "brk 0\n"
        ".popsection\n");
#else
#error Unsupported architecture
#endif