1/*	$NetBSD: sljitNativeX86_common.c,v 1.10 2021/11/30 12:32:09 christos Exp $	*/
2
3/*
4 *    Stack-less Just-In-Time compiler
5 *
6 *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without modification, are
9 * permitted provided that the following conditions are met:
10 *
11 *   1. Redistributions of source code must retain the above copyright notice, this list of
12 *      conditions and the following disclaimer.
13 *
14 *   2. Redistributions in binary form must reproduce the above copyright notice, this list
15 *      of conditions and the following disclaimer in the documentation and/or other materials
16 *      provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
19 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
21 * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
23 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
24 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
30{
31	return "x86" SLJIT_CPUINFO;
32}
33
34/*
35   32b register indexes:
36     0 - EAX
37     1 - ECX
38     2 - EDX
39     3 - EBX
40     4 - none
41     5 - EBP
42     6 - ESI
43     7 - EDI
44*/
45
46/*
47   64b register indexes:
48     0 - RAX
49     1 - RCX
50     2 - RDX
51     3 - RBX
52     4 - none
53     5 - RBP
54     6 - RSI
55     7 - RDI
56     8 - R8   - From now on REX prefix is required
57     9 - R9
58    10 - R10
59    11 - R11
60    12 - R12
61    13 - R13
62    14 - R14
63    15 - R15
64*/
65
66#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
67
68/* Last register + 1. */
69#define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
70
71static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
72	0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 7, 6, 3, 4, 5
73};
74
75#define CHECK_EXTRA_REGS(p, w, do) \
76	if (p >= SLJIT_R3 && p <= SLJIT_S3) { \
77		if (p <= compiler->scratches) \
78			w = compiler->saveds_offset - ((p) - SLJIT_R2) * (sljit_sw)sizeof(sljit_sw); \
79		else \
80			w = compiler->locals_offset + ((p) - SLJIT_S2) * (sljit_sw)sizeof(sljit_sw); \
81		p = SLJIT_MEM1(SLJIT_SP); \
82		do; \
83	}
84
85#else /* SLJIT_CONFIG_X86_32 */
86
87/* Last register + 1. */
88#define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
89#define TMP_REG2	(SLJIT_NUMBER_OF_REGISTERS + 3)
90#define TMP_REG3	(SLJIT_NUMBER_OF_REGISTERS + 4)
91
92/* Note: r12 & 0x7 == 0b100, which decoded as SIB byte present
93   Note: avoid to use r12 and r13 for memory addessing
94   therefore r12 is better for SAVED_EREG than SAVED_REG. */
95#ifndef _WIN64
96/* 1st passed in rdi, 2nd argument passed in rsi, 3rd in rdx. */
97static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
98	0, 0, 6, 1, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 7, 9
99};
100/* low-map. reg_map & 0x7. */
101static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
102	0, 0, 6, 1, 0, 3,  2,  4,  5,  5,  6,  7, 3, 4, 2, 7, 1
103};
104#else
105/* 1st passed in rcx, 2nd argument passed in rdx, 3rd in r8. */
106static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
107	0, 0, 2, 1, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 10, 8, 9
108};
109/* low-map. reg_map & 0x7. */
110static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
111	0, 0, 2, 1, 3,  4,  5,  5, 6,  7,  7, 6, 3, 4, 2,  0, 1
112};
113#endif
114
115#define REX_W		0x48
116#define REX_R		0x44
117#define REX_X		0x42
118#define REX_B		0x41
119#define REX		0x40
120
121#ifndef _WIN64
122#define HALFWORD_MAX 0x7fffffffl
123#define HALFWORD_MIN -0x80000000l
124#else
125#define HALFWORD_MAX 0x7fffffffll
126#define HALFWORD_MIN -0x80000000ll
127#endif
128
129#define IS_HALFWORD(x)		((x) <= HALFWORD_MAX && (x) >= HALFWORD_MIN)
130#define NOT_HALFWORD(x)		((x) > HALFWORD_MAX || (x) < HALFWORD_MIN)
131
132#define CHECK_EXTRA_REGS(p, w, do)
133
134#endif /* SLJIT_CONFIG_X86_32 */
135
136#define TMP_FREG	(0)
137
138/* Size flags for emit_x86_instruction: */
139#define EX86_BIN_INS		0x0010
140#define EX86_SHIFT_INS		0x0020
141#define EX86_REX		0x0040
142#define EX86_NO_REXW		0x0080
143#define EX86_BYTE_ARG		0x0100
144#define EX86_HALF_ARG		0x0200
145#define EX86_PREF_66		0x0400
146#define EX86_PREF_F2		0x0800
147#define EX86_PREF_F3		0x1000
148#define EX86_SSE2_OP1		0x2000
149#define EX86_SSE2_OP2		0x4000
150#define EX86_SSE2		(EX86_SSE2_OP1 | EX86_SSE2_OP2)
151
152/* --------------------------------------------------------------------- */
153/*  Instrucion forms                                                     */
154/* --------------------------------------------------------------------- */
155
156#define ADD		(/* BINARY */ 0 << 3)
157#define ADD_EAX_i32	0x05
158#define ADD_r_rm	0x03
159#define ADD_rm_r	0x01
160#define ADDSD_x_xm	0x58
161#define ADC		(/* BINARY */ 2 << 3)
162#define ADC_EAX_i32	0x15
163#define ADC_r_rm	0x13
164#define ADC_rm_r	0x11
165#define AND		(/* BINARY */ 4 << 3)
166#define AND_EAX_i32	0x25
167#define AND_r_rm	0x23
168#define AND_rm_r	0x21
169#define ANDPD_x_xm	0x54
170#define BSR_r_rm	(/* GROUP_0F */ 0xbd)
171#define CALL_i32	0xe8
172#define CALL_rm		(/* GROUP_FF */ 2 << 3)
173#define CDQ		0x99
174#define CMOVNE_r_rm	(/* GROUP_0F */ 0x45)
175#define CMP		(/* BINARY */ 7 << 3)
176#define CMP_EAX_i32	0x3d
177#define CMP_r_rm	0x3b
178#define CMP_rm_r	0x39
179#define CVTPD2PS_x_xm	0x5a
180#define CVTSI2SD_x_rm	0x2a
181#define CVTTSD2SI_r_xm	0x2c
182#define DIV		(/* GROUP_F7 */ 6 << 3)
183#define DIVSD_x_xm	0x5e
184#define INT3		0xcc
185#define IDIV		(/* GROUP_F7 */ 7 << 3)
186#define IMUL		(/* GROUP_F7 */ 5 << 3)
187#define IMUL_r_rm	(/* GROUP_0F */ 0xaf)
188#define IMUL_r_rm_i8	0x6b
189#define IMUL_r_rm_i32	0x69
190#define JE_i8		0x74
191#define JNE_i8		0x75
192#define JMP_i8		0xeb
193#define JMP_i32		0xe9
194#define JMP_rm		(/* GROUP_FF */ 4 << 3)
195#define LEA_r_m		0x8d
196#define MOV_r_rm	0x8b
197#define MOV_r_i32	0xb8
198#define MOV_rm_r	0x89
199#define MOV_rm_i32	0xc7
200#define MOV_rm8_i8	0xc6
201#define MOV_rm8_r8	0x88
202#define MOVSD_x_xm	0x10
203#define MOVSD_xm_x	0x11
204#define MOVSXD_r_rm	0x63
205#define MOVSX_r_rm8	(/* GROUP_0F */ 0xbe)
206#define MOVSX_r_rm16	(/* GROUP_0F */ 0xbf)
207#define MOVZX_r_rm8	(/* GROUP_0F */ 0xb6)
208#define MOVZX_r_rm16	(/* GROUP_0F */ 0xb7)
209#define MUL		(/* GROUP_F7 */ 4 << 3)
210#define MULSD_x_xm	0x59
211#define NEG_rm		(/* GROUP_F7 */ 3 << 3)
212#define NOP		0x90
213#define NOT_rm		(/* GROUP_F7 */ 2 << 3)
214#define OR		(/* BINARY */ 1 << 3)
215#define OR_r_rm		0x0b
216#define OR_EAX_i32	0x0d
217#define OR_rm_r		0x09
218#define OR_rm8_r8	0x08
219#define POP_r		0x58
220#define POP_rm		0x8f
221#define POPF		0x9d
222#define PUSH_i32	0x68
223#define PUSH_r		0x50
224#define PUSH_rm		(/* GROUP_FF */ 6 << 3)
225#define PUSHF		0x9c
226#define RET_near	0xc3
227#define RET_i16		0xc2
228#define SBB		(/* BINARY */ 3 << 3)
229#define SBB_EAX_i32	0x1d
230#define SBB_r_rm	0x1b
231#define SBB_rm_r	0x19
232#define SAR		(/* SHIFT */ 7 << 3)
233#define SHL		(/* SHIFT */ 4 << 3)
234#define SHR		(/* SHIFT */ 5 << 3)
235#define SUB		(/* BINARY */ 5 << 3)
236#define SUB_EAX_i32	0x2d
237#define SUB_r_rm	0x2b
238#define SUB_rm_r	0x29
239#define SUBSD_x_xm	0x5c
240#define TEST_EAX_i32	0xa9
241#define TEST_rm_r	0x85
242#define UCOMISD_x_xm	0x2e
243#define UNPCKLPD_x_xm	0x14
244#define XCHG_EAX_r	0x90
245#define XCHG_r_rm	0x87
246#define XOR		(/* BINARY */ 6 << 3)
247#define XOR_EAX_i32	0x35
248#define XOR_r_rm	0x33
249#define XOR_rm_r	0x31
250#define XORPD_x_xm	0x57
251
252#define GROUP_0F	0x0f
253#define GROUP_F7	0xf7
254#define GROUP_FF	0xff
255#define GROUP_BINARY_81	0x81
256#define GROUP_BINARY_83	0x83
257#define GROUP_SHIFT_1	0xd1
258#define GROUP_SHIFT_N	0xc1
259#define GROUP_SHIFT_CL	0xd3
260
261#define MOD_REG		0xc0
262#define MOD_DISP8	0x40
263
264#define INC_SIZE(s)			(*inst++ = (s), compiler->size += (s))
265
266#define PUSH_REG(r)			(*inst++ = (PUSH_r + (r)))
267#define POP_REG(r)			(*inst++ = (POP_r + (r)))
268#define RET()				(*inst++ = (RET_near))
269#define RET_I16(n)			(*inst++ = (RET_i16), *inst++ = n, *inst++ = 0)
270/* r32, r/m32 */
271#define MOV_RM(mod, reg, rm)		(*inst++ = (MOV_r_rm), *inst++ = (mod) << 6 | (reg) << 3 | (rm))
272
273/* Multithreading does not affect these static variables, since they store
274   built-in CPU features. Therefore they can be overwritten by different threads
275   if they detect the CPU features in the same time. */
276#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
277static sljit_s32 cpu_has_sse2 = -1;
278#endif
279static sljit_s32 cpu_has_cmov = -1;
280
281#ifdef _WIN32_WCE
282#include <cmnintrin.h>
283#elif defined(_MSC_VER) && _MSC_VER >= 1400
284#include <intrin.h>
285#endif
286
287/******************************************************/
288/*    Unaligned-store functions                       */
289/******************************************************/
290
291static SLJIT_INLINE void sljit_unaligned_store_s16(void *addr, sljit_s16 value)
292{
293	SLJIT_MEMCPY(addr, &value, sizeof(value));
294}
295
296static SLJIT_INLINE void sljit_unaligned_store_s32(void *addr, sljit_s32 value)
297{
298	SLJIT_MEMCPY(addr, &value, sizeof(value));
299}
300
301static SLJIT_INLINE void sljit_unaligned_store_sw(void *addr, sljit_sw value)
302{
303	SLJIT_MEMCPY(addr, &value, sizeof(value));
304}
305
306/******************************************************/
307/*    Utility functions                               */
308/******************************************************/
309
310static void get_cpu_features(void)
311{
312	sljit_u32 features;
313
314#if defined(_MSC_VER) && _MSC_VER >= 1400
315
316	int CPUInfo[4];
317	__cpuid(CPUInfo, 1);
318	features = (sljit_u32)CPUInfo[3];
319
320#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C) || defined(__lint__)
321
322	/* AT&T syntax. */
323	__asm__ (
324		"movl $0x1, %%eax\n"
325#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
326		/* On x86-32, there is no red zone, so this
327		   should work (no need for a local variable). */
328		"push %%ebx\n"
329#endif
330		"cpuid\n"
331#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
332		"pop %%ebx\n"
333#endif
334		"movl %%edx, %0\n"
335		: "=g" (features)
336		:
337#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
338		: "%eax", "%ecx", "%edx"
339#else
340		: "%rax", "%rbx", "%rcx", "%rdx"
341#endif
342	);
343
344#else /* _MSC_VER && _MSC_VER >= 1400 */
345
346	/* Intel syntax. */
347	__asm {
348		mov eax, 1
349		cpuid
350		mov features, edx
351	}
352
353#endif /* _MSC_VER && _MSC_VER >= 1400 */
354
355#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
356	cpu_has_sse2 = (features >> 26) & 0x1;
357#endif
358	cpu_has_cmov = (features >> 15) & 0x1;
359}
360
361static sljit_u8 get_jump_code(sljit_s32 type)
362{
363	switch (type) {
364	case SLJIT_EQUAL:
365	case SLJIT_EQUAL_F64:
366		return 0x84 /* je */;
367
368	case SLJIT_NOT_EQUAL:
369	case SLJIT_NOT_EQUAL_F64:
370		return 0x85 /* jne */;
371
372	case SLJIT_LESS:
373	case SLJIT_LESS_F64:
374		return 0x82 /* jc */;
375
376	case SLJIT_GREATER_EQUAL:
377	case SLJIT_GREATER_EQUAL_F64:
378		return 0x83 /* jae */;
379
380	case SLJIT_GREATER:
381	case SLJIT_GREATER_F64:
382		return 0x87 /* jnbe */;
383
384	case SLJIT_LESS_EQUAL:
385	case SLJIT_LESS_EQUAL_F64:
386		return 0x86 /* jbe */;
387
388	case SLJIT_SIG_LESS:
389		return 0x8c /* jl */;
390
391	case SLJIT_SIG_GREATER_EQUAL:
392		return 0x8d /* jnl */;
393
394	case SLJIT_SIG_GREATER:
395		return 0x8f /* jnle */;
396
397	case SLJIT_SIG_LESS_EQUAL:
398		return 0x8e /* jle */;
399
400	case SLJIT_OVERFLOW:
401	case SLJIT_MUL_OVERFLOW:
402		return 0x80 /* jo */;
403
404	case SLJIT_NOT_OVERFLOW:
405	case SLJIT_MUL_NOT_OVERFLOW:
406		return 0x81 /* jno */;
407
408	case SLJIT_UNORDERED_F64:
409		return 0x8a /* jp */;
410
411	case SLJIT_ORDERED_F64:
412		return 0x8b /* jpo */;
413	}
414	return 0;
415}
416
417#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
418static sljit_u8* generate_far_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_s32 type, sljit_sw executable_offset);
419#else
420static sljit_u8* generate_far_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_s32 type);
421#endif
422
423static sljit_u8* generate_near_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_u8 *code, sljit_s32 type, sljit_sw executable_offset)
424{
425	sljit_s32 short_jump;
426	sljit_uw label_addr;
427
428	if (jump->flags & JUMP_LABEL)
429		label_addr = (sljit_uw)(code + jump->u.label->size);
430	else
431		label_addr = jump->u.target - executable_offset;
432
433	short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;
434
435#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
436	if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
437		return generate_far_jump_code(jump, code_ptr, type);
438#endif
439
440	if (type == SLJIT_JUMP) {
441		if (short_jump)
442			*code_ptr++ = JMP_i8;
443		else
444			*code_ptr++ = JMP_i32;
445		jump->addr++;
446	}
447	else if (type >= SLJIT_FAST_CALL) {
448		short_jump = 0;
449		*code_ptr++ = CALL_i32;
450		jump->addr++;
451	}
452	else if (short_jump) {
453		*code_ptr++ = get_jump_code(type) - 0x10;
454		jump->addr++;
455	}
456	else {
457		*code_ptr++ = GROUP_0F;
458		*code_ptr++ = get_jump_code(type);
459		jump->addr += 2;
460	}
461
462	if (short_jump) {
463		jump->flags |= PATCH_MB;
464		code_ptr += sizeof(sljit_s8);
465	} else {
466		jump->flags |= PATCH_MW;
467#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
468		code_ptr += sizeof(sljit_sw);
469#else
470		code_ptr += sizeof(sljit_s32);
471#endif
472	}
473
474	return code_ptr;
475}
476
477SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
478{
479	struct sljit_memory_fragment *buf;
480	sljit_u8 *code;
481	sljit_u8 *code_ptr;
482	sljit_u8 *buf_ptr;
483	sljit_u8 *buf_end;
484	sljit_u8 len;
485	sljit_sw executable_offset;
486	sljit_sw jump_addr;
487
488	struct sljit_label *label;
489	struct sljit_jump *jump;
490	struct sljit_const *const_;
491
492	CHECK_ERROR_PTR();
493	CHECK_PTR(check_sljit_generate_code(compiler));
494	reverse_buf(compiler);
495
496	/* Second code generation pass. */
497	code = (sljit_u8*)SLJIT_MALLOC_EXEC(compiler->size);
498	PTR_FAIL_WITH_EXEC_IF(code);
499	buf = compiler->buf;
500
501	code_ptr = code;
502	label = compiler->labels;
503	jump = compiler->jumps;
504	const_ = compiler->consts;
505	executable_offset = SLJIT_EXEC_OFFSET(code);
506
507	do {
508		buf_ptr = buf->memory;
509		buf_end = buf_ptr + buf->used_size;
510		do {
511			len = *buf_ptr++;
512			if (len > 0) {
513				/* The code is already generated. */
514				SLJIT_MEMCPY(code_ptr, buf_ptr, len);
515				code_ptr += len;
516				buf_ptr += len;
517			}
518			else {
519				if (*buf_ptr >= 2) {
520					jump->addr = (sljit_uw)code_ptr;
521					if (!(jump->flags & SLJIT_REWRITABLE_JUMP))
522						code_ptr = generate_near_jump_code(jump, code_ptr, code, *buf_ptr - 2, executable_offset);
523					else {
524#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
525						code_ptr = generate_far_jump_code(jump, code_ptr, *buf_ptr - 2, executable_offset);
526#else
527						code_ptr = generate_far_jump_code(jump, code_ptr, *buf_ptr - 2);
528#endif
529					}
530					jump = jump->next;
531				}
532				else if (*buf_ptr == 0) {
533					label->addr = ((sljit_uw)code_ptr) + executable_offset;
534					label->size = code_ptr - code;
535					label = label->next;
536				}
537				else { /* *buf_ptr is 1 */
538					const_->addr = ((sljit_uw)code_ptr) - sizeof(sljit_sw);
539					const_ = const_->next;
540				}
541				buf_ptr++;
542			}
543		} while (buf_ptr < buf_end);
544		SLJIT_ASSERT(buf_ptr == buf_end);
545		buf = buf->next;
546	} while (buf);
547
548	SLJIT_ASSERT(!label);
549	SLJIT_ASSERT(!jump);
550	SLJIT_ASSERT(!const_);
551
552	jump = compiler->jumps;
553	while (jump) {
554		jump_addr = jump->addr + executable_offset;
555
556		if (jump->flags & PATCH_MB) {
557			SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8))) >= -128 && (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8))) <= 127);
558			*(sljit_u8*)jump->addr = (sljit_u8)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8)));
559		} else if (jump->flags & PATCH_MW) {
560			if (jump->flags & JUMP_LABEL) {
561#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
562				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_sw))));
563#else
564				SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
565				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))));
566#endif
567			}
568			else {
569#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
570				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_sw))));
571#else
572				SLJIT_ASSERT((sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
573				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.target - (jump_addr + sizeof(sljit_s32))));
574#endif
575			}
576		}
577#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
578		else if (jump->flags & PATCH_MD)
579			sljit_unaligned_store_sw((void*)jump->addr, jump->u.label->addr);
580#endif
581
582		jump = jump->next;
583	}
584
585	/* Some space may be wasted because of short jumps. */
586	SLJIT_ASSERT(code_ptr <= code + compiler->size);
587	compiler->error = SLJIT_ERR_COMPILED;
588	compiler->executable_offset = executable_offset;
589	compiler->executable_size = code_ptr - code;
590	return (void*)(code + executable_offset);
591}
592
593/* --------------------------------------------------------------------- */
594/*  Operators                                                            */
595/* --------------------------------------------------------------------- */
596
597static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
598	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
599	sljit_s32 dst, sljit_sw dstw,
600	sljit_s32 src1, sljit_sw src1w,
601	sljit_s32 src2, sljit_sw src2w);
602
603static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
604	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
605	sljit_s32 dst, sljit_sw dstw,
606	sljit_s32 src1, sljit_sw src1w,
607	sljit_s32 src2, sljit_sw src2w);
608
609static sljit_s32 emit_mov(struct sljit_compiler *compiler,
610	sljit_s32 dst, sljit_sw dstw,
611	sljit_s32 src, sljit_sw srcw);
612
613#define EMIT_MOV(compiler, dst, dstw, src, srcw) \
614	FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
615
616#ifdef _WIN32
617#include <malloc.h>
618
619static void SLJIT_CALL sljit_grow_stack(sljit_sw local_size)
620{
621	/* Workaround for calling the internal _chkstk() function on Windows.
622	This function touches all 4k pages belongs to the requested stack space,
623	which size is passed in local_size. This is necessary on Windows where
624	the stack can only grow in 4k steps. However, this function just burn
625	CPU cycles if the stack is large enough. However, you don't know it in
626	advance, so it must always be called. I think this is a bad design in
627	general even if it has some reasons. */
628	*(volatile sljit_s32*)alloca(local_size) = 0;
629}
630
631#endif
632
633#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
634#include "sljitNativeX86_32.c"
635#else
636#include "sljitNativeX86_64.c"
637#endif
638
639static sljit_s32 emit_mov(struct sljit_compiler *compiler,
640	sljit_s32 dst, sljit_sw dstw,
641	sljit_s32 src, sljit_sw srcw)
642{
643	sljit_u8* inst;
644
645	if (dst == SLJIT_UNUSED) {
646		/* No destination, doesn't need to setup flags. */
647		if (src & SLJIT_MEM) {
648			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
649			FAIL_IF(!inst);
650			*inst = MOV_r_rm;
651		}
652		return SLJIT_SUCCESS;
653	}
654	if (FAST_IS_REG(src)) {
655		inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
656		FAIL_IF(!inst);
657		*inst = MOV_rm_r;
658		return SLJIT_SUCCESS;
659	}
660	if (src & SLJIT_IMM) {
661		if (FAST_IS_REG(dst)) {
662#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
663			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
664#else
665			if (!compiler->mode32) {
666				if (NOT_HALFWORD(srcw))
667					return emit_load_imm64(compiler, dst, srcw);
668			}
669			else
670				return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, MOV_r_i32 + reg_lmap[dst], srcw);
671#endif
672		}
673#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
674		if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
675			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, srcw));
676			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, dst, dstw);
677			FAIL_IF(!inst);
678			*inst = MOV_rm_r;
679			return SLJIT_SUCCESS;
680		}
681#endif
682		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
683		FAIL_IF(!inst);
684		*inst = MOV_rm_i32;
685		return SLJIT_SUCCESS;
686	}
687	if (FAST_IS_REG(dst)) {
688		inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
689		FAIL_IF(!inst);
690		*inst = MOV_r_rm;
691		return SLJIT_SUCCESS;
692	}
693
694	/* Memory to memory move. Requires two instruction. */
695	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
696	FAIL_IF(!inst);
697	*inst = MOV_r_rm;
698	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
699	FAIL_IF(!inst);
700	*inst = MOV_rm_r;
701	return SLJIT_SUCCESS;
702}
703
704SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
705{
706	sljit_u8 *inst;
707#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
708	sljit_s32 size;
709#endif
710
711	CHECK_ERROR();
712	CHECK(check_sljit_emit_op0(compiler, op));
713
714	switch (GET_OPCODE(op)) {
715	case SLJIT_BREAKPOINT:
716		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
717		FAIL_IF(!inst);
718		INC_SIZE(1);
719		*inst = INT3;
720		break;
721	case SLJIT_NOP:
722		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
723		FAIL_IF(!inst);
724		INC_SIZE(1);
725		*inst = NOP;
726		break;
727	case SLJIT_LMUL_UW:
728	case SLJIT_LMUL_SW:
729	case SLJIT_DIVMOD_UW:
730	case SLJIT_DIVMOD_SW:
731	case SLJIT_DIV_UW:
732	case SLJIT_DIV_SW:
733#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
734#ifdef _WIN64
735		SLJIT_ASSERT(
736			reg_map[SLJIT_R0] == 0
737			&& reg_map[SLJIT_R1] == 2
738			&& reg_map[TMP_REG1] > 7);
739#else
740		SLJIT_ASSERT(
741			reg_map[SLJIT_R0] == 0
742			&& reg_map[SLJIT_R1] < 7
743			&& reg_map[TMP_REG1] == 2);
744#endif
745		compiler->mode32 = op & SLJIT_I32_OP;
746#endif
747		SLJIT_COMPILE_ASSERT((SLJIT_DIVMOD_UW & 0x2) == 0 && SLJIT_DIV_UW - 0x2 == SLJIT_DIVMOD_UW, bad_div_opcode_assignments);
748
749		op = GET_OPCODE(op);
750		if ((op | 0x2) == SLJIT_DIV_UW) {
751#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
752			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
753			inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0);
754#else
755			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
756#endif
757			FAIL_IF(!inst);
758			*inst = XOR_r_rm;
759		}
760
761		if ((op | 0x2) == SLJIT_DIV_SW) {
762#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
763			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
764#endif
765
766#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
767			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
768			FAIL_IF(!inst);
769			INC_SIZE(1);
770			*inst = CDQ;
771#else
772			if (compiler->mode32) {
773				inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
774				FAIL_IF(!inst);
775				INC_SIZE(1);
776				*inst = CDQ;
777			} else {
778				inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
779				FAIL_IF(!inst);
780				INC_SIZE(2);
781				*inst++ = REX_W;
782				*inst = CDQ;
783			}
784#endif
785		}
786
787#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
788		inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
789		FAIL_IF(!inst);
790		INC_SIZE(2);
791		*inst++ = GROUP_F7;
792		*inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
793#else
794#ifdef _WIN64
795		size = (!compiler->mode32 || op >= SLJIT_DIVMOD_UW) ? 3 : 2;
796#else
797		size = (!compiler->mode32) ? 3 : 2;
798#endif
799		inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
800		FAIL_IF(!inst);
801		INC_SIZE(size);
802#ifdef _WIN64
803		if (!compiler->mode32)
804			*inst++ = REX_W | ((op >= SLJIT_DIVMOD_UW) ? REX_B : 0);
805		else if (op >= SLJIT_DIVMOD_UW)
806			*inst++ = REX_B;
807		*inst++ = GROUP_F7;
808		*inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
809#else
810		if (!compiler->mode32)
811			*inst++ = REX_W;
812		*inst++ = GROUP_F7;
813		*inst = MOD_REG | reg_map[SLJIT_R1];
814#endif
815#endif
816		switch (op) {
817		case SLJIT_LMUL_UW:
818			*inst |= MUL;
819			break;
820		case SLJIT_LMUL_SW:
821			*inst |= IMUL;
822			break;
823		case SLJIT_DIVMOD_UW:
824		case SLJIT_DIV_UW:
825			*inst |= DIV;
826			break;
827		case SLJIT_DIVMOD_SW:
828		case SLJIT_DIV_SW:
829			*inst |= IDIV;
830			break;
831		}
832#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
833		if (op <= SLJIT_DIVMOD_SW)
834			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
835#else
836		if (op >= SLJIT_DIV_UW)
837			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
838#endif
839		break;
840	}
841
842	return SLJIT_SUCCESS;
843}
844
845#define ENCODE_PREFIX(prefix) \
846	do { \
847		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1); \
848		FAIL_IF(!inst); \
849		INC_SIZE(1); \
850		*inst = (prefix); \
851	} while (0)
852
853static sljit_s32 emit_mov_byte(struct sljit_compiler *compiler, sljit_s32 sign,
854	sljit_s32 dst, sljit_sw dstw,
855	sljit_s32 src, sljit_sw srcw)
856{
857	sljit_u8* inst;
858	sljit_s32 dst_r;
859#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
860	sljit_s32 work_r;
861#endif
862
863#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
864	compiler->mode32 = 0;
865#endif
866
867	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
868		return SLJIT_SUCCESS; /* Empty instruction. */
869
870	if (src & SLJIT_IMM) {
871		if (FAST_IS_REG(dst)) {
872#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
873			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
874#else
875			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
876			FAIL_IF(!inst);
877			*inst = MOV_rm_i32;
878			return SLJIT_SUCCESS;
879#endif
880		}
881		inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
882		FAIL_IF(!inst);
883		*inst = MOV_rm8_i8;
884		return SLJIT_SUCCESS;
885	}
886
887	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
888
889	if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
890#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
891		if (reg_map[src] >= 4) {
892			SLJIT_ASSERT(dst_r == TMP_REG1);
893			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
894		} else
895			dst_r = src;
896#else
897		dst_r = src;
898#endif
899	}
900#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
901	else if (FAST_IS_REG(src) && reg_map[src] >= 4) {
902		/* src, dst are registers. */
903		SLJIT_ASSERT(SLOW_IS_REG(dst));
904		if (reg_map[dst] < 4) {
905			if (dst != src)
906				EMIT_MOV(compiler, dst, 0, src, 0);
907			inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
908			FAIL_IF(!inst);
909			*inst++ = GROUP_0F;
910			*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
911		}
912		else {
913			if (dst != src)
914				EMIT_MOV(compiler, dst, 0, src, 0);
915			if (sign) {
916				/* shl reg, 24 */
917				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
918				FAIL_IF(!inst);
919				*inst |= SHL;
920				/* sar reg, 24 */
921				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
922				FAIL_IF(!inst);
923				*inst |= SAR;
924			}
925			else {
926				inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
927				FAIL_IF(!inst);
928				*(inst + 1) |= AND;
929			}
930		}
931		return SLJIT_SUCCESS;
932	}
933#endif
934	else {
935		/* src can be memory addr or reg_map[src] < 4 on x86_32 architectures. */
936		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
937		FAIL_IF(!inst);
938		*inst++ = GROUP_0F;
939		*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
940	}
941
942	if (dst & SLJIT_MEM) {
943#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
944		if (dst_r == TMP_REG1) {
945			/* Find a non-used register, whose reg_map[src] < 4. */
946			if ((dst & REG_MASK) == SLJIT_R0) {
947				if ((dst & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_R1))
948					work_r = SLJIT_R2;
949				else
950					work_r = SLJIT_R1;
951			}
952			else {
953				if ((dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
954					work_r = SLJIT_R0;
955				else if ((dst & REG_MASK) == SLJIT_R1)
956					work_r = SLJIT_R2;
957				else
958					work_r = SLJIT_R1;
959			}
960
961			if (work_r == SLJIT_R0) {
962				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
963			}
964			else {
965				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
966				FAIL_IF(!inst);
967				*inst = XCHG_r_rm;
968			}
969
970			inst = emit_x86_instruction(compiler, 1, work_r, 0, dst, dstw);
971			FAIL_IF(!inst);
972			*inst = MOV_rm8_r8;
973
974			if (work_r == SLJIT_R0) {
975				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
976			}
977			else {
978				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
979				FAIL_IF(!inst);
980				*inst = XCHG_r_rm;
981			}
982		}
983		else {
984			inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
985			FAIL_IF(!inst);
986			*inst = MOV_rm8_r8;
987		}
988#else
989		inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
990		FAIL_IF(!inst);
991		*inst = MOV_rm8_r8;
992#endif
993	}
994
995	return SLJIT_SUCCESS;
996}
997
998static sljit_s32 emit_mov_half(struct sljit_compiler *compiler, sljit_s32 sign,
999	sljit_s32 dst, sljit_sw dstw,
1000	sljit_s32 src, sljit_sw srcw)
1001{
1002	sljit_u8* inst;
1003	sljit_s32 dst_r;
1004
1005#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1006	compiler->mode32 = 0;
1007#endif
1008
1009	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
1010		return SLJIT_SUCCESS; /* Empty instruction. */
1011
1012	if (src & SLJIT_IMM) {
1013		if (FAST_IS_REG(dst)) {
1014#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1015			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
1016#else
1017			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
1018			FAIL_IF(!inst);
1019			*inst = MOV_rm_i32;
1020			return SLJIT_SUCCESS;
1021#endif
1022		}
1023		inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
1024		FAIL_IF(!inst);
1025		*inst = MOV_rm_i32;
1026		return SLJIT_SUCCESS;
1027	}
1028
1029	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1030
1031	if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
1032		dst_r = src;
1033	else {
1034		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
1035		FAIL_IF(!inst);
1036		*inst++ = GROUP_0F;
1037		*inst = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
1038	}
1039
1040	if (dst & SLJIT_MEM) {
1041		inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
1042		FAIL_IF(!inst);
1043		*inst = MOV_rm_r;
1044	}
1045
1046	return SLJIT_SUCCESS;
1047}
1048
1049static sljit_s32 emit_unary(struct sljit_compiler *compiler, sljit_u8 opcode,
1050	sljit_s32 dst, sljit_sw dstw,
1051	sljit_s32 src, sljit_sw srcw)
1052{
1053	sljit_u8* inst;
1054
1055	if (dst == SLJIT_UNUSED) {
1056		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1057		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1058		FAIL_IF(!inst);
1059		*inst++ = GROUP_F7;
1060		*inst |= opcode;
1061		return SLJIT_SUCCESS;
1062	}
1063	if (dst == src && dstw == srcw) {
1064		/* Same input and output */
1065		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1066		FAIL_IF(!inst);
1067		*inst++ = GROUP_F7;
1068		*inst |= opcode;
1069		return SLJIT_SUCCESS;
1070	}
1071	if (FAST_IS_REG(dst)) {
1072		EMIT_MOV(compiler, dst, 0, src, srcw);
1073		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1074		FAIL_IF(!inst);
1075		*inst++ = GROUP_F7;
1076		*inst |= opcode;
1077		return SLJIT_SUCCESS;
1078	}
1079	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1080	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1081	FAIL_IF(!inst);
1082	*inst++ = GROUP_F7;
1083	*inst |= opcode;
1084	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1085	return SLJIT_SUCCESS;
1086}
1087
1088static sljit_s32 emit_not_with_flags(struct sljit_compiler *compiler,
1089	sljit_s32 dst, sljit_sw dstw,
1090	sljit_s32 src, sljit_sw srcw)
1091{
1092	sljit_u8* inst;
1093
1094	if (dst == SLJIT_UNUSED) {
1095		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1096		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1097		FAIL_IF(!inst);
1098		*inst++ = GROUP_F7;
1099		*inst |= NOT_rm;
1100		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
1101		FAIL_IF(!inst);
1102		*inst = OR_r_rm;
1103		return SLJIT_SUCCESS;
1104	}
1105	if (FAST_IS_REG(dst)) {
1106		EMIT_MOV(compiler, dst, 0, src, srcw);
1107		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1108		FAIL_IF(!inst);
1109		*inst++ = GROUP_F7;
1110		*inst |= NOT_rm;
1111		inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
1112		FAIL_IF(!inst);
1113		*inst = OR_r_rm;
1114		return SLJIT_SUCCESS;
1115	}
1116	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1117	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1118	FAIL_IF(!inst);
1119	*inst++ = GROUP_F7;
1120	*inst |= NOT_rm;
1121	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
1122	FAIL_IF(!inst);
1123	*inst = OR_r_rm;
1124	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1125	return SLJIT_SUCCESS;
1126}
1127
1128static sljit_s32 emit_clz(struct sljit_compiler *compiler, sljit_s32 op_flags,
1129	sljit_s32 dst, sljit_sw dstw,
1130	sljit_s32 src, sljit_sw srcw)
1131{
1132	sljit_u8* inst;
1133	sljit_s32 dst_r;
1134
1135	SLJIT_UNUSED_ARG(op_flags);
1136	if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
1137		/* Just set the zero flag. */
1138		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1139		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1140		FAIL_IF(!inst);
1141		*inst++ = GROUP_F7;
1142		*inst |= NOT_rm;
1143#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1144		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 31, TMP_REG1, 0);
1145#else
1146		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 63 : 31, TMP_REG1, 0);
1147#endif
1148		FAIL_IF(!inst);
1149		*inst |= SHR;
1150		return SLJIT_SUCCESS;
1151	}
1152
1153	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
1154		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
1155		src = TMP_REG1;
1156		srcw = 0;
1157	}
1158
1159	inst = emit_x86_instruction(compiler, 2, TMP_REG1, 0, src, srcw);
1160	FAIL_IF(!inst);
1161	*inst++ = GROUP_0F;
1162	*inst = BSR_r_rm;
1163
1164#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1165	if (FAST_IS_REG(dst))
1166		dst_r = dst;
1167	else {
1168		/* Find an unused temporary register. */
1169		if ((dst & REG_MASK) != SLJIT_R0 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
1170			dst_r = SLJIT_R0;
1171		else if ((dst & REG_MASK) != SLJIT_R1 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R1))
1172			dst_r = SLJIT_R1;
1173		else
1174			dst_r = SLJIT_R2;
1175		EMIT_MOV(compiler, dst, dstw, dst_r, 0);
1176	}
1177	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, 32 + 31);
1178#else
1179	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
1180	compiler->mode32 = 0;
1181	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 64 + 63 : 32 + 31);
1182	compiler->mode32 = op_flags & SLJIT_I32_OP;
1183#endif
1184
1185	if (cpu_has_cmov == -1)
1186		get_cpu_features();
1187
1188	if (cpu_has_cmov) {
1189		inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
1190		FAIL_IF(!inst);
1191		*inst++ = GROUP_0F;
1192		*inst = CMOVNE_r_rm;
1193	} else {
1194#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1195		inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1196		FAIL_IF(!inst);
1197		INC_SIZE(4);
1198
1199		*inst++ = JE_i8;
1200		*inst++ = 2;
1201		*inst++ = MOV_r_rm;
1202		*inst++ = MOD_REG | (reg_map[dst_r] << 3) | reg_map[TMP_REG1];
1203#else
1204		inst = (sljit_u8*)ensure_buf(compiler, 1 + 5);
1205		FAIL_IF(!inst);
1206		INC_SIZE(5);
1207
1208		*inst++ = JE_i8;
1209		*inst++ = 3;
1210		*inst++ = REX_W | (reg_map[dst_r] >= 8 ? REX_R : 0) | (reg_map[TMP_REG1] >= 8 ? REX_B : 0);
1211		*inst++ = MOV_r_rm;
1212		*inst++ = MOD_REG | (reg_lmap[dst_r] << 3) | reg_lmap[TMP_REG1];
1213#endif
1214	}
1215
1216#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1217	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
1218#else
1219	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 63 : 31, dst_r, 0);
1220#endif
1221	FAIL_IF(!inst);
1222	*(inst + 1) |= XOR;
1223
1224#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1225	if (dst & SLJIT_MEM) {
1226		inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
1227		FAIL_IF(!inst);
1228		*inst = XCHG_r_rm;
1229	}
1230#else
1231	if (dst & SLJIT_MEM)
1232		EMIT_MOV(compiler, dst, dstw, TMP_REG2, 0);
1233#endif
1234	return SLJIT_SUCCESS;
1235}
1236
1237SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
1238	sljit_s32 dst, sljit_sw dstw,
1239	sljit_s32 src, sljit_sw srcw)
1240{
1241	sljit_s32 update = 0;
1242	sljit_s32 op_flags = GET_ALL_FLAGS(op);
1243#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1244	sljit_s32 dst_is_ereg = 0;
1245	sljit_s32 src_is_ereg = 0;
1246#else
1247#	define src_is_ereg 0
1248#endif
1249
1250	CHECK_ERROR();
1251	CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
1252	ADJUST_LOCAL_OFFSET(dst, dstw);
1253	ADJUST_LOCAL_OFFSET(src, srcw);
1254
1255	CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
1256	CHECK_EXTRA_REGS(src, srcw, src_is_ereg = 1);
1257#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1258	compiler->mode32 = op_flags & SLJIT_I32_OP;
1259#endif
1260
1261	op = GET_OPCODE(op);
1262	if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
1263#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1264		compiler->mode32 = 0;
1265#endif
1266
1267		if (op_flags & SLJIT_I32_OP) {
1268			if (FAST_IS_REG(src) && src == dst) {
1269				if (!TYPE_CAST_NEEDED(op))
1270					return SLJIT_SUCCESS;
1271			}
1272#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1273			if (op == SLJIT_MOV_S32 && (src & SLJIT_MEM))
1274				op = SLJIT_MOV_U32;
1275			if (op == SLJIT_MOVU_S32 && (src & SLJIT_MEM))
1276				op = SLJIT_MOVU_U32;
1277			if (op == SLJIT_MOV_U32 && (src & SLJIT_IMM))
1278				op = SLJIT_MOV_S32;
1279			if (op == SLJIT_MOVU_U32 && (src & SLJIT_IMM))
1280				op = SLJIT_MOVU_S32;
1281#endif
1282		}
1283
1284		SLJIT_COMPILE_ASSERT(SLJIT_MOV + 8 == SLJIT_MOVU, movu_offset);
1285		if (op >= SLJIT_MOVU) {
1286			update = 1;
1287			op -= 8;
1288		}
1289
1290		if (src & SLJIT_IMM) {
1291			switch (op) {
1292			case SLJIT_MOV_U8:
1293				srcw = (sljit_u8)srcw;
1294				break;
1295			case SLJIT_MOV_S8:
1296				srcw = (sljit_s8)srcw;
1297				break;
1298			case SLJIT_MOV_U16:
1299				srcw = (sljit_u16)srcw;
1300				break;
1301			case SLJIT_MOV_S16:
1302				srcw = (sljit_s16)srcw;
1303				break;
1304#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1305			case SLJIT_MOV_U32:
1306				srcw = (sljit_u32)srcw;
1307				break;
1308			case SLJIT_MOV_S32:
1309				srcw = (sljit_s32)srcw;
1310				break;
1311#endif
1312			}
1313#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1314			if (SLJIT_UNLIKELY(dst_is_ereg))
1315				return emit_mov(compiler, dst, dstw, src, srcw);
1316#endif
1317		}
1318
1319#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1320		if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_U32 || op == SLJIT_MOV_S32 || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
1321			SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_SP));
1322			dst = TMP_REG1;
1323		}
1324#endif
1325
1326		switch (op) {
1327		case SLJIT_MOV:
1328		case SLJIT_MOV_P:
1329#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1330		case SLJIT_MOV_U32:
1331		case SLJIT_MOV_S32:
1332#endif
1333			FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
1334			break;
1335		case SLJIT_MOV_U8:
1336			FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
1337			break;
1338		case SLJIT_MOV_S8:
1339			FAIL_IF(emit_mov_byte(compiler, 1, dst, dstw, src, srcw));
1340			break;
1341		case SLJIT_MOV_U16:
1342			FAIL_IF(emit_mov_half(compiler, 0, dst, dstw, src, srcw));
1343			break;
1344		case SLJIT_MOV_S16:
1345			FAIL_IF(emit_mov_half(compiler, 1, dst, dstw, src, srcw));
1346			break;
1347#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1348		case SLJIT_MOV_U32:
1349			FAIL_IF(emit_mov_int(compiler, 0, dst, dstw, src, srcw));
1350			break;
1351		case SLJIT_MOV_S32:
1352			FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
1353			break;
1354#endif
1355		}
1356
1357#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1358		if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
1359			return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
1360#endif
1361
1362		if (SLJIT_UNLIKELY(update) && (src & SLJIT_MEM) && !src_is_ereg && (src & REG_MASK)) {
1363			if ((src & OFFS_REG_MASK) != 0) {
1364				FAIL_IF(emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
1365						(src & REG_MASK), 0, (src & REG_MASK), 0, OFFS_REG(dst), 0));
1366			}
1367			else if (srcw != 0) {
1368				FAIL_IF(emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
1369						(src & REG_MASK), 0, (src & REG_MASK), 0, SLJIT_IMM, srcw));
1370			}
1371		}
1372
1373		if (SLJIT_UNLIKELY(update) && (dst & SLJIT_MEM) && (dst & REG_MASK)) {
1374			if ((dst & OFFS_REG_MASK) != 0) {
1375				FAIL_IF(emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
1376						(dst & REG_MASK), 0, (dst & REG_MASK), 0, OFFS_REG(dst), 0));
1377			}
1378			else if (dstw != 0) {
1379				FAIL_IF(emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
1380						(dst & REG_MASK), 0, (dst & REG_MASK), 0, SLJIT_IMM, dstw));
1381			}
1382		}
1383		return SLJIT_SUCCESS;
1384	}
1385
1386	switch (op) {
1387	case SLJIT_NOT:
1388		if (SLJIT_UNLIKELY(op_flags & SLJIT_SET_Z))
1389			return emit_not_with_flags(compiler, dst, dstw, src, srcw);
1390		return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);
1391
1392	case SLJIT_NEG:
1393		return emit_unary(compiler, NEG_rm, dst, dstw, src, srcw);
1394
1395	case SLJIT_CLZ:
1396		return emit_clz(compiler, op_flags, dst, dstw, src, srcw);
1397	}
1398
1399	return SLJIT_SUCCESS;
1400
1401#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1402#	undef src_is_ereg
1403#endif
1404}
1405
1406#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1407
1408#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
1409	if (IS_HALFWORD(immw) || compiler->mode32) { \
1410		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
1411		FAIL_IF(!inst); \
1412		*(inst + 1) |= (op_imm); \
1413	} \
1414	else { \
1415		FAIL_IF(emit_load_imm64(compiler, TMP_REG2, immw)); \
1416		inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, arg, argw); \
1417		FAIL_IF(!inst); \
1418		*inst = (op_mr); \
1419	}
1420
1421#define BINARY_EAX_IMM(op_eax_imm, immw) \
1422	FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))
1423
1424#else
1425
1426#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
1427	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
1428	FAIL_IF(!inst); \
1429	*(inst + 1) |= (op_imm);
1430
1431#define BINARY_EAX_IMM(op_eax_imm, immw) \
1432	FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))
1433
1434#endif
1435
1436static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
1437	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
1438	sljit_s32 dst, sljit_sw dstw,
1439	sljit_s32 src1, sljit_sw src1w,
1440	sljit_s32 src2, sljit_sw src2w)
1441{
1442	sljit_u8* inst;
1443
1444	if (dst == SLJIT_UNUSED) {
1445		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1446		if (src2 & SLJIT_IMM) {
1447			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1448		}
1449		else {
1450			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1451			FAIL_IF(!inst);
1452			*inst = op_rm;
1453		}
1454		return SLJIT_SUCCESS;
1455	}
1456
1457	if (dst == src1 && dstw == src1w) {
1458		if (src2 & SLJIT_IMM) {
1459#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1460			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1461#else
1462			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1463#endif
1464				BINARY_EAX_IMM(op_eax_imm, src2w);
1465			}
1466			else {
1467				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1468			}
1469		}
1470		else if (FAST_IS_REG(dst)) {
1471			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
1472			FAIL_IF(!inst);
1473			*inst = op_rm;
1474		}
1475		else if (FAST_IS_REG(src2)) {
1476			/* Special exception for sljit_emit_op_flags. */
1477			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
1478			FAIL_IF(!inst);
1479			*inst = op_mr;
1480		}
1481		else {
1482			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
1483			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1484			FAIL_IF(!inst);
1485			*inst = op_mr;
1486		}
1487		return SLJIT_SUCCESS;
1488	}
1489
1490	/* Only for cumulative operations. */
1491	if (dst == src2 && dstw == src2w) {
1492		if (src1 & SLJIT_IMM) {
1493#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1494			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1495#else
1496			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128)) {
1497#endif
1498				BINARY_EAX_IMM(op_eax_imm, src1w);
1499			}
1500			else {
1501				BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
1502			}
1503		}
1504		else if (FAST_IS_REG(dst)) {
1505			inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
1506			FAIL_IF(!inst);
1507			*inst = op_rm;
1508		}
1509		else if (FAST_IS_REG(src1)) {
1510			inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
1511			FAIL_IF(!inst);
1512			*inst = op_mr;
1513		}
1514		else {
1515			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1516			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1517			FAIL_IF(!inst);
1518			*inst = op_mr;
1519		}
1520		return SLJIT_SUCCESS;
1521	}
1522
1523	/* General version. */
1524	if (FAST_IS_REG(dst)) {
1525		EMIT_MOV(compiler, dst, 0, src1, src1w);
1526		if (src2 & SLJIT_IMM) {
1527			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
1528		}
1529		else {
1530			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
1531			FAIL_IF(!inst);
1532			*inst = op_rm;
1533		}
1534	}
1535	else {
1536		/* This version requires less memory writing. */
1537		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1538		if (src2 & SLJIT_IMM) {
1539			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1540		}
1541		else {
1542			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1543			FAIL_IF(!inst);
1544			*inst = op_rm;
1545		}
1546		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1547	}
1548
1549	return SLJIT_SUCCESS;
1550}
1551
1552static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
1553	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
1554	sljit_s32 dst, sljit_sw dstw,
1555	sljit_s32 src1, sljit_sw src1w,
1556	sljit_s32 src2, sljit_sw src2w)
1557{
1558	sljit_u8* inst;
1559
1560	if (dst == SLJIT_UNUSED) {
1561		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1562		if (src2 & SLJIT_IMM) {
1563			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1564		}
1565		else {
1566			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1567			FAIL_IF(!inst);
1568			*inst = op_rm;
1569		}
1570		return SLJIT_SUCCESS;
1571	}
1572
1573	if (dst == src1 && dstw == src1w) {
1574		if (src2 & SLJIT_IMM) {
1575#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1576			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1577#else
1578			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1579#endif
1580				BINARY_EAX_IMM(op_eax_imm, src2w);
1581			}
1582			else {
1583				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1584			}
1585		}
1586		else if (FAST_IS_REG(dst)) {
1587			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
1588			FAIL_IF(!inst);
1589			*inst = op_rm;
1590		}
1591		else if (FAST_IS_REG(src2)) {
1592			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
1593			FAIL_IF(!inst);
1594			*inst = op_mr;
1595		}
1596		else {
1597			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
1598			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1599			FAIL_IF(!inst);
1600			*inst = op_mr;
1601		}
1602		return SLJIT_SUCCESS;
1603	}
1604
1605	/* General version. */
1606	if (FAST_IS_REG(dst) && dst != src2) {
1607		EMIT_MOV(compiler, dst, 0, src1, src1w);
1608		if (src2 & SLJIT_IMM) {
1609			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
1610		}
1611		else {
1612			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
1613			FAIL_IF(!inst);
1614			*inst = op_rm;
1615		}
1616	}
1617	else {
1618		/* This version requires less memory writing. */
1619		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1620		if (src2 & SLJIT_IMM) {
1621			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1622		}
1623		else {
1624			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1625			FAIL_IF(!inst);
1626			*inst = op_rm;
1627		}
1628		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1629	}
1630
1631	return SLJIT_SUCCESS;
1632}
1633
1634static sljit_s32 emit_mul(struct sljit_compiler *compiler,
1635	sljit_s32 dst, sljit_sw dstw,
1636	sljit_s32 src1, sljit_sw src1w,
1637	sljit_s32 src2, sljit_sw src2w)
1638{
1639	sljit_u8* inst;
1640	sljit_s32 dst_r;
1641
1642	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1643
1644	/* Register destination. */
1645	if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
1646		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1647		FAIL_IF(!inst);
1648		*inst++ = GROUP_0F;
1649		*inst = IMUL_r_rm;
1650	}
1651	else if (dst_r == src2 && !(src1 & SLJIT_IMM)) {
1652		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
1653		FAIL_IF(!inst);
1654		*inst++ = GROUP_0F;
1655		*inst = IMUL_r_rm;
1656	}
1657	else if (src1 & SLJIT_IMM) {
1658		if (src2 & SLJIT_IMM) {
1659			EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
1660			src2 = dst_r;
1661			src2w = 0;
1662		}
1663
1664		if (src1w <= 127 && src1w >= -128) {
1665			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1666			FAIL_IF(!inst);
1667			*inst = IMUL_r_rm_i8;
1668			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1669			FAIL_IF(!inst);
1670			INC_SIZE(1);
1671			*inst = (sljit_s8)src1w;
1672		}
1673#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1674		else {
1675			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1676			FAIL_IF(!inst);
1677			*inst = IMUL_r_rm_i32;
1678			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1679			FAIL_IF(!inst);
1680			INC_SIZE(4);
1681			sljit_unaligned_store_sw(inst, src1w);
1682		}
1683#else
1684		else if (IS_HALFWORD(src1w)) {
1685			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1686			FAIL_IF(!inst);
1687			*inst = IMUL_r_rm_i32;
1688			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1689			FAIL_IF(!inst);
1690			INC_SIZE(4);
1691			sljit_unaligned_store_s32(inst, (sljit_s32)src1w);
1692		}
1693		else {
1694			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
1695			if (dst_r != src2)
1696				EMIT_MOV(compiler, dst_r, 0, src2, src2w);
1697			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1698			FAIL_IF(!inst);
1699			*inst++ = GROUP_0F;
1700			*inst = IMUL_r_rm;
1701		}
1702#endif
1703	}
1704	else if (src2 & SLJIT_IMM) {
1705		/* Note: src1 is NOT immediate. */
1706
1707		if (src2w <= 127 && src2w >= -128) {
1708			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1709			FAIL_IF(!inst);
1710			*inst = IMUL_r_rm_i8;
1711			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1712			FAIL_IF(!inst);
1713			INC_SIZE(1);
1714			*inst = (sljit_s8)src2w;
1715		}
1716#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1717		else {
1718			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1719			FAIL_IF(!inst);
1720			*inst = IMUL_r_rm_i32;
1721			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1722			FAIL_IF(!inst);
1723			INC_SIZE(4);
1724			sljit_unaligned_store_sw(inst, src2w);
1725		}
1726#else
1727		else if (IS_HALFWORD(src2w)) {
1728			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1729			FAIL_IF(!inst);
1730			*inst = IMUL_r_rm_i32;
1731			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1732			FAIL_IF(!inst);
1733			INC_SIZE(4);
1734			sljit_unaligned_store_s32(inst, (sljit_s32)src2w);
1735		}
1736		else {
1737			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src2w);
1738			if (dst_r != src1)
1739				EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1740			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1741			FAIL_IF(!inst);
1742			*inst++ = GROUP_0F;
1743			*inst = IMUL_r_rm;
1744		}
1745#endif
1746	}
1747	else {
1748		/* Neither argument is immediate. */
1749		if (ADDRESSING_DEPENDS_ON(src2, dst_r))
1750			dst_r = TMP_REG1;
1751		EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1752		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1753		FAIL_IF(!inst);
1754		*inst++ = GROUP_0F;
1755		*inst = IMUL_r_rm;
1756	}
1757
1758	if (dst_r == TMP_REG1)
1759		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1760
1761	return SLJIT_SUCCESS;
1762}
1763
1764static sljit_s32 emit_lea_binary(struct sljit_compiler *compiler,
1765	sljit_s32 dst, sljit_sw dstw,
1766	sljit_s32 src1, sljit_sw src1w,
1767	sljit_s32 src2, sljit_sw src2w)
1768{
1769	sljit_u8* inst;
1770	sljit_s32 dst_r, done = 0;
1771
1772	/* These cases better be left to handled by normal way. */
1773	if (dst == src1 && dstw == src1w)
1774		return SLJIT_ERR_UNSUPPORTED;
1775	if (dst == src2 && dstw == src2w)
1776		return SLJIT_ERR_UNSUPPORTED;
1777
1778	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1779
1780	if (FAST_IS_REG(src1)) {
1781		if (FAST_IS_REG(src2)) {
1782			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
1783			FAIL_IF(!inst);
1784			*inst = LEA_r_m;
1785			done = 1;
1786		}
1787#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1788		if ((src2 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1789			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_s32)src2w);
1790#else
1791		if (src2 & SLJIT_IMM) {
1792			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
1793#endif
1794			FAIL_IF(!inst);
1795			*inst = LEA_r_m;
1796			done = 1;
1797		}
1798	}
1799	else if (FAST_IS_REG(src2)) {
1800#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1801		if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1802			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_s32)src1w);
1803#else
1804		if (src1 & SLJIT_IMM) {
1805			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
1806#endif
1807			FAIL_IF(!inst);
1808			*inst = LEA_r_m;
1809			done = 1;
1810		}
1811	}
1812
1813	if (done) {
1814		if (dst_r == TMP_REG1)
1815			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
1816		return SLJIT_SUCCESS;
1817	}
1818	return SLJIT_ERR_UNSUPPORTED;
1819}
1820
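/* Compare by subtraction without storing the result. When the first
   operand is SLJIT_R0 (eax/rax) and the immediate does not fit into a
   signed byte, the shorter accumulator form (CMP EAX, imm32) is
   preferred over the generic ModRM encodings. */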
1821static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler,
1822	sljit_s32 src1, sljit_sw src1w,
1823	sljit_s32 src2, sljit_sw src2w)
1824{
1825	sljit_u8* inst;
1826
1827#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1828	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1829#else
1830	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
1831#endif
1832		BINARY_EAX_IMM(CMP_EAX_i32, src2w);
1833		return SLJIT_SUCCESS;
1834	}
1835
1836	if (FAST_IS_REG(src1)) {
1837		if (src2 & SLJIT_IMM) {
1838			BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
1839		}
1840		else {
1841			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
1842			FAIL_IF(!inst);
1843			*inst = CMP_r_rm;
1844		}
1845		return SLJIT_SUCCESS;
1846	}
1847
1848	if (FAST_IS_REG(src2) && !(src1 & SLJIT_IMM)) {
1849		inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
1850		FAIL_IF(!inst);
1851		*inst = CMP_rm_r;
1852		return SLJIT_SUCCESS;
1853	}
1854
1855	if (src2 & SLJIT_IMM) {
1856		if (src1 & SLJIT_IMM) {
1857			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1858			src1 = TMP_REG1;
1859			src1w = 0;
1860		}
1861		BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
1862	}
1863	else {
1864		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1865		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1866		FAIL_IF(!inst);
1867		*inst = CMP_r_rm;
1868	}
1869	return SLJIT_SUCCESS;
1870}
1871
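/* TEST performs a bitwise AND that only updates the flags and discards
   the result, so SLJIT_AND with an unused destination is lowered to it. */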
1872static sljit_s32 emit_test_binary(struct sljit_compiler *compiler,
1873	sljit_s32 src1, sljit_sw src1w,
1874	sljit_s32 src2, sljit_sw src2w)
1875{
1876	sljit_u8* inst;
1877
1878#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1879	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1880#else
1881	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
1882#endif
1883		BINARY_EAX_IMM(TEST_EAX_i32, src2w);
1884		return SLJIT_SUCCESS;
1885	}
1886
1887#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1888	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1889#else
1890	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128)) {
1891#endif
1892		BINARY_EAX_IMM(TEST_EAX_i32, src1w);
1893		return SLJIT_SUCCESS;
1894	}
1895
1896	if (!(src1 & SLJIT_IMM)) {
1897		if (src2 & SLJIT_IMM) {
1898#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1899			if (IS_HALFWORD(src2w) || compiler->mode32) {
1900				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
1901				FAIL_IF(!inst);
1902				*inst = GROUP_F7;
1903			}
1904			else {
1905				FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1906				inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, src1w);
1907				FAIL_IF(!inst);
1908				*inst = TEST_rm_r;
1909			}
1910#else
1911			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
1912			FAIL_IF(!inst);
1913			*inst = GROUP_F7;
1914#endif
1915			return SLJIT_SUCCESS;
1916		}
1917		else if (FAST_IS_REG(src1)) {
1918			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
1919			FAIL_IF(!inst);
1920			*inst = TEST_rm_r;
1921			return SLJIT_SUCCESS;
1922		}
1923	}
1924
1925	if (!(src2 & SLJIT_IMM)) {
1926		if (src1 & SLJIT_IMM) {
1927#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1928			if (IS_HALFWORD(src1w) || compiler->mode32) {
1929				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
1930				FAIL_IF(!inst);
1931				*inst = GROUP_F7;
1932			}
1933			else {
1934				FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
1935				inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, src2w);
1936				FAIL_IF(!inst);
1937				*inst = TEST_rm_r;
1938			}
1939#else
1940			inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, src2w);
1941			FAIL_IF(!inst);
1942			*inst = GROUP_F7;
1943#endif
1944			return SLJIT_SUCCESS;
1945		}
1946		else if (FAST_IS_REG(src2)) {
1947			inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
1948			FAIL_IF(!inst);
1949			*inst = TEST_rm_r;
1950			return SLJIT_SUCCESS;
1951		}
1952	}
1953
1954	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1955	if (src2 & SLJIT_IMM) {
1956#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1957		if (IS_HALFWORD(src2w) || compiler->mode32) {
1958			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
1959			FAIL_IF(!inst);
1960			*inst = GROUP_F7;
1961		}
1962		else {
1963			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1964			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
1965			FAIL_IF(!inst);
1966			*inst = TEST_rm_r;
1967		}
1968#else
1969		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
1970		FAIL_IF(!inst);
1971		*inst = GROUP_F7;
1972#endif
1973	}
1974	else {
1975		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1976		FAIL_IF(!inst);
1977		*inst = TEST_rm_r;
1978	}
1979	return SLJIT_SUCCESS;
1980}
1981
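/* Shifts with a variable count must keep the count in cl on x86, so
   SLJIT_PREF_SHIFT_REG (ecx/rcx) is moved into place when src2 is not
   already there and restored afterwards; immediate counts are encoded
   directly into the shift instruction. */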
1982static sljit_s32 emit_shift(struct sljit_compiler *compiler,
1983	sljit_u8 mode,
1984	sljit_s32 dst, sljit_sw dstw,
1985	sljit_s32 src1, sljit_sw src1w,
1986	sljit_s32 src2, sljit_sw src2w)
1987{
1988	sljit_u8* inst;
1989
1990	if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
1991		if (dst == src1 && dstw == src1w) {
1992			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
1993			FAIL_IF(!inst);
1994			*inst |= mode;
1995			return SLJIT_SUCCESS;
1996		}
1997		if (dst == SLJIT_UNUSED) {
1998			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1999			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2000			FAIL_IF(!inst);
2001			*inst |= mode;
2002			return SLJIT_SUCCESS;
2003		}
2004		if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
2005			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2006			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2007			FAIL_IF(!inst);
2008			*inst |= mode;
2009			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2010			return SLJIT_SUCCESS;
2011		}
2012		if (FAST_IS_REG(dst)) {
2013			EMIT_MOV(compiler, dst, 0, src1, src1w);
2014			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
2015			FAIL_IF(!inst);
2016			*inst |= mode;
2017			return SLJIT_SUCCESS;
2018		}
2019
2020		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2021		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2022		FAIL_IF(!inst);
2023		*inst |= mode;
2024		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2025		return SLJIT_SUCCESS;
2026	}
2027
2028	if (dst == SLJIT_PREF_SHIFT_REG) {
2029		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2030		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2031		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2032		FAIL_IF(!inst);
2033		*inst |= mode;
2034		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2035	}
2036	else if (FAST_IS_REG(dst) && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
2037		if (src1 != dst)
2038			EMIT_MOV(compiler, dst, 0, src1, src1w);
2039		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2040		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2041		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
2042		FAIL_IF(!inst);
2043		*inst |= mode;
2044		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2045	}
2046	else {
		/* This case is complex since ecx itself may be used for
		   addressing, and that case must be supported as well. */
2049		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2050#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2051		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
2052#else
2053		EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0);
2054#endif
2055		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2056		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2057		FAIL_IF(!inst);
2058		*inst |= mode;
2059#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2060		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
2061#else
2062		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), 0);
2063#endif
2064		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2065	}
2066
2067	return SLJIT_SUCCESS;
2068}
2069
2070static sljit_s32 emit_shift_with_flags(struct sljit_compiler *compiler,
2071	sljit_u8 mode, sljit_s32 set_flags,
2072	sljit_s32 dst, sljit_sw dstw,
2073	sljit_s32 src1, sljit_sw src1w,
2074	sljit_s32 src2, sljit_sw src2w)
2075{
2076	/* The CPU does not set flags if the shift count is 0. */
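	/* Therefore when flags are requested they are produced explicitly: a
	   constant zero count degenerates to a move (or to an "or dst, src, 0"
	   when flags are needed), and a variable count is paired with an
	   explicit compare against zero. */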
2077	if (src2 & SLJIT_IMM) {
2078#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2079		if ((src2w & 0x3f) != 0 || (compiler->mode32 && (src2w & 0x1f) != 0))
2080			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2081#else
2082		if ((src2w & 0x1f) != 0)
2083			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2084#endif
2085		if (!set_flags)
2086			return emit_mov(compiler, dst, dstw, src1, src1w);
2087		/* OR dst, src, 0 */
2088		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
2089			dst, dstw, src1, src1w, SLJIT_IMM, 0);
2090	}
2091
2092	if (!set_flags)
2093		return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2094
2095	if (!FAST_IS_REG(dst))
2096		FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));
2097
	FAIL_IF(emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w));
2099
2100	if (FAST_IS_REG(dst))
2101		return emit_cmp_binary(compiler, dst, dstw, SLJIT_IMM, 0);
2102	return SLJIT_SUCCESS;
2103}
2104
2105SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
2106	sljit_s32 dst, sljit_sw dstw,
2107	sljit_s32 src1, sljit_sw src1w,
2108	sljit_s32 src2, sljit_sw src2w)
2109{
2110	CHECK_ERROR();
2111	CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
2112	ADJUST_LOCAL_OFFSET(dst, dstw);
2113	ADJUST_LOCAL_OFFSET(src1, src1w);
2114	ADJUST_LOCAL_OFFSET(src2, src2w);
2115
2116	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2117	CHECK_EXTRA_REGS(src1, src1w, (void)0);
2118	CHECK_EXTRA_REGS(src2, src2w, (void)0);
2119#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2120	compiler->mode32 = op & SLJIT_I32_OP;
2121#endif
2122
2123	switch (GET_OPCODE(op)) {
2124	case SLJIT_ADD:
2125		if (!HAS_FLAGS(op)) {
2126			if (emit_lea_binary(compiler, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
2127				return compiler->error;
2128		}
2129		return emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
2130			dst, dstw, src1, src1w, src2, src2w);
2131	case SLJIT_ADDC:
2132		return emit_cum_binary(compiler, ADC_r_rm, ADC_rm_r, ADC, ADC_EAX_i32,
2133			dst, dstw, src1, src1w, src2, src2w);
2134	case SLJIT_SUB:
2135		if (!HAS_FLAGS(op)) {
2136			if ((src2 & SLJIT_IMM) && emit_lea_binary(compiler, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
2137				return compiler->error;
2138		}
2139
2140		if (dst == SLJIT_UNUSED)
2141			return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
2142		return emit_non_cum_binary(compiler, SUB_r_rm, SUB_rm_r, SUB, SUB_EAX_i32,
2143			dst, dstw, src1, src1w, src2, src2w);
2144	case SLJIT_SUBC:
2145		return emit_non_cum_binary(compiler, SBB_r_rm, SBB_rm_r, SBB, SBB_EAX_i32,
2146			dst, dstw, src1, src1w, src2, src2w);
2147	case SLJIT_MUL:
2148		return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
2149	case SLJIT_AND:
2150		if (dst == SLJIT_UNUSED)
2151			return emit_test_binary(compiler, src1, src1w, src2, src2w);
2152		return emit_cum_binary(compiler, AND_r_rm, AND_rm_r, AND, AND_EAX_i32,
2153			dst, dstw, src1, src1w, src2, src2w);
2154	case SLJIT_OR:
2155		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
2156			dst, dstw, src1, src1w, src2, src2w);
2157	case SLJIT_XOR:
2158		return emit_cum_binary(compiler, XOR_r_rm, XOR_rm_r, XOR, XOR_EAX_i32,
2159			dst, dstw, src1, src1w, src2, src2w);
2160	case SLJIT_SHL:
2161		return emit_shift_with_flags(compiler, SHL, HAS_FLAGS(op),
2162			dst, dstw, src1, src1w, src2, src2w);
2163	case SLJIT_LSHR:
2164		return emit_shift_with_flags(compiler, SHR, HAS_FLAGS(op),
2165			dst, dstw, src1, src1w, src2, src2w);
2166	case SLJIT_ASHR:
2167		return emit_shift_with_flags(compiler, SAR, HAS_FLAGS(op),
2168			dst, dstw, src1, src1w, src2, src2w);
2169	}
2170
2171	return SLJIT_SUCCESS;
2172}
2173
2174SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
2175{
2176	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
2177#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2178	if (reg >= SLJIT_R3 && reg <= SLJIT_R8)
2179		return -1;
2180#endif
2181	return reg_map[reg];
2182}
2183
2184SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
2185{
2186	CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
2187	return reg;
2188}
2189
2190SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
2191	void *instruction, sljit_s32 size)
2192{
2193	sljit_u8 *inst;
2194
2195	CHECK_ERROR();
2196	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
2197
2198	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
2199	FAIL_IF(!inst);
2200	INC_SIZE(size);
2201	SLJIT_MEMCPY(inst, instruction, size);
2202	return SLJIT_SUCCESS;
2203}
2204
2205/* --------------------------------------------------------------------- */
2206/*  Floating point operators                                             */
2207/* --------------------------------------------------------------------- */
2208
/* Up to 12 bytes of alignment padding + two 16 byte constants for each precision. */
2210static sljit_s32 sse2_data[3 + (4 + 4) * 2];
2211static sljit_s32 *sse2_buffer;
2212
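/* The buffer is filled with the constants used by SLJIT_NEG_F64 and
   SLJIT_ABS_F64: one 16 byte slot with only the sign bit set and one
   with every bit of the low element set except the sign bit, for both
   single and double precision. XORPD with the former flips the sign,
   ANDPD with the latter clears it. */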
2213static void init_compiler(void)
2214{
2215	sse2_buffer = (sljit_s32*)(((sljit_uw)sse2_data + 15) & ~0xf);
2216	/* Single precision constants. */
2217	sse2_buffer[0] = 0x80000000;
2218	sse2_buffer[4] = 0x7fffffff;
2219	/* Double precision constants. */
2220	sse2_buffer[8] = 0;
2221	sse2_buffer[9] = 0x80000000;
2222	sse2_buffer[12] = 0xffffffff;
2223	sse2_buffer[13] = 0x7fffffff;
2224}
2225
2226SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_is_fpu_available(void)
2227{
2228#ifdef SLJIT_IS_FPU_AVAILABLE
2229	return SLJIT_IS_FPU_AVAILABLE;
2230#elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
2231	if (cpu_has_sse2 == -1)
2232		get_cpu_features();
2233	return cpu_has_sse2;
2234#else /* SLJIT_DETECT_SSE2 */
2235	return 1;
2236#endif /* SLJIT_DETECT_SSE2 */
2237}
2238
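/* Scalar SSE2 arithmetic uses the F3 (single precision) or F2 (double
   precision) mandatory prefix, while emit_sse2_logic emits the optional
   66 prefix for the double precision/packed forms (ANDPD, XORPD,
   UCOMISD, ...) and omits it for the single precision variants. All of
   these are two byte opcodes behind the 0F escape byte. */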
2239static sljit_s32 emit_sse2(struct sljit_compiler *compiler, sljit_u8 opcode,
2240	sljit_s32 single, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
2241{
2242	sljit_u8 *inst;
2243
2244	inst = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2245	FAIL_IF(!inst);
2246	*inst++ = GROUP_0F;
2247	*inst = opcode;
2248	return SLJIT_SUCCESS;
2249}
2250
2251static sljit_s32 emit_sse2_logic(struct sljit_compiler *compiler, sljit_u8 opcode,
2252	sljit_s32 pref66, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
2253{
2254	sljit_u8 *inst;
2255
2256	inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2257	FAIL_IF(!inst);
2258	*inst++ = GROUP_0F;
2259	*inst = opcode;
2260	return SLJIT_SUCCESS;
2261}
2262
2263static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
2264	sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
2265{
2266	return emit_sse2(compiler, MOVSD_x_xm, single, dst, src, srcw);
2267}
2268
2269static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
2270	sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src)
2271{
2272	return emit_sse2(compiler, MOVSD_xm_x, single, src, dst, dstw);
2273}
2274
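/* Float to integer conversion uses the truncating CVTTSS2SI/CVTTSD2SI
   forms, so the result is always rounded toward zero, independently of
   the current MXCSR rounding mode. */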
2275static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
2276	sljit_s32 dst, sljit_sw dstw,
2277	sljit_s32 src, sljit_sw srcw)
2278{
2279	sljit_s32 dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
2280	sljit_u8 *inst;
2281
2282#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2283	if (GET_OPCODE(op) == SLJIT_CONV_SW_FROM_F64)
2284		compiler->mode32 = 0;
2285#endif
2286
2287	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP2, dst_r, 0, src, srcw);
2288	FAIL_IF(!inst);
2289	*inst++ = GROUP_0F;
2290	*inst = CVTTSD2SI_r_xm;
2291
2292	if (dst_r == TMP_REG1 && dst != SLJIT_UNUSED)
2293		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2294	return SLJIT_SUCCESS;
2295}
2296
2297static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_compiler *compiler, sljit_s32 op,
2298	sljit_s32 dst, sljit_sw dstw,
2299	sljit_s32 src, sljit_sw srcw)
2300{
2301	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2302	sljit_u8 *inst;
2303
2304#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2305	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_SW)
2306		compiler->mode32 = 0;
2307#endif
2308
2309	if (src & SLJIT_IMM) {
2310#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2311		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
2312			srcw = (sljit_s32)srcw;
2313#endif
2314		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
2315		src = TMP_REG1;
2316		srcw = 0;
2317	}
2318
2319	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, src, srcw);
2320	FAIL_IF(!inst);
2321	*inst++ = GROUP_0F;
2322	*inst = CVTSI2SD_x_rm;
2323
2324#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2325	compiler->mode32 = 1;
2326#endif
2327	if (dst_r == TMP_FREG)
2328		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2329	return SLJIT_SUCCESS;
2330}
2331
2332static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_s32 op,
2333	sljit_s32 src1, sljit_sw src1w,
2334	sljit_s32 src2, sljit_sw src2w)
2335{
2336	if (!FAST_IS_REG(src1)) {
2337		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2338		src1 = TMP_FREG;
2339	}
2340	return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_F32_OP), src1, src2, src2w);
2341}
2342
2343SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op,
2344	sljit_s32 dst, sljit_sw dstw,
2345	sljit_s32 src, sljit_sw srcw)
2346{
2347	sljit_s32 dst_r;
2348
2349#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2350	compiler->mode32 = 1;
2351#endif
2352
2353	CHECK_ERROR();
2354	SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);
2355
2356	if (GET_OPCODE(op) == SLJIT_MOV_F64) {
2357		if (FAST_IS_REG(dst))
2358			return emit_sse2_load(compiler, op & SLJIT_F32_OP, dst, src, srcw);
2359		if (FAST_IS_REG(src))
2360			return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, src);
2361		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src, srcw));
2362		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2363	}
2364
2365	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32) {
2366		dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2367		if (FAST_IS_REG(src)) {
			/* We overwrite the high bits of the source register. From the
			   SLJIT point of view this is not an issue.
			   Note: In SSE3, MOVDDUP and MOVSLDUP could also be used. */
2371			FAIL_IF(emit_sse2_logic(compiler, UNPCKLPD_x_xm, op & SLJIT_F32_OP, src, src, 0));
2372		}
2373		else {
2374			FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_F32_OP), TMP_FREG, src, srcw));
2375			src = TMP_FREG;
2376		}
2377
2378		FAIL_IF(emit_sse2_logic(compiler, CVTPD2PS_x_xm, op & SLJIT_F32_OP, dst_r, src, 0));
2379		if (dst_r == TMP_FREG)
2380			return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2381		return SLJIT_SUCCESS;
2382	}
2383
2384	if (SLOW_IS_REG(dst)) {
2385		dst_r = dst;
2386		if (dst != src)
2387			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
2388	}
2389	else {
2390		dst_r = TMP_FREG;
2391		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
2392	}
2393
2394	switch (GET_OPCODE(op)) {
2395	case SLJIT_NEG_F64:
2396		FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer : sse2_buffer + 8)));
2397		break;
2398
2399	case SLJIT_ABS_F64:
2400		FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer + 4 : sse2_buffer + 12)));
2401		break;
2402	}
2403
2404	if (dst_r == TMP_FREG)
2405		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2406	return SLJIT_SUCCESS;
2407}
2408
2409SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compiler, sljit_s32 op,
2410	sljit_s32 dst, sljit_sw dstw,
2411	sljit_s32 src1, sljit_sw src1w,
2412	sljit_s32 src2, sljit_sw src2w)
2413{
2414	sljit_s32 dst_r;
2415
2416	CHECK_ERROR();
2417	CHECK(check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
2418	ADJUST_LOCAL_OFFSET(dst, dstw);
2419	ADJUST_LOCAL_OFFSET(src1, src1w);
2420	ADJUST_LOCAL_OFFSET(src2, src2w);
2421
2422#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2423	compiler->mode32 = 1;
2424#endif
2425
2426	if (FAST_IS_REG(dst)) {
2427		dst_r = dst;
2428		if (dst == src1)
2429			; /* Do nothing here. */
2430		else if (dst == src2 && (op == SLJIT_ADD_F64 || op == SLJIT_MUL_F64)) {
2431			/* Swap arguments. */
2432			src2 = src1;
2433			src2w = src1w;
2434		}
2435		else if (dst != src2)
2436			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src1, src1w));
2437		else {
2438			dst_r = TMP_FREG;
2439			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2440		}
2441	}
2442	else {
2443		dst_r = TMP_FREG;
2444		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2445	}
2446
2447	switch (GET_OPCODE(op)) {
2448	case SLJIT_ADD_F64:
2449		FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2450		break;
2451
2452	case SLJIT_SUB_F64:
2453		FAIL_IF(emit_sse2(compiler, SUBSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2454		break;
2455
2456	case SLJIT_MUL_F64:
2457		FAIL_IF(emit_sse2(compiler, MULSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2458		break;
2459
2460	case SLJIT_DIV_F64:
2461		FAIL_IF(emit_sse2(compiler, DIVSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2462		break;
2463	}
2464
2465	if (dst_r == TMP_FREG)
2466		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2467	return SLJIT_SUCCESS;
2468}
2469
2470/* --------------------------------------------------------------------- */
2471/*  Conditional instructions                                             */
2472/* --------------------------------------------------------------------- */
2473
2474SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
2475{
2476	sljit_u8 *inst;
2477	struct sljit_label *label;
2478
2479	CHECK_ERROR_PTR();
2480	CHECK_PTR(check_sljit_emit_label(compiler));
2481
2482	if (compiler->last_label && compiler->last_label->size == compiler->size)
2483		return compiler->last_label;
2484
2485	label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
2486	PTR_FAIL_IF(!label);
2487	set_label(label, compiler);
2488
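	/* A zero length byte in the code buffer marks a pseudo instruction;
	   the next byte selects its kind (0: label, 1: const, >= 2: jump),
	   which is resolved later when the final code is generated. */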
2489	inst = (sljit_u8*)ensure_buf(compiler, 2);
2490	PTR_FAIL_IF(!inst);
2491
2492	*inst++ = 0;
2493	*inst++ = 0;
2494
2495	return label;
2496}
2497
2498SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
2499{
2500	sljit_u8 *inst;
2501	struct sljit_jump *jump;
2502
2503	CHECK_ERROR_PTR();
2504	CHECK_PTR(check_sljit_emit_jump(compiler, type));
2505
2506	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2507	PTR_FAIL_IF_NULL(jump);
2508	set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
2509	type &= 0xff;
2510
2511	if (type >= SLJIT_CALL1)
2512		PTR_FAIL_IF(call_with_args(compiler, type));
2513
2514	/* Worst case size. */
2515#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2516	compiler->size += (type >= SLJIT_JUMP) ? 5 : 6;
2517#else
2518	compiler->size += (type >= SLJIT_JUMP) ? (10 + 3) : (2 + 10 + 3);
2519#endif
2520
2521	inst = (sljit_u8*)ensure_buf(compiler, 2);
2522	PTR_FAIL_IF_NULL(inst);
2523
2524	*inst++ = 0;
2525	*inst++ = type + 2;
2526	return jump;
2527}
2528
2529SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
2530{
2531	sljit_u8 *inst;
2532	struct sljit_jump *jump;
2533
2534	CHECK_ERROR();
2535	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
2536	ADJUST_LOCAL_OFFSET(src, srcw);
2537
2538	CHECK_EXTRA_REGS(src, srcw, (void)0);
2539
2540	if (type >= SLJIT_CALL1) {
2541#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2542#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
2543		if (src == SLJIT_R2) {
2544			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
2545			src = TMP_REG1;
2546		}
2547		if (src == SLJIT_MEM1(SLJIT_SP) && type >= SLJIT_CALL3)
2548			srcw += sizeof(sljit_sw);
2549#endif
2550#endif
2551#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && defined(_WIN64)
2552		if (src == SLJIT_R2) {
2553			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
2554			src = TMP_REG1;
2555		}
2556#endif
2557		FAIL_IF(call_with_args(compiler, type));
2558	}
2559
2560	if (src == SLJIT_IMM) {
2561		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2562		FAIL_IF_NULL(jump);
2563		set_jump(jump, compiler, JUMP_ADDR);
2564		jump->u.target = srcw;
2565
2566		/* Worst case size. */
2567#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2568		compiler->size += 5;
2569#else
2570		compiler->size += 10 + 3;
2571#endif
2572
2573		inst = (sljit_u8*)ensure_buf(compiler, 2);
2574		FAIL_IF_NULL(inst);
2575
2576		*inst++ = 0;
2577		*inst++ = type + 2;
2578	}
2579	else {
2580#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2581		/* REX_W is not necessary (src is not immediate). */
2582		compiler->mode32 = 1;
2583#endif
2584		inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
2585		FAIL_IF(!inst);
2586		*inst++ = GROUP_FF;
2587		*inst |= (type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm;
2588	}
2589	return SLJIT_SUCCESS;
2590}
2591
2592SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
2593	sljit_s32 dst, sljit_sw dstw,
2594	sljit_s32 src, sljit_sw srcw,
2595	sljit_s32 type)
2596{
2597	sljit_u8 *inst;
2598	sljit_u8 cond_set = 0;
2599#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2600	sljit_s32 reg;
2601#endif
2602	/* ADJUST_LOCAL_OFFSET and CHECK_EXTRA_REGS might overwrite these values. */
2603	sljit_s32 dst_save = dst;
2604	sljit_sw dstw_save = dstw;
2605
2606	CHECK_ERROR();
2607	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
2608	SLJIT_UNUSED_ARG(srcw);
2609
2610	if (dst == SLJIT_UNUSED)
2611		return SLJIT_SUCCESS;
2612
2613	ADJUST_LOCAL_OFFSET(dst, dstw);
2614	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2615
2616	type &= 0xff;
2617	/* setcc = jcc + 0x10. */
2618	cond_set = get_jump_code(type) + 0x10;
2619
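	/* On x86-64 a REX prefix is always emitted in front of setcc: with a
	   REX prefix the byte register encodings 4-7 select spl/bpl/sil/dil
	   (and REX.B selects r8b-r15b) instead of ah/ch/dh/bh, so the low
	   byte of any register can be written. */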
2620#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2621	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src) {
2622		inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 3);
2623		FAIL_IF(!inst);
2624		INC_SIZE(4 + 3);
2625		/* Set low register to conditional flag. */
2626		*inst++ = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
2627		*inst++ = GROUP_0F;
2628		*inst++ = cond_set;
2629		*inst++ = MOD_REG | reg_lmap[TMP_REG1];
2630		*inst++ = REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B);
2631		*inst++ = OR_rm8_r8;
2632		*inst++ = MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst];
2633		return SLJIT_SUCCESS;
2634	}
2635
2636	reg = (op == SLJIT_MOV && FAST_IS_REG(dst)) ? dst : TMP_REG1;
2637
2638	inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 4);
2639	FAIL_IF(!inst);
2640	INC_SIZE(4 + 4);
2641	/* Set low register to conditional flag. */
2642	*inst++ = (reg_map[reg] <= 7) ? REX : REX_B;
2643	*inst++ = GROUP_0F;
2644	*inst++ = cond_set;
2645	*inst++ = MOD_REG | reg_lmap[reg];
2646	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
2647	/* The movzx instruction does not affect flags. */
2648	*inst++ = GROUP_0F;
2649	*inst++ = MOVZX_r_rm8;
2650	*inst = MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg];
2651
2652	if (reg != TMP_REG1)
2653		return SLJIT_SUCCESS;
2654
2655	if (GET_OPCODE(op) < SLJIT_ADD) {
2656		compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
2657		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2658	}
2659#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
2660		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
2661	compiler->skip_checks = 1;
2662#endif
2663	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
2664
2665#else
2666	/* The SLJIT_CONFIG_X86_32 code path starts here. */
2667	if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) {
2668		if (reg_map[dst] <= 4) {
2669			/* Low byte is accessible. */
2670			inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3);
2671			FAIL_IF(!inst);
2672			INC_SIZE(3 + 3);
2673			/* Set low byte to conditional flag. */
2674			*inst++ = GROUP_0F;
2675			*inst++ = cond_set;
2676			*inst++ = MOD_REG | reg_map[dst];
2677
2678			*inst++ = GROUP_0F;
2679			*inst++ = MOVZX_r_rm8;
2680			*inst = MOD_REG | (reg_map[dst] << 3) | reg_map[dst];
2681			return SLJIT_SUCCESS;
2682		}
2683
2684		/* Low byte is not accessible. */
2685		if (cpu_has_cmov == -1)
2686			get_cpu_features();
2687
2688		if (cpu_has_cmov) {
2689			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
			/* A "xor reg, reg" operation would overwrite the flags, so a mov is used instead. */
2691			EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);
2692
2693			inst = (sljit_u8*)ensure_buf(compiler, 1 + 3);
2694			FAIL_IF(!inst);
2695			INC_SIZE(3);
2696
2697			*inst++ = GROUP_0F;
2698			/* cmovcc = setcc - 0x50. */
2699			*inst++ = cond_set - 0x50;
2700			*inst++ = MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REG1];
2701			return SLJIT_SUCCESS;
2702		}
2703
2704		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
2705		FAIL_IF(!inst);
2706		INC_SIZE(1 + 3 + 3 + 1);
2707		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2708		/* Set al to conditional flag. */
2709		*inst++ = GROUP_0F;
2710		*inst++ = cond_set;
2711		*inst++ = MOD_REG | 0 /* eax */;
2712
2713		*inst++ = GROUP_0F;
2714		*inst++ = MOVZX_r_rm8;
2715		*inst++ = MOD_REG | (reg_map[dst] << 3) | 0 /* eax */;
2716		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2717		return SLJIT_SUCCESS;
2718	}
2719
2720	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src && reg_map[dst] <= 4) {
2721		SLJIT_ASSERT(reg_map[SLJIT_R0] == 0);
2722
2723		if (dst != SLJIT_R0) {
2724			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 2 + 1);
2725			FAIL_IF(!inst);
2726			INC_SIZE(1 + 3 + 2 + 1);
2727			/* Set low register to conditional flag. */
2728			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2729			*inst++ = GROUP_0F;
2730			*inst++ = cond_set;
2731			*inst++ = MOD_REG | 0 /* eax */;
2732			*inst++ = OR_rm8_r8;
2733			*inst++ = MOD_REG | (0 /* eax */ << 3) | reg_map[dst];
2734			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2735		}
2736		else {
2737			inst = (sljit_u8*)ensure_buf(compiler, 1 + 2 + 3 + 2 + 2);
2738			FAIL_IF(!inst);
2739			INC_SIZE(2 + 3 + 2 + 2);
2740			/* Set low register to conditional flag. */
2741			*inst++ = XCHG_r_rm;
2742			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
2743			*inst++ = GROUP_0F;
2744			*inst++ = cond_set;
2745			*inst++ = MOD_REG | 1 /* ecx */;
2746			*inst++ = OR_rm8_r8;
2747			*inst++ = MOD_REG | (1 /* ecx */ << 3) | 0 /* eax */;
2748			*inst++ = XCHG_r_rm;
2749			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
2750		}
2751		return SLJIT_SUCCESS;
2752	}
2753
	/* Set TMP_REG1 to the condition bit (0 or 1). */
2755	inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
2756	FAIL_IF(!inst);
2757	INC_SIZE(1 + 3 + 3 + 1);
2758	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2759	/* Set al to conditional flag. */
2760	*inst++ = GROUP_0F;
2761	*inst++ = cond_set;
2762	*inst++ = MOD_REG | 0 /* eax */;
2763
2764	*inst++ = GROUP_0F;
2765	*inst++ = MOVZX_r_rm8;
2766	*inst++ = MOD_REG | (0 << 3) /* eax */ | 0 /* eax */;
2767
2768	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2769
2770	if (GET_OPCODE(op) < SLJIT_ADD)
2771		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2772
2773#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
2774		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
2775	compiler->skip_checks = 1;
2776#endif
2777	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
2778#endif /* SLJIT_CONFIG_X86_64 */
2779}
2780
2781SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
2782{
2783	CHECK_ERROR();
2784	CHECK(check_sljit_get_local_base(compiler, dst, dstw, offset));
2785	ADJUST_LOCAL_OFFSET(dst, dstw);
2786
2787	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2788
2789#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2790	compiler->mode32 = 0;
2791#endif
2792
2793	ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_SP), offset);
2794
2795#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2796	if (NOT_HALFWORD(offset)) {
2797		FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
2798#if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2799		SLJIT_ASSERT(emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
2800		return compiler->error;
2801#else
2802		return emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0);
2803#endif
2804	}
2805#endif
2806
2807	if (offset != 0)
2808		return emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, SLJIT_IMM, offset);
2809	return emit_mov(compiler, dst, dstw, SLJIT_SP, 0);
2810}
2811
2812SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
2813{
2814	sljit_u8 *inst;
2815	struct sljit_const *const_;
2816#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2817	sljit_s32 reg;
2818#endif
2819
2820	CHECK_ERROR_PTR();
2821	CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
2822	ADJUST_LOCAL_OFFSET(dst, dstw);
2823
2824	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2825
2826	const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
2827	PTR_FAIL_IF(!const_);
2828	set_const(const_, compiler);
2829
2830#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2831	compiler->mode32 = 0;
2832	reg = SLOW_IS_REG(dst) ? dst : TMP_REG1;
2833
2834	if (emit_load_imm64(compiler, reg, init_value))
2835		return NULL;
2836#else
2837	if (dst == SLJIT_UNUSED)
2838		dst = TMP_REG1;
2839
2840	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
2841		return NULL;
2842#endif
2843
2844	inst = (sljit_u8*)ensure_buf(compiler, 2);
2845	PTR_FAIL_IF(!inst);
2846
2847	*inst++ = 0;
2848	*inst++ = 1;
2849
2850#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2851	if (dst & SLJIT_MEM)
2852		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
2853			return NULL;
2854#endif
2855
2856	return const_;
2857}
2858
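/* Jump targets are patched differently on the two word sizes: 32 bit
   code rewrites the rel32 displacement of the jump/call instruction,
   while 64 bit code rewrites the absolute 64 bit immediate of the mov
   that loads the target address. */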
2859SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
2860{
2861	SLJIT_UNUSED_ARG(executable_offset);
2862#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2863	sljit_unaligned_store_sw((void*)addr, new_target - (addr + 4) - (sljit_uw)executable_offset);
2864#else
2865	sljit_unaligned_store_sw((void*)addr, (sljit_sw) new_target);
2866#endif
2867}
2868
2869SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant, sljit_sw executable_offset)
2870{
2871	SLJIT_UNUSED_ARG(executable_offset);
2872	sljit_unaligned_store_sw((void*)addr, new_constant);
2873}
2874
2875SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_x86_is_sse2_available(void)
2876{
2877#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
2878	if (cpu_has_sse2 == -1)
2879		get_cpu_features();
2880	return cpu_has_sse2;
2881#else
2882	return 1;
2883#endif
2884}
2885
2886SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_x86_is_cmov_available(void)
2887{
2888	if (cpu_has_cmov == -1)
2889		get_cpu_features();
2890	return cpu_has_cmov;
2891}
2892
2893SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_x86_emit_cmov(struct sljit_compiler *compiler,
2894	sljit_s32 type,
2895	sljit_s32 dst_reg,
2896	sljit_s32 src, sljit_sw srcw)
2897{
2898	sljit_u8* inst;
2899
2900	CHECK_ERROR();
2901#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
2902	CHECK_ARGUMENT(sljit_x86_is_cmov_available());
2903	CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_I32_OP)));
2904	CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL && (type & 0xff) <= SLJIT_ORDERED_F64);
2905	CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg & ~SLJIT_I32_OP));
2906	FUNCTION_CHECK_SRC(src, srcw);
2907
2908	if ((type & 0xff) <= SLJIT_NOT_ZERO)
2909		CHECK_ARGUMENT(compiler->last_flags & SLJIT_SET_Z);
2910	else
2911		CHECK_ARGUMENT((type & 0xff) == (compiler->last_flags & 0xff));
2912#endif
2913#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
2914	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
2915		fprintf(compiler->verbose, "  x86_cmov%s %s%s, ",
2916			!(dst_reg & SLJIT_I32_OP) ? "" : ".i",
2917			jump_names[type & 0xff], JUMP_POSTFIX(type));
2918		sljit_verbose_reg(compiler, dst_reg & ~SLJIT_I32_OP);
2919		fprintf(compiler->verbose, ", ");
2920		sljit_verbose_param(compiler, src, srcw);
2921		fprintf(compiler->verbose, "\n");
2922	}
2923#endif
2924
2925	ADJUST_LOCAL_OFFSET(src, srcw);
2926	CHECK_EXTRA_REGS(src, srcw, (void)0);
2927
2928#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2929	compiler->mode32 = dst_reg & SLJIT_I32_OP;
2930#endif
2931	dst_reg &= ~SLJIT_I32_OP;
2932
2933	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
2934		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
2935		src = TMP_REG1;
2936		srcw = 0;
2937	}
2938
2939	inst = emit_x86_instruction(compiler, 2, dst_reg, 0, src, srcw);
2940	FAIL_IF(!inst);
2941	*inst++ = GROUP_0F;
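	/* cmovcc = jcc - 0x40. */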
2942	*inst = get_jump_code(type & 0xff) - 0x40;
2943	return SLJIT_SUCCESS;
2944}
2945