1 /*	$NetBSD: sljitNativeX86_common.c,v 1.10 2021/11/30 12:32:09 christos Exp $	*/
2 
3 /*
4  *    Stack-less Just-In-Time compiler
5  *
6  *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without modification, are
9  * permitted provided that the following conditions are met:
10  *
11  *   1. Redistributions of source code must retain the above copyright notice, this list of
12  *      conditions and the following disclaimer.
13  *
14  *   2. Redistributions in binary form must reproduce the above copyright notice, this list
15  *      of conditions and the following disclaimer in the documentation and/or other materials
16  *      provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
19  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
21  * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
23  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
24  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
30 {
31 	return "x86" SLJIT_CPUINFO;
32 }
33 
34 /*
35    32b register indexes:
36      0 - EAX
37      1 - ECX
38      2 - EDX
39      3 - EBX
40      4 - none
41      5 - EBP
42      6 - ESI
43      7 - EDI
44 */
45 
46 /*
47    64b register indexes:
48      0 - RAX
49      1 - RCX
50      2 - RDX
51      3 - RBX
52      4 - none
53      5 - RBP
54      6 - RSI
55      7 - RDI
56      8 - R8   - From now on REX prefix is required
57      9 - R9
58     10 - R10
59     11 - R11
60     12 - R12
61     13 - R13
62     14 - R14
63     15 - R15
64 */
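/* Illustrative sketch (not part of the original source): how a register index
   from the table above is turned into an instruction encoding on x86-64.
   Indexes 8-15 need the matching REX bit (REX.B for the r/m field, REX.R for
   the reg field) and only their low 3 bits fit into the ModRM byte.  The
   helper name below is hypothetical. */
#if 0
static sljit_u8 example_rex_for_reg_operand(sljit_s32 reg_index, sljit_s32 is_64bit_op)
{
	sljit_u8 rex = 0;

	if (is_64bit_op)
		rex |= 0x48;	/* REX.W: 64 bit operand size. */
	if (reg_index >= 8)
		rex |= 0x44;	/* REX.R: extends the ModRM reg field. */
	/* The ModRM byte itself only stores reg_index & 0x7. */
	return rex;		/* 0 means no REX prefix is needed. */
}
#endif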
65 
66 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
67 
68 /* Last register + 1. */
69 #define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
70 
71 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
72 	0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 7, 6, 3, 4, 5
73 };
74 
75 #define CHECK_EXTRA_REGS(p, w, do) \
76 	if (p >= SLJIT_R3 && p <= SLJIT_S3) { \
77 		if (p <= compiler->scratches) \
78 			w = compiler->saveds_offset - ((p) - SLJIT_R2) * (sljit_sw)sizeof(sljit_sw); \
79 		else \
80 			w = compiler->locals_offset + ((p) - SLJIT_S2) * (sljit_sw)sizeof(sljit_sw); \
81 		p = SLJIT_MEM1(SLJIT_SP); \
82 		do; \
83 	}
84 
85 #else /* SLJIT_CONFIG_X86_32 */
86 
87 /* Last register + 1. */
88 #define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
89 #define TMP_REG2	(SLJIT_NUMBER_OF_REGISTERS + 3)
90 #define TMP_REG3	(SLJIT_NUMBER_OF_REGISTERS + 4)
91 
92 /* Note: r12 & 0x7 == 0b100, which is decoded as "SIB byte present";
93    avoid using r12 and r13 for memory addressing,
94    therefore r12 is better suited for SAVED_EREG than SAVED_REG. */
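/* Illustrative sketch (not part of the original source): why r12 and r13 are
   awkward base registers.  In the ModRM byte, rm == 0b100 means "SIB byte
   follows" and mod == 0b00 with rm == 0b101 means "disp32, no base".  Only
   the low 3 bits of a register index are encoded there, so r12 always costs
   an extra SIB byte and r13 always costs an explicit displacement.  The
   predicate below is hypothetical. */
#if 0
static sljit_s32 example_base_reg_needs_extra_byte(sljit_s32 reg_index)
{
	sljit_s32 low3 = reg_index & 0x7;

	return low3 == 0x4	/* rsp/r12: SIB byte required. */
		|| low3 == 0x5;	/* rbp/r13: disp8/disp32 required. */
}
#endif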
95 #ifndef _WIN64
96 /* 1st argument passed in rdi, 2nd argument passed in rsi, 3rd in rdx. */
97 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
98 	0, 0, 6, 1, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 7, 9
99 };
100 /* low-map. reg_map & 0x7. */
101 static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
102 	0, 0, 6, 1, 0, 3,  2,  4,  5,  5,  6,  7, 3, 4, 2, 7, 1
103 };
104 #else
105 /* 1st argument passed in rcx, 2nd argument passed in rdx, 3rd in r8. */
106 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
107 	0, 0, 2, 1, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 10, 8, 9
108 };
109 /* low-map. reg_map & 0x7. */
110 static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
111 	0, 0, 2, 1, 3,  4,  5,  5, 6,  7,  7, 6, 3, 4, 2,  0, 1
112 };
113 #endif
114 
115 #define REX_W		0x48
116 #define REX_R		0x44
117 #define REX_X		0x42
118 #define REX_B		0x41
119 #define REX		0x40
120 
121 #ifndef _WIN64
122 #define HALFWORD_MAX 0x7fffffffl
123 #define HALFWORD_MIN -0x80000000l
124 #else
125 #define HALFWORD_MAX 0x7fffffffll
126 #define HALFWORD_MIN -0x80000000ll
127 #endif
128 
129 #define IS_HALFWORD(x)		((x) <= HALFWORD_MAX && (x) >= HALFWORD_MIN)
130 #define NOT_HALFWORD(x)		((x) > HALFWORD_MAX || (x) < HALFWORD_MIN)
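/* Illustrative sketch (not part of the original source): IS_HALFWORD() checks
   whether a 64 bit constant survives the sign-extending 32 bit immediate
   forms of x86-64.  Anything outside this range has to be materialized with
   a full 64 bit load (emit_load_imm64() in sljitNativeX86_64.c). */
#if 0
static sljit_s32 example_fits_in_sign_extended_imm32(sljit_sw value)
{
	return value >= HALFWORD_MIN && value <= HALFWORD_MAX; /* == IS_HALFWORD(value) */
}
#endif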
131 
132 #define CHECK_EXTRA_REGS(p, w, do)
133 
134 #endif /* SLJIT_CONFIG_X86_32 */
135 
136 #define TMP_FREG	(0)
137 
138 /* Size flags for emit_x86_instruction: */
139 #define EX86_BIN_INS		0x0010
140 #define EX86_SHIFT_INS		0x0020
141 #define EX86_REX		0x0040
142 #define EX86_NO_REXW		0x0080
143 #define EX86_BYTE_ARG		0x0100
144 #define EX86_HALF_ARG		0x0200
145 #define EX86_PREF_66		0x0400
146 #define EX86_PREF_F2		0x0800
147 #define EX86_PREF_F3		0x1000
148 #define EX86_SSE2_OP1		0x2000
149 #define EX86_SSE2_OP2		0x4000
150 #define EX86_SSE2		(EX86_SSE2_OP1 | EX86_SSE2_OP2)
151 
152 /* --------------------------------------------------------------------- */
153 /*  Instruction forms                                                    */
154 /* --------------------------------------------------------------------- */
155 
156 #define ADD		(/* BINARY */ 0 << 3)
157 #define ADD_EAX_i32	0x05
158 #define ADD_r_rm	0x03
159 #define ADD_rm_r	0x01
160 #define ADDSD_x_xm	0x58
161 #define ADC		(/* BINARY */ 2 << 3)
162 #define ADC_EAX_i32	0x15
163 #define ADC_r_rm	0x13
164 #define ADC_rm_r	0x11
165 #define AND		(/* BINARY */ 4 << 3)
166 #define AND_EAX_i32	0x25
167 #define AND_r_rm	0x23
168 #define AND_rm_r	0x21
169 #define ANDPD_x_xm	0x54
170 #define BSR_r_rm	(/* GROUP_0F */ 0xbd)
171 #define CALL_i32	0xe8
172 #define CALL_rm		(/* GROUP_FF */ 2 << 3)
173 #define CDQ		0x99
174 #define CMOVNE_r_rm	(/* GROUP_0F */ 0x45)
175 #define CMP		(/* BINARY */ 7 << 3)
176 #define CMP_EAX_i32	0x3d
177 #define CMP_r_rm	0x3b
178 #define CMP_rm_r	0x39
179 #define CVTPD2PS_x_xm	0x5a
180 #define CVTSI2SD_x_rm	0x2a
181 #define CVTTSD2SI_r_xm	0x2c
182 #define DIV		(/* GROUP_F7 */ 6 << 3)
183 #define DIVSD_x_xm	0x5e
184 #define INT3		0xcc
185 #define IDIV		(/* GROUP_F7 */ 7 << 3)
186 #define IMUL		(/* GROUP_F7 */ 5 << 3)
187 #define IMUL_r_rm	(/* GROUP_0F */ 0xaf)
188 #define IMUL_r_rm_i8	0x6b
189 #define IMUL_r_rm_i32	0x69
190 #define JE_i8		0x74
191 #define JNE_i8		0x75
192 #define JMP_i8		0xeb
193 #define JMP_i32		0xe9
194 #define JMP_rm		(/* GROUP_FF */ 4 << 3)
195 #define LEA_r_m		0x8d
196 #define MOV_r_rm	0x8b
197 #define MOV_r_i32	0xb8
198 #define MOV_rm_r	0x89
199 #define MOV_rm_i32	0xc7
200 #define MOV_rm8_i8	0xc6
201 #define MOV_rm8_r8	0x88
202 #define MOVSD_x_xm	0x10
203 #define MOVSD_xm_x	0x11
204 #define MOVSXD_r_rm	0x63
205 #define MOVSX_r_rm8	(/* GROUP_0F */ 0xbe)
206 #define MOVSX_r_rm16	(/* GROUP_0F */ 0xbf)
207 #define MOVZX_r_rm8	(/* GROUP_0F */ 0xb6)
208 #define MOVZX_r_rm16	(/* GROUP_0F */ 0xb7)
209 #define MUL		(/* GROUP_F7 */ 4 << 3)
210 #define MULSD_x_xm	0x59
211 #define NEG_rm		(/* GROUP_F7 */ 3 << 3)
212 #define NOP		0x90
213 #define NOT_rm		(/* GROUP_F7 */ 2 << 3)
214 #define OR		(/* BINARY */ 1 << 3)
215 #define OR_r_rm		0x0b
216 #define OR_EAX_i32	0x0d
217 #define OR_rm_r		0x09
218 #define OR_rm8_r8	0x08
219 #define POP_r		0x58
220 #define POP_rm		0x8f
221 #define POPF		0x9d
222 #define PUSH_i32	0x68
223 #define PUSH_r		0x50
224 #define PUSH_rm		(/* GROUP_FF */ 6 << 3)
225 #define PUSHF		0x9c
226 #define RET_near	0xc3
227 #define RET_i16		0xc2
228 #define SBB		(/* BINARY */ 3 << 3)
229 #define SBB_EAX_i32	0x1d
230 #define SBB_r_rm	0x1b
231 #define SBB_rm_r	0x19
232 #define SAR		(/* SHIFT */ 7 << 3)
233 #define SHL		(/* SHIFT */ 4 << 3)
234 #define SHR		(/* SHIFT */ 5 << 3)
235 #define SUB		(/* BINARY */ 5 << 3)
236 #define SUB_EAX_i32	0x2d
237 #define SUB_r_rm	0x2b
238 #define SUB_rm_r	0x29
239 #define SUBSD_x_xm	0x5c
240 #define TEST_EAX_i32	0xa9
241 #define TEST_rm_r	0x85
242 #define UCOMISD_x_xm	0x2e
243 #define UNPCKLPD_x_xm	0x14
244 #define XCHG_EAX_r	0x90
245 #define XCHG_r_rm	0x87
246 #define XOR		(/* BINARY */ 6 << 3)
247 #define XOR_EAX_i32	0x35
248 #define XOR_r_rm	0x33
249 #define XOR_rm_r	0x31
250 #define XORPD_x_xm	0x57
251 
252 #define GROUP_0F	0x0f
253 #define GROUP_F7	0xf7
254 #define GROUP_FF	0xff
255 #define GROUP_BINARY_81	0x81
256 #define GROUP_BINARY_83	0x83
257 #define GROUP_SHIFT_1	0xd1
258 #define GROUP_SHIFT_N	0xc1
259 #define GROUP_SHIFT_CL	0xd3
260 
261 #define MOD_REG		0xc0
262 #define MOD_DISP8	0x40
263 
264 #define INC_SIZE(s)			(*inst++ = (s), compiler->size += (s))
265 
266 #define PUSH_REG(r)			(*inst++ = (PUSH_r + (r)))
267 #define POP_REG(r)			(*inst++ = (POP_r + (r)))
268 #define RET()				(*inst++ = (RET_near))
269 #define RET_I16(n)			(*inst++ = (RET_i16), *inst++ = n, *inst++ = 0)
270 /* r32, r/m32 */
271 #define MOV_RM(mod, reg, rm)		(*inst++ = (MOV_r_rm), *inst++ = (mod) << 6 | (reg) << 3 | (rm))
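/* Illustrative sketch (not part of the original source): MOV_RM() above
   hand-assembles a two byte "mov r32, r/m32".  Its second byte is a ModRM
   byte: two mod bits, three reg bits, three rm bits.  The helper below is
   hypothetical and only restates the packing; e.g. mod == 0x3 selects the
   register-direct form. */
#if 0
static sljit_u8 example_modrm(sljit_u8 mod, sljit_u8 reg, sljit_u8 rm)
{
	return (sljit_u8)((mod << 6) | (reg << 3) | rm);
}
#endif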
272 
273 /* Multithreading does not affect these static variables, since they store
274    built-in CPU features. It is harmless if several threads overwrite them
275    while detecting the CPU features at the same time. */
276 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
277 static sljit_s32 cpu_has_sse2 = -1;
278 #endif
279 static sljit_s32 cpu_has_cmov = -1;
280 
281 #ifdef _WIN32_WCE
282 #include <cmnintrin.h>
283 #elif defined(_MSC_VER) && _MSC_VER >= 1400
284 #include <intrin.h>
285 #endif
286 
287 /******************************************************/
288 /*    Unaligned-store functions                       */
289 /******************************************************/
290 
291 static SLJIT_INLINE void sljit_unaligned_store_s16(void *addr, sljit_s16 value)
292 {
293 	SLJIT_MEMCPY(addr, &value, sizeof(value));
294 }
295 
296 static SLJIT_INLINE void sljit_unaligned_store_s32(void *addr, sljit_s32 value)
297 {
298 	SLJIT_MEMCPY(addr, &value, sizeof(value));
299 }
300 
301 static SLJIT_INLINE void sljit_unaligned_store_sw(void *addr, sljit_sw value)
302 {
303 	SLJIT_MEMCPY(addr, &value, sizeof(value));
304 }
305 
306 /******************************************************/
307 /*    Utility functions                               */
308 /******************************************************/
309 
310 static void get_cpu_features(void)
311 {
312 	sljit_u32 features;
313 
314 #if defined(_MSC_VER) && _MSC_VER >= 1400
315 
316 	int CPUInfo[4];
317 	__cpuid(CPUInfo, 1);
318 	features = (sljit_u32)CPUInfo[3];
319 
320 #elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C) || defined(__lint__)
321 
322 	/* AT&T syntax. */
323 	__asm__ (
324 		"movl $0x1, %%eax\n"
325 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
326 		/* On x86-32, there is no red zone, so this
327 		   should work (no need for a local variable). */
328 		"push %%ebx\n"
329 #endif
330 		"cpuid\n"
331 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
332 		"pop %%ebx\n"
333 #endif
334 		"movl %%edx, %0\n"
335 		: "=g" (features)
336 		:
337 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
338 		: "%eax", "%ecx", "%edx"
339 #else
340 		: "%rax", "%rbx", "%rcx", "%rdx"
341 #endif
342 	);
343 
344 #else /* _MSC_VER && _MSC_VER >= 1400 */
345 
346 	/* Intel syntax. */
347 	__asm {
348 		mov eax, 1
349 		cpuid
350 		mov features, edx
351 	}
352 
353 #endif /* _MSC_VER && _MSC_VER >= 1400 */
354 
355 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
356 	cpu_has_sse2 = (features >> 26) & 0x1;
357 #endif
358 	cpu_has_cmov = (features >> 15) & 0x1;
359 }
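/* Illustrative sketch (not part of the original source): the feature flags
   above start at -1 and are filled in lazily.  Emitters in this file follow
   the pattern below before relying on CMOVcc (see emit_clz() for a real
   use); the function name here is hypothetical. */
#if 0
static void example_lazy_feature_check(void)
{
	if (cpu_has_cmov == -1)
		get_cpu_features();

	if (cpu_has_cmov) {
		/* Safe to emit a cmovne instruction. */
	} else {
		/* Fall back to a short branch around a plain mov. */
	}
}
#endif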
360 
361 static sljit_u8 get_jump_code(sljit_s32 type)
362 {
363 	switch (type) {
364 	case SLJIT_EQUAL:
365 	case SLJIT_EQUAL_F64:
366 		return 0x84 /* je */;
367 
368 	case SLJIT_NOT_EQUAL:
369 	case SLJIT_NOT_EQUAL_F64:
370 		return 0x85 /* jne */;
371 
372 	case SLJIT_LESS:
373 	case SLJIT_LESS_F64:
374 		return 0x82 /* jc */;
375 
376 	case SLJIT_GREATER_EQUAL:
377 	case SLJIT_GREATER_EQUAL_F64:
378 		return 0x83 /* jae */;
379 
380 	case SLJIT_GREATER:
381 	case SLJIT_GREATER_F64:
382 		return 0x87 /* jnbe */;
383 
384 	case SLJIT_LESS_EQUAL:
385 	case SLJIT_LESS_EQUAL_F64:
386 		return 0x86 /* jbe */;
387 
388 	case SLJIT_SIG_LESS:
389 		return 0x8c /* jl */;
390 
391 	case SLJIT_SIG_GREATER_EQUAL:
392 		return 0x8d /* jnl */;
393 
394 	case SLJIT_SIG_GREATER:
395 		return 0x8f /* jnle */;
396 
397 	case SLJIT_SIG_LESS_EQUAL:
398 		return 0x8e /* jle */;
399 
400 	case SLJIT_OVERFLOW:
401 	case SLJIT_MUL_OVERFLOW:
402 		return 0x80 /* jo */;
403 
404 	case SLJIT_NOT_OVERFLOW:
405 	case SLJIT_MUL_NOT_OVERFLOW:
406 		return 0x81 /* jno */;
407 
408 	case SLJIT_UNORDERED_F64:
409 		return 0x8a /* jp */;
410 
411 	case SLJIT_ORDERED_F64:
412 		return 0x8b /* jpo */;
413 	}
414 	return 0;
415 }
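/* Illustrative sketch (not part of the original source): get_jump_code()
   returns the second byte of the two byte "0F 8x" long conditional jump.
   The one byte short form is always exactly 0x10 lower (e.g. 0F 84 je rel32
   vs. 74 je rel8), which generate_near_jump_code() exploits below.  The
   helper name is hypothetical. */
#if 0
static sljit_u8 example_short_jump_opcode(sljit_s32 type)
{
	return (sljit_u8)(get_jump_code(type) - 0x10);
}
#endif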
416 
417 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
418 static sljit_u8* generate_far_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_s32 type, sljit_sw executable_offset);
419 #else
420 static sljit_u8* generate_far_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_s32 type);
421 #endif
422 
423 static sljit_u8* generate_near_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_u8 *code, sljit_s32 type, sljit_sw executable_offset)
424 {
425 	sljit_s32 short_jump;
426 	sljit_uw label_addr;
427 
428 	if (jump->flags & JUMP_LABEL)
429 		label_addr = (sljit_uw)(code + jump->u.label->size);
430 	else
431 		label_addr = jump->u.target - executable_offset;
432 
433 	short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;
434 
435 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
436 	if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
437 		return generate_far_jump_code(jump, code_ptr, type);
438 #endif
439 
440 	if (type == SLJIT_JUMP) {
441 		if (short_jump)
442 			*code_ptr++ = JMP_i8;
443 		else
444 			*code_ptr++ = JMP_i32;
445 		jump->addr++;
446 	}
447 	else if (type >= SLJIT_FAST_CALL) {
448 		short_jump = 0;
449 		*code_ptr++ = CALL_i32;
450 		jump->addr++;
451 	}
452 	else if (short_jump) {
453 		*code_ptr++ = get_jump_code(type) - 0x10;
454 		jump->addr++;
455 	}
456 	else {
457 		*code_ptr++ = GROUP_0F;
458 		*code_ptr++ = get_jump_code(type);
459 		jump->addr += 2;
460 	}
461 
462 	if (short_jump) {
463 		jump->flags |= PATCH_MB;
464 		code_ptr += sizeof(sljit_s8);
465 	} else {
466 		jump->flags |= PATCH_MW;
467 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
468 		code_ptr += sizeof(sljit_sw);
469 #else
470 		code_ptr += sizeof(sljit_s32);
471 #endif
472 	}
473 
474 	return code_ptr;
475 }
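/* Illustrative sketch (not part of the original source): the short/near
   decision above measures the displacement from the end of a two byte short
   jump (opcode + rel8), i.e. from jump->addr + 2.  A hypothetical helper
   restating that test: */
#if 0
static sljit_s32 example_fits_in_rel8(sljit_uw jump_addr, sljit_uw label_addr)
{
	sljit_sw diff = (sljit_sw)(label_addr - (jump_addr + 2));

	return diff >= -128 && diff <= 127;
}
#endif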
476 
477 SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
478 {
479 	struct sljit_memory_fragment *buf;
480 	sljit_u8 *code;
481 	sljit_u8 *code_ptr;
482 	sljit_u8 *buf_ptr;
483 	sljit_u8 *buf_end;
484 	sljit_u8 len;
485 	sljit_sw executable_offset;
486 	sljit_sw jump_addr;
487 
488 	struct sljit_label *label;
489 	struct sljit_jump *jump;
490 	struct sljit_const *const_;
491 
492 	CHECK_ERROR_PTR();
493 	CHECK_PTR(check_sljit_generate_code(compiler));
494 	reverse_buf(compiler);
495 
496 	/* Second code generation pass. */
497 	code = (sljit_u8*)SLJIT_MALLOC_EXEC(compiler->size);
498 	PTR_FAIL_WITH_EXEC_IF(code);
499 	buf = compiler->buf;
500 
501 	code_ptr = code;
502 	label = compiler->labels;
503 	jump = compiler->jumps;
504 	const_ = compiler->consts;
505 	executable_offset = SLJIT_EXEC_OFFSET(code);
506 
507 	do {
508 		buf_ptr = buf->memory;
509 		buf_end = buf_ptr + buf->used_size;
510 		do {
511 			len = *buf_ptr++;
512 			if (len > 0) {
513 				/* The code is already generated. */
514 				SLJIT_MEMCPY(code_ptr, buf_ptr, len);
515 				code_ptr += len;
516 				buf_ptr += len;
517 			}
518 			else {
519 				if (*buf_ptr >= 2) {
520 					jump->addr = (sljit_uw)code_ptr;
521 					if (!(jump->flags & SLJIT_REWRITABLE_JUMP))
522 						code_ptr = generate_near_jump_code(jump, code_ptr, code, *buf_ptr - 2, executable_offset);
523 					else {
524 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
525 						code_ptr = generate_far_jump_code(jump, code_ptr, *buf_ptr - 2, executable_offset);
526 #else
527 						code_ptr = generate_far_jump_code(jump, code_ptr, *buf_ptr - 2);
528 #endif
529 					}
530 					jump = jump->next;
531 				}
532 				else if (*buf_ptr == 0) {
533 					label->addr = ((sljit_uw)code_ptr) + executable_offset;
534 					label->size = code_ptr - code;
535 					label = label->next;
536 				}
537 				else { /* *buf_ptr is 1 */
538 					const_->addr = ((sljit_uw)code_ptr) - sizeof(sljit_sw);
539 					const_ = const_->next;
540 				}
541 				buf_ptr++;
542 			}
543 		} while (buf_ptr < buf_end);
544 		SLJIT_ASSERT(buf_ptr == buf_end);
545 		buf = buf->next;
546 	} while (buf);
547 
548 	SLJIT_ASSERT(!label);
549 	SLJIT_ASSERT(!jump);
550 	SLJIT_ASSERT(!const_);
551 
552 	jump = compiler->jumps;
553 	while (jump) {
554 		jump_addr = jump->addr + executable_offset;
555 
556 		if (jump->flags & PATCH_MB) {
557 			SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8))) >= -128 && (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8))) <= 127);
558 			*(sljit_u8*)jump->addr = (sljit_u8)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8)));
559 		} else if (jump->flags & PATCH_MW) {
560 			if (jump->flags & JUMP_LABEL) {
561 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
562 				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_sw))));
563 #else
564 				SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
565 				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))));
566 #endif
567 			}
568 			else {
569 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
570 				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_sw))));
571 #else
572 				SLJIT_ASSERT((sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
573 				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.target - (jump_addr + sizeof(sljit_s32))));
574 #endif
575 			}
576 		}
577 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
578 		else if (jump->flags & PATCH_MD)
579 			sljit_unaligned_store_sw((void*)jump->addr, jump->u.label->addr);
580 #endif
581 
582 		jump = jump->next;
583 	}
584 
585 	/* Some space may be wasted because of short jumps. */
586 	SLJIT_ASSERT(code_ptr <= code + compiler->size);
587 	compiler->error = SLJIT_ERR_COMPILED;
588 	compiler->executable_offset = executable_offset;
589 	compiler->executable_size = code_ptr - code;
590 	return (void*)(code + executable_offset);
591 }
592 
593 /* --------------------------------------------------------------------- */
594 /*  Operators                                                            */
595 /* --------------------------------------------------------------------- */
596 
597 static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
598 	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
599 	sljit_s32 dst, sljit_sw dstw,
600 	sljit_s32 src1, sljit_sw src1w,
601 	sljit_s32 src2, sljit_sw src2w);
602 
603 static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
604 	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
605 	sljit_s32 dst, sljit_sw dstw,
606 	sljit_s32 src1, sljit_sw src1w,
607 	sljit_s32 src2, sljit_sw src2w);
608 
609 static sljit_s32 emit_mov(struct sljit_compiler *compiler,
610 	sljit_s32 dst, sljit_sw dstw,
611 	sljit_s32 src, sljit_sw srcw);
612 
613 #define EMIT_MOV(compiler, dst, dstw, src, srcw) \
614 	FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
615 
616 #ifdef _WIN32
617 #include <malloc.h>
618 
619 static void SLJIT_CALL sljit_grow_stack(sljit_sw local_size)
620 {
621 	/* Workaround for calling the internal _chkstk() function on Windows.
622 	This function touches every 4k page belonging to the requested stack space,
623 	whose size is passed in local_size. This is necessary on Windows, where
624 	the stack can only grow in 4k steps. If the stack is already large enough,
625 	the function merely burns CPU cycles; but since that cannot be known in
626 	advance, it must always be called. I think this is a bad design in
627 	general, even if it has its reasons. */
628 	*(volatile sljit_s32*)alloca(local_size) = 0;
629 }
630 
631 #endif
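/* Illustrative sketch (not part of the original source): what the _chkstk
   style probing above amounts to conceptually -- touch one byte in every 4k
   page of the new stack area so the guard page is hit in order.  The real
   work is done by the alloca() call; the helper below is hypothetical. */
#if 0
static void example_probe_stack(volatile sljit_u8 *stack_top, sljit_sw local_size)
{
	sljit_sw offset;

	for (offset = 0; offset < local_size; offset += 4096)
		stack_top[-offset] = 0;	/* Touch one byte in each 4k page. */
}
#endif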
632 
633 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
634 #include "sljitNativeX86_32.c"
635 #else
636 #include "sljitNativeX86_64.c"
637 #endif
638 
639 static sljit_s32 emit_mov(struct sljit_compiler *compiler,
640 	sljit_s32 dst, sljit_sw dstw,
641 	sljit_s32 src, sljit_sw srcw)
642 {
643 	sljit_u8* inst;
644 
645 	if (dst == SLJIT_UNUSED) {
646 		/* No destination; no need to set up flags. */
647 		if (src & SLJIT_MEM) {
648 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
649 			FAIL_IF(!inst);
650 			*inst = MOV_r_rm;
651 		}
652 		return SLJIT_SUCCESS;
653 	}
654 	if (FAST_IS_REG(src)) {
655 		inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
656 		FAIL_IF(!inst);
657 		*inst = MOV_rm_r;
658 		return SLJIT_SUCCESS;
659 	}
660 	if (src & SLJIT_IMM) {
661 		if (FAST_IS_REG(dst)) {
662 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
663 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
664 #else
665 			if (!compiler->mode32) {
666 				if (NOT_HALFWORD(srcw))
667 					return emit_load_imm64(compiler, dst, srcw);
668 			}
669 			else
670 				return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, MOV_r_i32 + reg_lmap[dst], srcw);
671 #endif
672 		}
673 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
674 		if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
675 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, srcw));
676 			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, dst, dstw);
677 			FAIL_IF(!inst);
678 			*inst = MOV_rm_r;
679 			return SLJIT_SUCCESS;
680 		}
681 #endif
682 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
683 		FAIL_IF(!inst);
684 		*inst = MOV_rm_i32;
685 		return SLJIT_SUCCESS;
686 	}
687 	if (FAST_IS_REG(dst)) {
688 		inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
689 		FAIL_IF(!inst);
690 		*inst = MOV_r_rm;
691 		return SLJIT_SUCCESS;
692 	}
693 
694 	/* Memory to memory move. Requires two instructions. */
695 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
696 	FAIL_IF(!inst);
697 	*inst = MOV_r_rm;
698 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
699 	FAIL_IF(!inst);
700 	*inst = MOV_rm_r;
701 	return SLJIT_SUCCESS;
702 }
703 
704 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
705 {
706 	sljit_u8 *inst;
707 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
708 	sljit_s32 size;
709 #endif
710 
711 	CHECK_ERROR();
712 	CHECK(check_sljit_emit_op0(compiler, op));
713 
714 	switch (GET_OPCODE(op)) {
715 	case SLJIT_BREAKPOINT:
716 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
717 		FAIL_IF(!inst);
718 		INC_SIZE(1);
719 		*inst = INT3;
720 		break;
721 	case SLJIT_NOP:
722 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
723 		FAIL_IF(!inst);
724 		INC_SIZE(1);
725 		*inst = NOP;
726 		break;
727 	case SLJIT_LMUL_UW:
728 	case SLJIT_LMUL_SW:
729 	case SLJIT_DIVMOD_UW:
730 	case SLJIT_DIVMOD_SW:
731 	case SLJIT_DIV_UW:
732 	case SLJIT_DIV_SW:
733 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
734 #ifdef _WIN64
735 		SLJIT_ASSERT(
736 			reg_map[SLJIT_R0] == 0
737 			&& reg_map[SLJIT_R1] == 2
738 			&& reg_map[TMP_REG1] > 7);
739 #else
740 		SLJIT_ASSERT(
741 			reg_map[SLJIT_R0] == 0
742 			&& reg_map[SLJIT_R1] < 7
743 			&& reg_map[TMP_REG1] == 2);
744 #endif
745 		compiler->mode32 = op & SLJIT_I32_OP;
746 #endif
747 		SLJIT_COMPILE_ASSERT((SLJIT_DIVMOD_UW & 0x2) == 0 && SLJIT_DIV_UW - 0x2 == SLJIT_DIVMOD_UW, bad_div_opcode_assignments);
748 
749 		op = GET_OPCODE(op);
750 		if ((op | 0x2) == SLJIT_DIV_UW) {
751 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
752 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
753 			inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0);
754 #else
755 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
756 #endif
757 			FAIL_IF(!inst);
758 			*inst = XOR_r_rm;
759 		}
760 
761 		if ((op | 0x2) == SLJIT_DIV_SW) {
762 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
763 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
764 #endif
765 
766 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
767 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
768 			FAIL_IF(!inst);
769 			INC_SIZE(1);
770 			*inst = CDQ;
771 #else
772 			if (compiler->mode32) {
773 				inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
774 				FAIL_IF(!inst);
775 				INC_SIZE(1);
776 				*inst = CDQ;
777 			} else {
778 				inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
779 				FAIL_IF(!inst);
780 				INC_SIZE(2);
781 				*inst++ = REX_W;
782 				*inst = CDQ;
783 			}
784 #endif
785 		}
786 
787 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
788 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
789 		FAIL_IF(!inst);
790 		INC_SIZE(2);
791 		*inst++ = GROUP_F7;
792 		*inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
793 #else
794 #ifdef _WIN64
795 		size = (!compiler->mode32 || op >= SLJIT_DIVMOD_UW) ? 3 : 2;
796 #else
797 		size = (!compiler->mode32) ? 3 : 2;
798 #endif
799 		inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
800 		FAIL_IF(!inst);
801 		INC_SIZE(size);
802 #ifdef _WIN64
803 		if (!compiler->mode32)
804 			*inst++ = REX_W | ((op >= SLJIT_DIVMOD_UW) ? REX_B : 0);
805 		else if (op >= SLJIT_DIVMOD_UW)
806 			*inst++ = REX_B;
807 		*inst++ = GROUP_F7;
808 		*inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
809 #else
810 		if (!compiler->mode32)
811 			*inst++ = REX_W;
812 		*inst++ = GROUP_F7;
813 		*inst = MOD_REG | reg_map[SLJIT_R1];
814 #endif
815 #endif
816 		switch (op) {
817 		case SLJIT_LMUL_UW:
818 			*inst |= MUL;
819 			break;
820 		case SLJIT_LMUL_SW:
821 			*inst |= IMUL;
822 			break;
823 		case SLJIT_DIVMOD_UW:
824 		case SLJIT_DIV_UW:
825 			*inst |= DIV;
826 			break;
827 		case SLJIT_DIVMOD_SW:
828 		case SLJIT_DIV_SW:
829 			*inst |= IDIV;
830 			break;
831 		}
832 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
833 		if (op <= SLJIT_DIVMOD_SW)
834 			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
835 #else
836 		if (op >= SLJIT_DIV_UW)
837 			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
838 #endif
839 		break;
840 	}
841 
842 	return SLJIT_SUCCESS;
843 }
844 
845 #define ENCODE_PREFIX(prefix) \
846 	do { \
847 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1); \
848 		FAIL_IF(!inst); \
849 		INC_SIZE(1); \
850 		*inst = (prefix); \
851 	} while (0)
852 
853 static sljit_s32 emit_mov_byte(struct sljit_compiler *compiler, sljit_s32 sign,
854 	sljit_s32 dst, sljit_sw dstw,
855 	sljit_s32 src, sljit_sw srcw)
856 {
857 	sljit_u8* inst;
858 	sljit_s32 dst_r;
859 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
860 	sljit_s32 work_r;
861 #endif
862 
863 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
864 	compiler->mode32 = 0;
865 #endif
866 
867 	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
868 		return SLJIT_SUCCESS; /* Empty instruction. */
869 
870 	if (src & SLJIT_IMM) {
871 		if (FAST_IS_REG(dst)) {
872 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
873 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
874 #else
875 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
876 			FAIL_IF(!inst);
877 			*inst = MOV_rm_i32;
878 			return SLJIT_SUCCESS;
879 #endif
880 		}
881 		inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
882 		FAIL_IF(!inst);
883 		*inst = MOV_rm8_i8;
884 		return SLJIT_SUCCESS;
885 	}
886 
887 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
888 
889 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
890 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
891 		if (reg_map[src] >= 4) {
892 			SLJIT_ASSERT(dst_r == TMP_REG1);
893 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
894 		} else
895 			dst_r = src;
896 #else
897 		dst_r = src;
898 #endif
899 	}
900 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
901 	else if (FAST_IS_REG(src) && reg_map[src] >= 4) {
902 		/* src, dst are registers. */
903 		SLJIT_ASSERT(SLOW_IS_REG(dst));
904 		if (reg_map[dst] < 4) {
905 			if (dst != src)
906 				EMIT_MOV(compiler, dst, 0, src, 0);
907 			inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
908 			FAIL_IF(!inst);
909 			*inst++ = GROUP_0F;
910 			*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
911 		}
912 		else {
913 			if (dst != src)
914 				EMIT_MOV(compiler, dst, 0, src, 0);
915 			if (sign) {
916 				/* shl reg, 24 */
917 				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
918 				FAIL_IF(!inst);
919 				*inst |= SHL;
920 				/* sar reg, 24 */
921 				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
922 				FAIL_IF(!inst);
923 				*inst |= SAR;
924 			}
925 			else {
926 				inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
927 				FAIL_IF(!inst);
928 				*(inst + 1) |= AND;
929 			}
930 		}
931 		return SLJIT_SUCCESS;
932 	}
933 #endif
934 	else {
935 		/* src is either a memory address or a register with reg_map[src] < 4 on x86-32. */
936 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
937 		FAIL_IF(!inst);
938 		*inst++ = GROUP_0F;
939 		*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
940 	}
941 
942 	if (dst & SLJIT_MEM) {
943 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
944 		if (dst_r == TMP_REG1) {
945 			/* Find an unused register whose reg_map value is less than 4. */
946 			if ((dst & REG_MASK) == SLJIT_R0) {
947 				if ((dst & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_R1))
948 					work_r = SLJIT_R2;
949 				else
950 					work_r = SLJIT_R1;
951 			}
952 			else {
953 				if ((dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
954 					work_r = SLJIT_R0;
955 				else if ((dst & REG_MASK) == SLJIT_R1)
956 					work_r = SLJIT_R2;
957 				else
958 					work_r = SLJIT_R1;
959 			}
960 
961 			if (work_r == SLJIT_R0) {
962 				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
963 			}
964 			else {
965 				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
966 				FAIL_IF(!inst);
967 				*inst = XCHG_r_rm;
968 			}
969 
970 			inst = emit_x86_instruction(compiler, 1, work_r, 0, dst, dstw);
971 			FAIL_IF(!inst);
972 			*inst = MOV_rm8_r8;
973 
974 			if (work_r == SLJIT_R0) {
975 				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
976 			}
977 			else {
978 				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
979 				FAIL_IF(!inst);
980 				*inst = XCHG_r_rm;
981 			}
982 		}
983 		else {
984 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
985 			FAIL_IF(!inst);
986 			*inst = MOV_rm8_r8;
987 		}
988 #else
989 		inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
990 		FAIL_IF(!inst);
991 		*inst = MOV_rm8_r8;
992 #endif
993 	}
994 
995 	return SLJIT_SUCCESS;
996 }
997 
998 static sljit_s32 emit_mov_half(struct sljit_compiler *compiler, sljit_s32 sign,
999 	sljit_s32 dst, sljit_sw dstw,
1000 	sljit_s32 src, sljit_sw srcw)
1001 {
1002 	sljit_u8* inst;
1003 	sljit_s32 dst_r;
1004 
1005 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1006 	compiler->mode32 = 0;
1007 #endif
1008 
1009 	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
1010 		return SLJIT_SUCCESS; /* Empty instruction. */
1011 
1012 	if (src & SLJIT_IMM) {
1013 		if (FAST_IS_REG(dst)) {
1014 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1015 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
1016 #else
1017 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
1018 			FAIL_IF(!inst);
1019 			*inst = MOV_rm_i32;
1020 			return SLJIT_SUCCESS;
1021 #endif
1022 		}
1023 		inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
1024 		FAIL_IF(!inst);
1025 		*inst = MOV_rm_i32;
1026 		return SLJIT_SUCCESS;
1027 	}
1028 
1029 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1030 
1031 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
1032 		dst_r = src;
1033 	else {
1034 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
1035 		FAIL_IF(!inst);
1036 		*inst++ = GROUP_0F;
1037 		*inst = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
1038 	}
1039 
1040 	if (dst & SLJIT_MEM) {
1041 		inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
1042 		FAIL_IF(!inst);
1043 		*inst = MOV_rm_r;
1044 	}
1045 
1046 	return SLJIT_SUCCESS;
1047 }
1048 
1049 static sljit_s32 emit_unary(struct sljit_compiler *compiler, sljit_u8 opcode,
1050 	sljit_s32 dst, sljit_sw dstw,
1051 	sljit_s32 src, sljit_sw srcw)
1052 {
1053 	sljit_u8* inst;
1054 
1055 	if (dst == SLJIT_UNUSED) {
1056 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1057 		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1058 		FAIL_IF(!inst);
1059 		*inst++ = GROUP_F7;
1060 		*inst |= opcode;
1061 		return SLJIT_SUCCESS;
1062 	}
1063 	if (dst == src && dstw == srcw) {
1064 		/* Same input and output */
1065 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1066 		FAIL_IF(!inst);
1067 		*inst++ = GROUP_F7;
1068 		*inst |= opcode;
1069 		return SLJIT_SUCCESS;
1070 	}
1071 	if (FAST_IS_REG(dst)) {
1072 		EMIT_MOV(compiler, dst, 0, src, srcw);
1073 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1074 		FAIL_IF(!inst);
1075 		*inst++ = GROUP_F7;
1076 		*inst |= opcode;
1077 		return SLJIT_SUCCESS;
1078 	}
1079 	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1080 	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1081 	FAIL_IF(!inst);
1082 	*inst++ = GROUP_F7;
1083 	*inst |= opcode;
1084 	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1085 	return SLJIT_SUCCESS;
1086 }
1087 
1088 static sljit_s32 emit_not_with_flags(struct sljit_compiler *compiler,
1089 	sljit_s32 dst, sljit_sw dstw,
1090 	sljit_s32 src, sljit_sw srcw)
1091 {
1092 	sljit_u8* inst;
1093 
1094 	if (dst == SLJIT_UNUSED) {
1095 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1096 		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1097 		FAIL_IF(!inst);
1098 		*inst++ = GROUP_F7;
1099 		*inst |= NOT_rm;
1100 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
1101 		FAIL_IF(!inst);
1102 		*inst = OR_r_rm;
1103 		return SLJIT_SUCCESS;
1104 	}
1105 	if (FAST_IS_REG(dst)) {
1106 		EMIT_MOV(compiler, dst, 0, src, srcw);
1107 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1108 		FAIL_IF(!inst);
1109 		*inst++ = GROUP_F7;
1110 		*inst |= NOT_rm;
1111 		inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
1112 		FAIL_IF(!inst);
1113 		*inst = OR_r_rm;
1114 		return SLJIT_SUCCESS;
1115 	}
1116 	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1117 	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1118 	FAIL_IF(!inst);
1119 	*inst++ = GROUP_F7;
1120 	*inst |= NOT_rm;
1121 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
1122 	FAIL_IF(!inst);
1123 	*inst = OR_r_rm;
1124 	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1125 	return SLJIT_SUCCESS;
1126 }
1127 
1128 static sljit_s32 emit_clz(struct sljit_compiler *compiler, sljit_s32 op_flags,
1129 	sljit_s32 dst, sljit_sw dstw,
1130 	sljit_s32 src, sljit_sw srcw)
1131 {
1132 	sljit_u8* inst;
1133 	sljit_s32 dst_r;
1134 
1135 	SLJIT_UNUSED_ARG(op_flags);
1136 	if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
1137 		/* Just set the zero flag. */
1138 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1139 		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1140 		FAIL_IF(!inst);
1141 		*inst++ = GROUP_F7;
1142 		*inst |= NOT_rm;
1143 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1144 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 31, TMP_REG1, 0);
1145 #else
1146 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 63 : 31, TMP_REG1, 0);
1147 #endif
1148 		FAIL_IF(!inst);
1149 		*inst |= SHR;
1150 		return SLJIT_SUCCESS;
1151 	}
1152 
1153 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
1154 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
1155 		src = TMP_REG1;
1156 		srcw = 0;
1157 	}
1158 
1159 	inst = emit_x86_instruction(compiler, 2, TMP_REG1, 0, src, srcw);
1160 	FAIL_IF(!inst);
1161 	*inst++ = GROUP_0F;
1162 	*inst = BSR_r_rm;
1163 
1164 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1165 	if (FAST_IS_REG(dst))
1166 		dst_r = dst;
1167 	else {
1168 		/* Find an unused temporary register. */
1169 		if ((dst & REG_MASK) != SLJIT_R0 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
1170 			dst_r = SLJIT_R0;
1171 		else if ((dst & REG_MASK) != SLJIT_R1 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R1))
1172 			dst_r = SLJIT_R1;
1173 		else
1174 			dst_r = SLJIT_R2;
1175 		EMIT_MOV(compiler, dst, dstw, dst_r, 0);
1176 	}
1177 	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, 32 + 31);
1178 #else
1179 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
1180 	compiler->mode32 = 0;
1181 	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 64 + 63 : 32 + 31);
1182 	compiler->mode32 = op_flags & SLJIT_I32_OP;
1183 #endif
1184 
1185 	if (cpu_has_cmov == -1)
1186 		get_cpu_features();
1187 
1188 	if (cpu_has_cmov) {
1189 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
1190 		FAIL_IF(!inst);
1191 		*inst++ = GROUP_0F;
1192 		*inst = CMOVNE_r_rm;
1193 	} else {
1194 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1195 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1196 		FAIL_IF(!inst);
1197 		INC_SIZE(4);
1198 
1199 		*inst++ = JE_i8;
1200 		*inst++ = 2;
1201 		*inst++ = MOV_r_rm;
1202 		*inst++ = MOD_REG | (reg_map[dst_r] << 3) | reg_map[TMP_REG1];
1203 #else
1204 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 5);
1205 		FAIL_IF(!inst);
1206 		INC_SIZE(5);
1207 
1208 		*inst++ = JE_i8;
1209 		*inst++ = 3;
1210 		*inst++ = REX_W | (reg_map[dst_r] >= 8 ? REX_R : 0) | (reg_map[TMP_REG1] >= 8 ? REX_B : 0);
1211 		*inst++ = MOV_r_rm;
1212 		*inst++ = MOD_REG | (reg_lmap[dst_r] << 3) | reg_lmap[TMP_REG1];
1213 #endif
1214 	}
1215 
1216 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1217 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
1218 #else
1219 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 63 : 31, dst_r, 0);
1220 #endif
1221 	FAIL_IF(!inst);
1222 	*(inst + 1) |= XOR;
1223 
1224 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1225 	if (dst & SLJIT_MEM) {
1226 		inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
1227 		FAIL_IF(!inst);
1228 		*inst = XCHG_r_rm;
1229 	}
1230 #else
1231 	if (dst & SLJIT_MEM)
1232 		EMIT_MOV(compiler, dst, dstw, TMP_REG2, 0);
1233 #endif
1234 	return SLJIT_SUCCESS;
1235 }
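/* Illustrative sketch (not part of the original source): the sequence above
   derives count-leading-zeros from BSR.  For a nonzero 32 bit value,
   clz(x) == 31 - bsr(x), and since 31 is 0b11111, "31 - n" equals "n ^ 31"
   for n in 0..31, hence the final XOR.  The destination is preloaded with
   32 + 31 so that a zero input (BSR sets ZF, CMOVNE skips the move) ends up
   as (32 + 31) ^ 31 == 32.  A plain C restatement with a hypothetical name: */
#if 0
static sljit_s32 example_clz32(sljit_u32 x)
{
	sljit_s32 result = 32 + 31;	/* Used when BSR finds no set bit. */
	sljit_s32 i;

	for (i = 31; i >= 0; i--) {	/* Software stand-in for BSR. */
		if (x & ((sljit_u32)1 << i)) {
			result = i;
			break;
		}
	}

	return result ^ 31;	/* 32 for x == 0, otherwise 31 - bsr(x). */
}
#endif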
1236 
1237 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
1238 	sljit_s32 dst, sljit_sw dstw,
1239 	sljit_s32 src, sljit_sw srcw)
1240 {
1241 	sljit_s32 update = 0;
1242 	sljit_s32 op_flags = GET_ALL_FLAGS(op);
1243 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1244 	sljit_s32 dst_is_ereg = 0;
1245 	sljit_s32 src_is_ereg = 0;
1246 #else
1247 #	define src_is_ereg 0
1248 #endif
1249 
1250 	CHECK_ERROR();
1251 	CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
1252 	ADJUST_LOCAL_OFFSET(dst, dstw);
1253 	ADJUST_LOCAL_OFFSET(src, srcw);
1254 
1255 	CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
1256 	CHECK_EXTRA_REGS(src, srcw, src_is_ereg = 1);
1257 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1258 	compiler->mode32 = op_flags & SLJIT_I32_OP;
1259 #endif
1260 
1261 	op = GET_OPCODE(op);
1262 	if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
1263 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1264 		compiler->mode32 = 0;
1265 #endif
1266 
1267 		if (op_flags & SLJIT_I32_OP) {
1268 			if (FAST_IS_REG(src) && src == dst) {
1269 				if (!TYPE_CAST_NEEDED(op))
1270 					return SLJIT_SUCCESS;
1271 			}
1272 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1273 			if (op == SLJIT_MOV_S32 && (src & SLJIT_MEM))
1274 				op = SLJIT_MOV_U32;
1275 			if (op == SLJIT_MOVU_S32 && (src & SLJIT_MEM))
1276 				op = SLJIT_MOVU_U32;
1277 			if (op == SLJIT_MOV_U32 && (src & SLJIT_IMM))
1278 				op = SLJIT_MOV_S32;
1279 			if (op == SLJIT_MOVU_U32 && (src & SLJIT_IMM))
1280 				op = SLJIT_MOVU_S32;
1281 #endif
1282 		}
1283 
1284 		SLJIT_COMPILE_ASSERT(SLJIT_MOV + 8 == SLJIT_MOVU, movu_offset);
1285 		if (op >= SLJIT_MOVU) {
1286 			update = 1;
1287 			op -= 8;
1288 		}
1289 
1290 		if (src & SLJIT_IMM) {
1291 			switch (op) {
1292 			case SLJIT_MOV_U8:
1293 				srcw = (sljit_u8)srcw;
1294 				break;
1295 			case SLJIT_MOV_S8:
1296 				srcw = (sljit_s8)srcw;
1297 				break;
1298 			case SLJIT_MOV_U16:
1299 				srcw = (sljit_u16)srcw;
1300 				break;
1301 			case SLJIT_MOV_S16:
1302 				srcw = (sljit_s16)srcw;
1303 				break;
1304 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1305 			case SLJIT_MOV_U32:
1306 				srcw = (sljit_u32)srcw;
1307 				break;
1308 			case SLJIT_MOV_S32:
1309 				srcw = (sljit_s32)srcw;
1310 				break;
1311 #endif
1312 			}
1313 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1314 			if (SLJIT_UNLIKELY(dst_is_ereg))
1315 				return emit_mov(compiler, dst, dstw, src, srcw);
1316 #endif
1317 		}
1318 
1319 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1320 		if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_U32 || op == SLJIT_MOV_S32 || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
1321 			SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_SP));
1322 			dst = TMP_REG1;
1323 		}
1324 #endif
1325 
1326 		switch (op) {
1327 		case SLJIT_MOV:
1328 		case SLJIT_MOV_P:
1329 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1330 		case SLJIT_MOV_U32:
1331 		case SLJIT_MOV_S32:
1332 #endif
1333 			FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
1334 			break;
1335 		case SLJIT_MOV_U8:
1336 			FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
1337 			break;
1338 		case SLJIT_MOV_S8:
1339 			FAIL_IF(emit_mov_byte(compiler, 1, dst, dstw, src, srcw));
1340 			break;
1341 		case SLJIT_MOV_U16:
1342 			FAIL_IF(emit_mov_half(compiler, 0, dst, dstw, src, srcw));
1343 			break;
1344 		case SLJIT_MOV_S16:
1345 			FAIL_IF(emit_mov_half(compiler, 1, dst, dstw, src, srcw));
1346 			break;
1347 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1348 		case SLJIT_MOV_U32:
1349 			FAIL_IF(emit_mov_int(compiler, 0, dst, dstw, src, srcw));
1350 			break;
1351 		case SLJIT_MOV_S32:
1352 			FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
1353 			break;
1354 #endif
1355 		}
1356 
1357 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1358 		if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
1359 			return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
1360 #endif
1361 
1362 		if (SLJIT_UNLIKELY(update) && (src & SLJIT_MEM) && !src_is_ereg && (src & REG_MASK)) {
1363 			if ((src & OFFS_REG_MASK) != 0) {
1364 				FAIL_IF(emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
1365 						(src & REG_MASK), 0, (src & REG_MASK), 0, OFFS_REG(dst), 0));
1366 			}
1367 			else if (srcw != 0) {
1368 				FAIL_IF(emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
1369 						(src & REG_MASK), 0, (src & REG_MASK), 0, SLJIT_IMM, srcw));
1370 			}
1371 		}
1372 
1373 		if (SLJIT_UNLIKELY(update) && (dst & SLJIT_MEM) && (dst & REG_MASK)) {
1374 			if ((dst & OFFS_REG_MASK) != 0) {
1375 				FAIL_IF(emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
1376 						(dst & REG_MASK), 0, (dst & REG_MASK), 0, OFFS_REG(dst), 0));
1377 			}
1378 			else if (dstw != 0) {
1379 				FAIL_IF(emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
1380 						(dst & REG_MASK), 0, (dst & REG_MASK), 0, SLJIT_IMM, dstw));
1381 			}
1382 		}
1383 		return SLJIT_SUCCESS;
1384 	}
1385 
1386 	switch (op) {
1387 	case SLJIT_NOT:
1388 		if (SLJIT_UNLIKELY(op_flags & SLJIT_SET_Z))
1389 			return emit_not_with_flags(compiler, dst, dstw, src, srcw);
1390 		return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);
1391 
1392 	case SLJIT_NEG:
1393 		return emit_unary(compiler, NEG_rm, dst, dstw, src, srcw);
1394 
1395 	case SLJIT_CLZ:
1396 		return emit_clz(compiler, op_flags, dst, dstw, src, srcw);
1397 	}
1398 
1399 	return SLJIT_SUCCESS;
1400 
1401 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1402 #	undef src_is_ereg
1403 #endif
1404 }
1405 
1406 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1407 
1408 #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
1409 	if (IS_HALFWORD(immw) || compiler->mode32) { \
1410 		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
1411 		FAIL_IF(!inst); \
1412 		*(inst + 1) |= (op_imm); \
1413 	} \
1414 	else { \
1415 		FAIL_IF(emit_load_imm64(compiler, TMP_REG2, immw)); \
1416 		inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, arg, argw); \
1417 		FAIL_IF(!inst); \
1418 		*inst = (op_mr); \
1419 	}
1420 
1421 #define BINARY_EAX_IMM(op_eax_imm, immw) \
1422 	FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))
1423 
1424 #else
1425 
1426 #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
1427 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
1428 	FAIL_IF(!inst); \
1429 	*(inst + 1) |= (op_imm);
1430 
1431 #define BINARY_EAX_IMM(op_eax_imm, immw) \
1432 	FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))
1433 
1434 #endif
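/* Illustrative sketch (not part of the original source): on x86-64 the
   BINARY_IMM() macro above decides whether an immediate can use the
   sign-extended 32 bit encoding or must first be loaded into TMP_REG2 with
   a 64 bit move.  Restated as a hypothetical predicate: */
#if 0
static sljit_s32 example_imm_needs_temp_reg(struct sljit_compiler *compiler, sljit_sw immw)
{
	/* mode32 operations truncate to 32 bits anyway, so they never need it. */
	return !compiler->mode32 && NOT_HALFWORD(immw);
}
#endif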
1435 
1436 static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
1437 	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
1438 	sljit_s32 dst, sljit_sw dstw,
1439 	sljit_s32 src1, sljit_sw src1w,
1440 	sljit_s32 src2, sljit_sw src2w)
1441 {
1442 	sljit_u8* inst;
1443 
1444 	if (dst == SLJIT_UNUSED) {
1445 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1446 		if (src2 & SLJIT_IMM) {
1447 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1448 		}
1449 		else {
1450 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1451 			FAIL_IF(!inst);
1452 			*inst = op_rm;
1453 		}
1454 		return SLJIT_SUCCESS;
1455 	}
1456 
1457 	if (dst == src1 && dstw == src1w) {
1458 		if (src2 & SLJIT_IMM) {
1459 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1460 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1461 #else
1462 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1463 #endif
1464 				BINARY_EAX_IMM(op_eax_imm, src2w);
1465 			}
1466 			else {
1467 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1468 			}
1469 		}
1470 		else if (FAST_IS_REG(dst)) {
1471 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
1472 			FAIL_IF(!inst);
1473 			*inst = op_rm;
1474 		}
1475 		else if (FAST_IS_REG(src2)) {
1476 			/* Special exception for sljit_emit_op_flags. */
1477 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
1478 			FAIL_IF(!inst);
1479 			*inst = op_mr;
1480 		}
1481 		else {
1482 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
1483 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1484 			FAIL_IF(!inst);
1485 			*inst = op_mr;
1486 		}
1487 		return SLJIT_SUCCESS;
1488 	}
1489 
1490 	/* Only for cumulative operations. */
1491 	if (dst == src2 && dstw == src2w) {
1492 		if (src1 & SLJIT_IMM) {
1493 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1494 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1495 #else
1496 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128)) {
1497 #endif
1498 				BINARY_EAX_IMM(op_eax_imm, src1w);
1499 			}
1500 			else {
1501 				BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
1502 			}
1503 		}
1504 		else if (FAST_IS_REG(dst)) {
1505 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
1506 			FAIL_IF(!inst);
1507 			*inst = op_rm;
1508 		}
1509 		else if (FAST_IS_REG(src1)) {
1510 			inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
1511 			FAIL_IF(!inst);
1512 			*inst = op_mr;
1513 		}
1514 		else {
1515 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1516 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1517 			FAIL_IF(!inst);
1518 			*inst = op_mr;
1519 		}
1520 		return SLJIT_SUCCESS;
1521 	}
1522 
1523 	/* General version. */
1524 	if (FAST_IS_REG(dst)) {
1525 		EMIT_MOV(compiler, dst, 0, src1, src1w);
1526 		if (src2 & SLJIT_IMM) {
1527 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
1528 		}
1529 		else {
1530 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
1531 			FAIL_IF(!inst);
1532 			*inst = op_rm;
1533 		}
1534 	}
1535 	else {
1536 		/* This version requires fewer memory writes. */
1537 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1538 		if (src2 & SLJIT_IMM) {
1539 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1540 		}
1541 		else {
1542 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1543 			FAIL_IF(!inst);
1544 			*inst = op_rm;
1545 		}
1546 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1547 	}
1548 
1549 	return SLJIT_SUCCESS;
1550 }
1551 
1552 static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
1553 	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
1554 	sljit_s32 dst, sljit_sw dstw,
1555 	sljit_s32 src1, sljit_sw src1w,
1556 	sljit_s32 src2, sljit_sw src2w)
1557 {
1558 	sljit_u8* inst;
1559 
1560 	if (dst == SLJIT_UNUSED) {
1561 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1562 		if (src2 & SLJIT_IMM) {
1563 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1564 		}
1565 		else {
1566 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1567 			FAIL_IF(!inst);
1568 			*inst = op_rm;
1569 		}
1570 		return SLJIT_SUCCESS;
1571 	}
1572 
1573 	if (dst == src1 && dstw == src1w) {
1574 		if (src2 & SLJIT_IMM) {
1575 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1576 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1577 #else
1578 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1579 #endif
1580 				BINARY_EAX_IMM(op_eax_imm, src2w);
1581 			}
1582 			else {
1583 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1584 			}
1585 		}
1586 		else if (FAST_IS_REG(dst)) {
1587 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
1588 			FAIL_IF(!inst);
1589 			*inst = op_rm;
1590 		}
1591 		else if (FAST_IS_REG(src2)) {
1592 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
1593 			FAIL_IF(!inst);
1594 			*inst = op_mr;
1595 		}
1596 		else {
1597 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
1598 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1599 			FAIL_IF(!inst);
1600 			*inst = op_mr;
1601 		}
1602 		return SLJIT_SUCCESS;
1603 	}
1604 
1605 	/* General version. */
1606 	if (FAST_IS_REG(dst) && dst != src2) {
1607 		EMIT_MOV(compiler, dst, 0, src1, src1w);
1608 		if (src2 & SLJIT_IMM) {
1609 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
1610 		}
1611 		else {
1612 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
1613 			FAIL_IF(!inst);
1614 			*inst = op_rm;
1615 		}
1616 	}
1617 	else {
1618 		/* This version requires fewer memory writes. */
1619 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1620 		if (src2 & SLJIT_IMM) {
1621 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1622 		}
1623 		else {
1624 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1625 			FAIL_IF(!inst);
1626 			*inst = op_rm;
1627 		}
1628 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1629 	}
1630 
1631 	return SLJIT_SUCCESS;
1632 }
1633 
1634 static sljit_s32 emit_mul(struct sljit_compiler *compiler,
1635 	sljit_s32 dst, sljit_sw dstw,
1636 	sljit_s32 src1, sljit_sw src1w,
1637 	sljit_s32 src2, sljit_sw src2w)
1638 {
1639 	sljit_u8* inst;
1640 	sljit_s32 dst_r;
1641 
1642 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1643 
1644 	/* Register destination. */
1645 	if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
1646 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1647 		FAIL_IF(!inst);
1648 		*inst++ = GROUP_0F;
1649 		*inst = IMUL_r_rm;
1650 	}
1651 	else if (dst_r == src2 && !(src1 & SLJIT_IMM)) {
1652 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
1653 		FAIL_IF(!inst);
1654 		*inst++ = GROUP_0F;
1655 		*inst = IMUL_r_rm;
1656 	}
1657 	else if (src1 & SLJIT_IMM) {
1658 		if (src2 & SLJIT_IMM) {
1659 			EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
1660 			src2 = dst_r;
1661 			src2w = 0;
1662 		}
1663 
1664 		if (src1w <= 127 && src1w >= -128) {
1665 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1666 			FAIL_IF(!inst);
1667 			*inst = IMUL_r_rm_i8;
1668 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1669 			FAIL_IF(!inst);
1670 			INC_SIZE(1);
1671 			*inst = (sljit_s8)src1w;
1672 		}
1673 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1674 		else {
1675 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1676 			FAIL_IF(!inst);
1677 			*inst = IMUL_r_rm_i32;
1678 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1679 			FAIL_IF(!inst);
1680 			INC_SIZE(4);
1681 			sljit_unaligned_store_sw(inst, src1w);
1682 		}
1683 #else
1684 		else if (IS_HALFWORD(src1w)) {
1685 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1686 			FAIL_IF(!inst);
1687 			*inst = IMUL_r_rm_i32;
1688 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1689 			FAIL_IF(!inst);
1690 			INC_SIZE(4);
1691 			sljit_unaligned_store_s32(inst, (sljit_s32)src1w);
1692 		}
1693 		else {
1694 			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
1695 			if (dst_r != src2)
1696 				EMIT_MOV(compiler, dst_r, 0, src2, src2w);
1697 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1698 			FAIL_IF(!inst);
1699 			*inst++ = GROUP_0F;
1700 			*inst = IMUL_r_rm;
1701 		}
1702 #endif
1703 	}
1704 	else if (src2 & SLJIT_IMM) {
1705 		/* Note: src1 is NOT immediate. */
1706 
1707 		if (src2w <= 127 && src2w >= -128) {
1708 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1709 			FAIL_IF(!inst);
1710 			*inst = IMUL_r_rm_i8;
1711 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1712 			FAIL_IF(!inst);
1713 			INC_SIZE(1);
1714 			*inst = (sljit_s8)src2w;
1715 		}
1716 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1717 		else {
1718 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1719 			FAIL_IF(!inst);
1720 			*inst = IMUL_r_rm_i32;
1721 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1722 			FAIL_IF(!inst);
1723 			INC_SIZE(4);
1724 			sljit_unaligned_store_sw(inst, src2w);
1725 		}
1726 #else
1727 		else if (IS_HALFWORD(src2w)) {
1728 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1729 			FAIL_IF(!inst);
1730 			*inst = IMUL_r_rm_i32;
1731 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1732 			FAIL_IF(!inst);
1733 			INC_SIZE(4);
1734 			sljit_unaligned_store_s32(inst, (sljit_s32)src2w);
1735 		}
1736 		else {
1737 			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src2w);
1738 			if (dst_r != src1)
1739 				EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1740 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1741 			FAIL_IF(!inst);
1742 			*inst++ = GROUP_0F;
1743 			*inst = IMUL_r_rm;
1744 		}
1745 #endif
1746 	}
1747 	else {
1748 		/* Neither argument is immediate. */
1749 		if (ADDRESSING_DEPENDS_ON(src2, dst_r))
1750 			dst_r = TMP_REG1;
1751 		EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1752 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1753 		FAIL_IF(!inst);
1754 		*inst++ = GROUP_0F;
1755 		*inst = IMUL_r_rm;
1756 	}
1757 
1758 	if (dst_r == TMP_REG1)
1759 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1760 
1761 	return SLJIT_SUCCESS;
1762 }
1763 
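/* Tries to compute an addition with a single LEA, which does not affect the
   flags. Returns SLJIT_ERR_UNSUPPORTED when the normal ADD path should be used
   instead (the destination equals a source, or the operands do not fit a
   [reg+reg] or [reg+imm] addressing form). */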
1764 static sljit_s32 emit_lea_binary(struct sljit_compiler *compiler,
1765 	sljit_s32 dst, sljit_sw dstw,
1766 	sljit_s32 src1, sljit_sw src1w,
1767 	sljit_s32 src2, sljit_sw src2w)
1768 {
1769 	sljit_u8* inst;
1770 	sljit_s32 dst_r, done = 0;
1771 
1772 	/* These cases are better left to the normal code path. */
1773 	if (dst == src1 && dstw == src1w)
1774 		return SLJIT_ERR_UNSUPPORTED;
1775 	if (dst == src2 && dstw == src2w)
1776 		return SLJIT_ERR_UNSUPPORTED;
1777 
1778 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1779 
1780 	if (FAST_IS_REG(src1)) {
1781 		if (FAST_IS_REG(src2)) {
1782 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
1783 			FAIL_IF(!inst);
1784 			*inst = LEA_r_m;
1785 			done = 1;
1786 		}
1787 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1788 		if ((src2 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1789 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_s32)src2w);
1790 #else
1791 		if (src2 & SLJIT_IMM) {
1792 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
1793 #endif
1794 			FAIL_IF(!inst);
1795 			*inst = LEA_r_m;
1796 			done = 1;
1797 		}
1798 	}
1799 	else if (FAST_IS_REG(src2)) {
1800 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1801 		if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1802 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_s32)src1w);
1803 #else
1804 		if (src1 & SLJIT_IMM) {
1805 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
1806 #endif
1807 			FAIL_IF(!inst);
1808 			*inst = LEA_r_m;
1809 			done = 1;
1810 		}
1811 	}
1812 
1813 	if (done) {
1814 		if (dst_r == TMP_REG1)
1815 			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
1816 		return SLJIT_SUCCESS;
1817 	}
1818 	return SLJIT_ERR_UNSUPPORTED;
1819 }
1820 
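/* Emits CMP. Prefers the short accumulator form (CMP EAX, imm32) when the
   first operand is SLJIT_R0 and the immediate does not fit in a sign-extended
   byte; otherwise falls back to the generic register/memory/immediate forms. */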
1821 static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler,
1822 	sljit_s32 src1, sljit_sw src1w,
1823 	sljit_s32 src2, sljit_sw src2w)
1824 {
1825 	sljit_u8* inst;
1826 
1827 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1828 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1829 #else
1830 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
1831 #endif
1832 		BINARY_EAX_IMM(CMP_EAX_i32, src2w);
1833 		return SLJIT_SUCCESS;
1834 	}
1835 
1836 	if (FAST_IS_REG(src1)) {
1837 		if (src2 & SLJIT_IMM) {
1838 			BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
1839 		}
1840 		else {
1841 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
1842 			FAIL_IF(!inst);
1843 			*inst = CMP_r_rm;
1844 		}
1845 		return SLJIT_SUCCESS;
1846 	}
1847 
1848 	if (FAST_IS_REG(src2) && !(src1 & SLJIT_IMM)) {
1849 		inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
1850 		FAIL_IF(!inst);
1851 		*inst = CMP_rm_r;
1852 		return SLJIT_SUCCESS;
1853 	}
1854 
1855 	if (src2 & SLJIT_IMM) {
1856 		if (src1 & SLJIT_IMM) {
1857 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1858 			src1 = TMP_REG1;
1859 			src1w = 0;
1860 		}
1861 		BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
1862 	}
1863 	else {
1864 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1865 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1866 		FAIL_IF(!inst);
1867 		*inst = CMP_r_rm;
1868 	}
1869 	return SLJIT_SUCCESS;
1870 }
1871 
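/* Emits TEST. Since TEST is symmetric, the immediate may come from either
   operand; on 64-bit targets an immediate that is not a sign-extended 32-bit
   value is loaded into TMP_REG2 first. */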
1872 static sljit_s32 emit_test_binary(struct sljit_compiler *compiler,
1873 	sljit_s32 src1, sljit_sw src1w,
1874 	sljit_s32 src2, sljit_sw src2w)
1875 {
1876 	sljit_u8* inst;
1877 
1878 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1879 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1880 #else
1881 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
1882 #endif
1883 		BINARY_EAX_IMM(TEST_EAX_i32, src2w);
1884 		return SLJIT_SUCCESS;
1885 	}
1886 
1887 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1888 	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1889 #else
1890 	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128)) {
1891 #endif
1892 		BINARY_EAX_IMM(TEST_EAX_i32, src1w);
1893 		return SLJIT_SUCCESS;
1894 	}
1895 
1896 	if (!(src1 & SLJIT_IMM)) {
1897 		if (src2 & SLJIT_IMM) {
1898 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1899 			if (IS_HALFWORD(src2w) || compiler->mode32) {
1900 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
1901 				FAIL_IF(!inst);
1902 				*inst = GROUP_F7;
1903 			}
1904 			else {
1905 				FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1906 				inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, src1w);
1907 				FAIL_IF(!inst);
1908 				*inst = TEST_rm_r;
1909 			}
1910 #else
1911 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
1912 			FAIL_IF(!inst);
1913 			*inst = GROUP_F7;
1914 #endif
1915 			return SLJIT_SUCCESS;
1916 		}
1917 		else if (FAST_IS_REG(src1)) {
1918 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
1919 			FAIL_IF(!inst);
1920 			*inst = TEST_rm_r;
1921 			return SLJIT_SUCCESS;
1922 		}
1923 	}
1924 
1925 	if (!(src2 & SLJIT_IMM)) {
1926 		if (src1 & SLJIT_IMM) {
1927 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1928 			if (IS_HALFWORD(src1w) || compiler->mode32) {
1929 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
1930 				FAIL_IF(!inst);
1931 				*inst = GROUP_F7;
1932 			}
1933 			else {
1934 				FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
1935 				inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, src2w);
1936 				FAIL_IF(!inst);
1937 				*inst = TEST_rm_r;
1938 			}
1939 #else
1940 			inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, src2w);
1941 			FAIL_IF(!inst);
1942 			*inst = GROUP_F7;
1943 #endif
1944 			return SLJIT_SUCCESS;
1945 		}
1946 		else if (FAST_IS_REG(src2)) {
1947 			inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
1948 			FAIL_IF(!inst);
1949 			*inst = TEST_rm_r;
1950 			return SLJIT_SUCCESS;
1951 		}
1952 	}
1953 
1954 	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1955 	if (src2 & SLJIT_IMM) {
1956 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1957 		if (IS_HALFWORD(src2w) || compiler->mode32) {
1958 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
1959 			FAIL_IF(!inst);
1960 			*inst = GROUP_F7;
1961 		}
1962 		else {
1963 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1964 			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
1965 			FAIL_IF(!inst);
1966 			*inst = TEST_rm_r;
1967 		}
1968 #else
1969 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
1970 		FAIL_IF(!inst);
1971 		*inst = GROUP_F7;
1972 #endif
1973 	}
1974 	else {
1975 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1976 		FAIL_IF(!inst);
1977 		*inst = TEST_rm_r;
1978 	}
1979 	return SLJIT_SUCCESS;
1980 }
1981 
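/* Emits a shift. A variable shift count must be in ecx (SLJIT_PREF_SHIFT_REG),
   so when the count is neither an immediate nor already in ecx the previous
   ecx value is preserved around the shift (in another register, or in memory
   in the most general case) and restored afterwards. */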
1982 static sljit_s32 emit_shift(struct sljit_compiler *compiler,
1983 	sljit_u8 mode,
1984 	sljit_s32 dst, sljit_sw dstw,
1985 	sljit_s32 src1, sljit_sw src1w,
1986 	sljit_s32 src2, sljit_sw src2w)
1987 {
1988 	sljit_u8* inst;
1989 
1990 	if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
1991 		if (dst == src1 && dstw == src1w) {
1992 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
1993 			FAIL_IF(!inst);
1994 			*inst |= mode;
1995 			return SLJIT_SUCCESS;
1996 		}
1997 		if (dst == SLJIT_UNUSED) {
1998 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1999 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2000 			FAIL_IF(!inst);
2001 			*inst |= mode;
2002 			return SLJIT_SUCCESS;
2003 		}
2004 		if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
2005 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2006 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2007 			FAIL_IF(!inst);
2008 			*inst |= mode;
2009 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2010 			return SLJIT_SUCCESS;
2011 		}
2012 		if (FAST_IS_REG(dst)) {
2013 			EMIT_MOV(compiler, dst, 0, src1, src1w);
2014 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
2015 			FAIL_IF(!inst);
2016 			*inst |= mode;
2017 			return SLJIT_SUCCESS;
2018 		}
2019 
2020 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2021 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2022 		FAIL_IF(!inst);
2023 		*inst |= mode;
2024 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2025 		return SLJIT_SUCCESS;
2026 	}
2027 
2028 	if (dst == SLJIT_PREF_SHIFT_REG) {
2029 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2030 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2031 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2032 		FAIL_IF(!inst);
2033 		*inst |= mode;
2034 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2035 	}
2036 	else if (FAST_IS_REG(dst) && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
2037 		if (src1 != dst)
2038 			EMIT_MOV(compiler, dst, 0, src1, src1w);
2039 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2040 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2041 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
2042 		FAIL_IF(!inst);
2043 		*inst |= mode;
2044 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2045 	}
2046 	else {
2047 		/* This case is complex, since ecx itself may be used for
2048 		   addressing and that case must be supported as well. */
2049 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2050 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2051 		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
2052 #else
2053 		EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0);
2054 #endif
2055 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2056 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2057 		FAIL_IF(!inst);
2058 		*inst |= mode;
2059 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2060 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
2061 #else
2062 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), 0);
2063 #endif
2064 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2065 	}
2066 
2067 	return SLJIT_SUCCESS;
2068 }
2069 
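/* Wrapper around emit_shift that also takes care of the status flags. A shift
   by zero leaves the flags unchanged on x86, so a zero immediate count is
   turned into a plain move, or into an OR with zero when flags are requested;
   for register destinations with a variable count, the flags are
   re-established with a CMP against zero after the shift. */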
2070 static sljit_s32 emit_shift_with_flags(struct sljit_compiler *compiler,
2071 	sljit_u8 mode, sljit_s32 set_flags,
2072 	sljit_s32 dst, sljit_sw dstw,
2073 	sljit_s32 src1, sljit_sw src1w,
2074 	sljit_s32 src2, sljit_sw src2w)
2075 {
2076 	/* The CPU does not set flags if the shift count is 0. */
2077 	if (src2 & SLJIT_IMM) {
2078 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2079 		if ((src2w & 0x3f) != 0 || (compiler->mode32 && (src2w & 0x1f) != 0))
2080 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2081 #else
2082 		if ((src2w & 0x1f) != 0)
2083 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2084 #endif
2085 		if (!set_flags)
2086 			return emit_mov(compiler, dst, dstw, src1, src1w);
2087 		/* OR dst, src, 0 */
2088 		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
2089 			dst, dstw, src1, src1w, SLJIT_IMM, 0);
2090 	}
2091 
2092 	if (!set_flags)
2093 		return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2094 
2095 	if (!FAST_IS_REG(dst))
2096 		FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));
2097 
2098 	FAIL_IF(emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w));
2099 
2100 	if (FAST_IS_REG(dst))
2101 		return emit_cmp_binary(compiler, dst, dstw, SLJIT_IMM, 0);
2102 	return SLJIT_SUCCESS;
2103 }
2104 
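/* Dispatches the two-operand SLJIT opcodes to the emitters above. Flag-less
   additions (and subtractions of an immediate) are first tried as LEA, and
   when the result is unused SUB and AND degrade to CMP and TEST. */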
2105 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
2106 	sljit_s32 dst, sljit_sw dstw,
2107 	sljit_s32 src1, sljit_sw src1w,
2108 	sljit_s32 src2, sljit_sw src2w)
2109 {
2110 	CHECK_ERROR();
2111 	CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
2112 	ADJUST_LOCAL_OFFSET(dst, dstw);
2113 	ADJUST_LOCAL_OFFSET(src1, src1w);
2114 	ADJUST_LOCAL_OFFSET(src2, src2w);
2115 
2116 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2117 	CHECK_EXTRA_REGS(src1, src1w, (void)0);
2118 	CHECK_EXTRA_REGS(src2, src2w, (void)0);
2119 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2120 	compiler->mode32 = op & SLJIT_I32_OP;
2121 #endif
2122 
2123 	switch (GET_OPCODE(op)) {
2124 	case SLJIT_ADD:
2125 		if (!HAS_FLAGS(op)) {
2126 			if (emit_lea_binary(compiler, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
2127 				return compiler->error;
2128 		}
2129 		return emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
2130 			dst, dstw, src1, src1w, src2, src2w);
2131 	case SLJIT_ADDC:
2132 		return emit_cum_binary(compiler, ADC_r_rm, ADC_rm_r, ADC, ADC_EAX_i32,
2133 			dst, dstw, src1, src1w, src2, src2w);
2134 	case SLJIT_SUB:
2135 		if (!HAS_FLAGS(op)) {
2136 			if ((src2 & SLJIT_IMM) && emit_lea_binary(compiler, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
2137 				return compiler->error;
2138 		}
2139 
2140 		if (dst == SLJIT_UNUSED)
2141 			return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
2142 		return emit_non_cum_binary(compiler, SUB_r_rm, SUB_rm_r, SUB, SUB_EAX_i32,
2143 			dst, dstw, src1, src1w, src2, src2w);
2144 	case SLJIT_SUBC:
2145 		return emit_non_cum_binary(compiler, SBB_r_rm, SBB_rm_r, SBB, SBB_EAX_i32,
2146 			dst, dstw, src1, src1w, src2, src2w);
2147 	case SLJIT_MUL:
2148 		return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
2149 	case SLJIT_AND:
2150 		if (dst == SLJIT_UNUSED)
2151 			return emit_test_binary(compiler, src1, src1w, src2, src2w);
2152 		return emit_cum_binary(compiler, AND_r_rm, AND_rm_r, AND, AND_EAX_i32,
2153 			dst, dstw, src1, src1w, src2, src2w);
2154 	case SLJIT_OR:
2155 		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
2156 			dst, dstw, src1, src1w, src2, src2w);
2157 	case SLJIT_XOR:
2158 		return emit_cum_binary(compiler, XOR_r_rm, XOR_rm_r, XOR, XOR_EAX_i32,
2159 			dst, dstw, src1, src1w, src2, src2w);
2160 	case SLJIT_SHL:
2161 		return emit_shift_with_flags(compiler, SHL, HAS_FLAGS(op),
2162 			dst, dstw, src1, src1w, src2, src2w);
2163 	case SLJIT_LSHR:
2164 		return emit_shift_with_flags(compiler, SHR, HAS_FLAGS(op),
2165 			dst, dstw, src1, src1w, src2, src2w);
2166 	case SLJIT_ASHR:
2167 		return emit_shift_with_flags(compiler, SAR, HAS_FLAGS(op),
2168 			dst, dstw, src1, src1w, src2, src2w);
2169 	}
2170 
2171 	return SLJIT_SUCCESS;
2172 }
2173 
2174 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
2175 {
2176 	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
2177 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2178 	if (reg >= SLJIT_R3 && reg <= SLJIT_R8)
2179 		return -1;
2180 #endif
2181 	return reg_map[reg];
2182 }
2183 
2184 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
2185 {
2186 	CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
2187 	return reg;
2188 }
2189 
2190 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
2191 	void *instruction, sljit_s32 size)
2192 {
2193 	sljit_u8 *inst;
2194 
2195 	CHECK_ERROR();
2196 	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
2197 
2198 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
2199 	FAIL_IF(!inst);
2200 	INC_SIZE(size);
2201 	SLJIT_MEMCPY(inst, instruction, size);
2202 	return SLJIT_SUCCESS;
2203 }
2204 
2205 /* --------------------------------------------------------------------- */
2206 /*  Floating point operators                                             */
2207 /* --------------------------------------------------------------------- */
2208 
2209 /* Alignment + 2 * 16 bytes. */
2210 static sljit_s32 sse2_data[3 + (4 + 4) * 2];
2211 static sljit_s32 *sse2_buffer;
2212 
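/* sse2_buffer holds four 16-byte constants used by the SLJIT_NEG_F64 and
   SLJIT_ABS_F64 paths below: a sign-bit mask and a sign-clear mask, first for
   single precision (offsets 0 and 4) and then for double precision (offsets 8
   and 12). XORPD with the sign mask negates the value, ANDPD with the
   sign-clear mask takes its absolute value. */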
2213 static void init_compiler(void)
2214 {
2215 	sse2_buffer = (sljit_s32*)(((sljit_uw)sse2_data + 15) & ~0xf);
2216 	/* Single precision constants. */
2217 	sse2_buffer[0] = 0x80000000;
2218 	sse2_buffer[4] = 0x7fffffff;
2219 	/* Double precision constants. */
2220 	sse2_buffer[8] = 0;
2221 	sse2_buffer[9] = 0x80000000;
2222 	sse2_buffer[12] = 0xffffffff;
2223 	sse2_buffer[13] = 0x7fffffff;
2224 }
2225 
2226 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_is_fpu_available(void)
2227 {
2228 #ifdef SLJIT_IS_FPU_AVAILABLE
2229 	return SLJIT_IS_FPU_AVAILABLE;
2230 #elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
2231 	if (cpu_has_sse2 == -1)
2232 		get_cpu_features();
2233 	return cpu_has_sse2;
2234 #else /* SLJIT_DETECT_SSE2 */
2235 	return 1;
2236 #endif /* SLJIT_DETECT_SSE2 */
2237 }
2238 
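/* emit_sse2 emits the scalar move/arithmetic instructions (the F3 prefix
   selects single precision, F2 double); emit_sse2_logic emits the packed and
   compare instructions, where the optional 66 prefix selects the
   packed-double variant. */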
2239 static sljit_s32 emit_sse2(struct sljit_compiler *compiler, sljit_u8 opcode,
2240 	sljit_s32 single, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
2241 {
2242 	sljit_u8 *inst;
2243 
2244 	inst = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2245 	FAIL_IF(!inst);
2246 	*inst++ = GROUP_0F;
2247 	*inst = opcode;
2248 	return SLJIT_SUCCESS;
2249 }
2250 
2251 static sljit_s32 emit_sse2_logic(struct sljit_compiler *compiler, sljit_u8 opcode,
2252 	sljit_s32 pref66, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
2253 {
2254 	sljit_u8 *inst;
2255 
2256 	inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2257 	FAIL_IF(!inst);
2258 	*inst++ = GROUP_0F;
2259 	*inst = opcode;
2260 	return SLJIT_SUCCESS;
2261 }
2262 
2263 static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
2264 	sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
2265 {
2266 	return emit_sse2(compiler, MOVSD_x_xm, single, dst, src, srcw);
2267 }
2268 
2269 static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
2270 	sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src)
2271 {
2272 	return emit_sse2(compiler, MOVSD_xm_x, single, src, dst, dstw);
2273 }
2274 
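/* Float to integer conversion using the truncating CVTTSD2SI / CVTTSS2SI
   (the F2/F3 prefix selects double or single precision). */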
2275 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
2276 	sljit_s32 dst, sljit_sw dstw,
2277 	sljit_s32 src, sljit_sw srcw)
2278 {
2279 	sljit_s32 dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
2280 	sljit_u8 *inst;
2281 
2282 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2283 	if (GET_OPCODE(op) == SLJIT_CONV_SW_FROM_F64)
2284 		compiler->mode32 = 0;
2285 #endif
2286 
2287 	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP2, dst_r, 0, src, srcw);
2288 	FAIL_IF(!inst);
2289 	*inst++ = GROUP_0F;
2290 	*inst = CVTTSD2SI_r_xm;
2291 
2292 	if (dst_r == TMP_REG1 && dst != SLJIT_UNUSED)
2293 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2294 	return SLJIT_SUCCESS;
2295 }
2296 
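/* Integer to float conversion using CVTSI2SD / CVTSI2SS; immediates are first
   materialized in TMP_REG1 since the instruction has no immediate form. */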
2297 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_compiler *compiler, sljit_s32 op,
2298 	sljit_s32 dst, sljit_sw dstw,
2299 	sljit_s32 src, sljit_sw srcw)
2300 {
2301 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2302 	sljit_u8 *inst;
2303 
2304 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2305 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_SW)
2306 		compiler->mode32 = 0;
2307 #endif
2308 
2309 	if (src & SLJIT_IMM) {
2310 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2311 		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
2312 			srcw = (sljit_s32)srcw;
2313 #endif
2314 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
2315 		src = TMP_REG1;
2316 		srcw = 0;
2317 	}
2318 
2319 	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, src, srcw);
2320 	FAIL_IF(!inst);
2321 	*inst++ = GROUP_0F;
2322 	*inst = CVTSI2SD_x_rm;
2323 
2324 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2325 	compiler->mode32 = 1;
2326 #endif
2327 	if (dst_r == TMP_FREG)
2328 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2329 	return SLJIT_SUCCESS;
2330 }
2331 
2332 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_s32 op,
2333 	sljit_s32 src1, sljit_sw src1w,
2334 	sljit_s32 src2, sljit_sw src2w)
2335 {
2336 	if (!FAST_IS_REG(src1)) {
2337 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2338 		src1 = TMP_FREG;
2339 	}
2340 	return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_F32_OP), src1, src2, src2w);
2341 }
2342 
2343 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op,
2344 	sljit_s32 dst, sljit_sw dstw,
2345 	sljit_s32 src, sljit_sw srcw)
2346 {
2347 	sljit_s32 dst_r;
2348 
2349 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2350 	compiler->mode32 = 1;
2351 #endif
2352 
2353 	CHECK_ERROR();
2354 	SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);
2355 
2356 	if (GET_OPCODE(op) == SLJIT_MOV_F64) {
2357 		if (FAST_IS_REG(dst))
2358 			return emit_sse2_load(compiler, op & SLJIT_F32_OP, dst, src, srcw);
2359 		if (FAST_IS_REG(src))
2360 			return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, src);
2361 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src, srcw));
2362 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2363 	}
2364 
2365 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32) {
2366 		dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2367 		if (FAST_IS_REG(src)) {
2368 			/* We overwrite the high bits of the source. From the SLJIT point of view,
2369 			   this is not an issue.
2370 			   Note: In SSE3, we could also use MOVDDUP and MOVSLDUP. */
2371 			FAIL_IF(emit_sse2_logic(compiler, UNPCKLPD_x_xm, op & SLJIT_F32_OP, src, src, 0));
2372 		}
2373 		else {
2374 			FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_F32_OP), TMP_FREG, src, srcw));
2375 			src = TMP_FREG;
2376 		}
2377 
2378 		FAIL_IF(emit_sse2_logic(compiler, CVTPD2PS_x_xm, op & SLJIT_F32_OP, dst_r, src, 0));
2379 		if (dst_r == TMP_FREG)
2380 			return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2381 		return SLJIT_SUCCESS;
2382 	}
2383 
2384 	if (SLOW_IS_REG(dst)) {
2385 		dst_r = dst;
2386 		if (dst != src)
2387 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
2388 	}
2389 	else {
2390 		dst_r = TMP_FREG;
2391 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
2392 	}
2393 
2394 	switch (GET_OPCODE(op)) {
2395 	case SLJIT_NEG_F64:
2396 		FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer : sse2_buffer + 8)));
2397 		break;
2398 
2399 	case SLJIT_ABS_F64:
2400 		FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer + 4 : sse2_buffer + 12)));
2401 		break;
2402 	}
2403 
2404 	if (dst_r == TMP_FREG)
2405 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2406 	return SLJIT_SUCCESS;
2407 }
2408 
2409 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compiler, sljit_s32 op,
2410 	sljit_s32 dst, sljit_sw dstw,
2411 	sljit_s32 src1, sljit_sw src1w,
2412 	sljit_s32 src2, sljit_sw src2w)
2413 {
2414 	sljit_s32 dst_r;
2415 
2416 	CHECK_ERROR();
2417 	CHECK(check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
2418 	ADJUST_LOCAL_OFFSET(dst, dstw);
2419 	ADJUST_LOCAL_OFFSET(src1, src1w);
2420 	ADJUST_LOCAL_OFFSET(src2, src2w);
2421 
2422 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2423 	compiler->mode32 = 1;
2424 #endif
2425 
2426 	if (FAST_IS_REG(dst)) {
2427 		dst_r = dst;
2428 		if (dst == src1)
2429 			; /* Do nothing here. */
2430 		else if (dst == src2 && (op == SLJIT_ADD_F64 || op == SLJIT_MUL_F64)) {
2431 			/* Swap arguments. */
2432 			src2 = src1;
2433 			src2w = src1w;
2434 		}
2435 		else if (dst != src2)
2436 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src1, src1w));
2437 		else {
2438 			dst_r = TMP_FREG;
2439 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2440 		}
2441 	}
2442 	else {
2443 		dst_r = TMP_FREG;
2444 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2445 	}
2446 
2447 	switch (GET_OPCODE(op)) {
2448 	case SLJIT_ADD_F64:
2449 		FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2450 		break;
2451 
2452 	case SLJIT_SUB_F64:
2453 		FAIL_IF(emit_sse2(compiler, SUBSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2454 		break;
2455 
2456 	case SLJIT_MUL_F64:
2457 		FAIL_IF(emit_sse2(compiler, MULSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2458 		break;
2459 
2460 	case SLJIT_DIV_F64:
2461 		FAIL_IF(emit_sse2(compiler, DIVSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2462 		break;
2463 	}
2464 
2465 	if (dst_r == TMP_FREG)
2466 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2467 	return SLJIT_SUCCESS;
2468 }
2469 
2470 /* --------------------------------------------------------------------- */
2471 /*  Conditional instructions                                             */
2472 /* --------------------------------------------------------------------- */
2473 
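/* Labels, jumps and constants are recorded in the instruction stream as
   two-byte markers (a zero byte followed by a tag byte) and are resolved later
   during code generation; only the worst-case encoding size of each jump is
   reserved here. */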
2474 SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
2475 {
2476 	sljit_u8 *inst;
2477 	struct sljit_label *label;
2478 
2479 	CHECK_ERROR_PTR();
2480 	CHECK_PTR(check_sljit_emit_label(compiler));
2481 
2482 	if (compiler->last_label && compiler->last_label->size == compiler->size)
2483 		return compiler->last_label;
2484 
2485 	label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
2486 	PTR_FAIL_IF(!label);
2487 	set_label(label, compiler);
2488 
2489 	inst = (sljit_u8*)ensure_buf(compiler, 2);
2490 	PTR_FAIL_IF(!inst);
2491 
2492 	*inst++ = 0;
2493 	*inst++ = 0;
2494 
2495 	return label;
2496 }
2497 
2498 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
2499 {
2500 	sljit_u8 *inst;
2501 	struct sljit_jump *jump;
2502 
2503 	CHECK_ERROR_PTR();
2504 	CHECK_PTR(check_sljit_emit_jump(compiler, type));
2505 
2506 	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2507 	PTR_FAIL_IF_NULL(jump);
2508 	set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
2509 	type &= 0xff;
2510 
2511 	if (type >= SLJIT_CALL1)
2512 		PTR_FAIL_IF(call_with_args(compiler, type));
2513 
2514 	/* Worst case size. */
2515 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2516 	compiler->size += (type >= SLJIT_JUMP) ? 5 : 6;
2517 #else
2518 	compiler->size += (type >= SLJIT_JUMP) ? (10 + 3) : (2 + 10 + 3);
2519 #endif
2520 
2521 	inst = (sljit_u8*)ensure_buf(compiler, 2);
2522 	PTR_FAIL_IF_NULL(inst);
2523 
2524 	*inst++ = 0;
2525 	*inst++ = type + 2;
2526 	return jump;
2527 }
2528 
2529 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
2530 {
2531 	sljit_u8 *inst;
2532 	struct sljit_jump *jump;
2533 
2534 	CHECK_ERROR();
2535 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
2536 	ADJUST_LOCAL_OFFSET(src, srcw);
2537 
2538 	CHECK_EXTRA_REGS(src, srcw, (void)0);
2539 
2540 	if (type >= SLJIT_CALL1) {
2541 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2542 #if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
2543 		if (src == SLJIT_R2) {
2544 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
2545 			src = TMP_REG1;
2546 		}
2547 		if (src == SLJIT_MEM1(SLJIT_SP) && type >= SLJIT_CALL3)
2548 			srcw += sizeof(sljit_sw);
2549 #endif
2550 #endif
2551 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && defined(_WIN64)
2552 		if (src == SLJIT_R2) {
2553 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
2554 			src = TMP_REG1;
2555 		}
2556 #endif
2557 		FAIL_IF(call_with_args(compiler, type));
2558 	}
2559 
2560 	if (src == SLJIT_IMM) {
2561 		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2562 		FAIL_IF_NULL(jump);
2563 		set_jump(jump, compiler, JUMP_ADDR);
2564 		jump->u.target = srcw;
2565 
2566 		/* Worst case size. */
2567 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2568 		compiler->size += 5;
2569 #else
2570 		compiler->size += 10 + 3;
2571 #endif
2572 
2573 		inst = (sljit_u8*)ensure_buf(compiler, 2);
2574 		FAIL_IF_NULL(inst);
2575 
2576 		*inst++ = 0;
2577 		*inst++ = type + 2;
2578 	}
2579 	else {
2580 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2581 		/* REX_W is not necessary (src is not immediate). */
2582 		compiler->mode32 = 1;
2583 #endif
2584 		inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
2585 		FAIL_IF(!inst);
2586 		*inst++ = GROUP_FF;
2587 		*inst |= (type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm;
2588 	}
2589 	return SLJIT_SUCCESS;
2590 }
2591 
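/* Materializes a condition flag into a register or memory operand. The flag is
   produced with SETcc and widened with MOVZX; on 32-bit targets, registers
   without an addressable low byte are handled through CMOV or by temporarily
   borrowing eax (or ecx) with XCHG. */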
2592 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
2593 	sljit_s32 dst, sljit_sw dstw,
2594 	sljit_s32 src, sljit_sw srcw,
2595 	sljit_s32 type)
2596 {
2597 	sljit_u8 *inst;
2598 	sljit_u8 cond_set = 0;
2599 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2600 	sljit_s32 reg;
2601 #endif
2602 	/* ADJUST_LOCAL_OFFSET and CHECK_EXTRA_REGS might overwrite these values. */
2603 	sljit_s32 dst_save = dst;
2604 	sljit_sw dstw_save = dstw;
2605 
2606 	CHECK_ERROR();
2607 	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
2608 	SLJIT_UNUSED_ARG(srcw);
2609 
2610 	if (dst == SLJIT_UNUSED)
2611 		return SLJIT_SUCCESS;
2612 
2613 	ADJUST_LOCAL_OFFSET(dst, dstw);
2614 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2615 
2616 	type &= 0xff;
2617 	/* setcc = jcc + 0x10. */
2618 	cond_set = get_jump_code(type) + 0x10;
2619 
2620 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2621 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src) {
2622 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 3);
2623 		FAIL_IF(!inst);
2624 		INC_SIZE(4 + 3);
2625 		/* Set low register to conditional flag. */
2626 		*inst++ = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
2627 		*inst++ = GROUP_0F;
2628 		*inst++ = cond_set;
2629 		*inst++ = MOD_REG | reg_lmap[TMP_REG1];
2630 		*inst++ = REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B);
2631 		*inst++ = OR_rm8_r8;
2632 		*inst++ = MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst];
2633 		return SLJIT_SUCCESS;
2634 	}
2635 
2636 	reg = (op == SLJIT_MOV && FAST_IS_REG(dst)) ? dst : TMP_REG1;
2637 
2638 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 4);
2639 	FAIL_IF(!inst);
2640 	INC_SIZE(4 + 4);
2641 	/* Set low register to conditional flag. */
2642 	*inst++ = (reg_map[reg] <= 7) ? REX : REX_B;
2643 	*inst++ = GROUP_0F;
2644 	*inst++ = cond_set;
2645 	*inst++ = MOD_REG | reg_lmap[reg];
2646 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
2647 	/* The movzx instruction does not affect flags. */
2648 	*inst++ = GROUP_0F;
2649 	*inst++ = MOVZX_r_rm8;
2650 	*inst = MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg];
2651 
2652 	if (reg != TMP_REG1)
2653 		return SLJIT_SUCCESS;
2654 
2655 	if (GET_OPCODE(op) < SLJIT_ADD) {
2656 		compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
2657 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2658 	}
2659 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
2660 		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
2661 	compiler->skip_checks = 1;
2662 #endif
2663 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
2664 
2665 #else
2666 	/* The SLJIT_CONFIG_X86_32 code path starts here. */
2667 	if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) {
2668 		if (reg_map[dst] <= 4) {
2669 			/* Low byte is accessible. */
2670 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3);
2671 			FAIL_IF(!inst);
2672 			INC_SIZE(3 + 3);
2673 			/* Set low byte to conditional flag. */
2674 			*inst++ = GROUP_0F;
2675 			*inst++ = cond_set;
2676 			*inst++ = MOD_REG | reg_map[dst];
2677 
2678 			*inst++ = GROUP_0F;
2679 			*inst++ = MOVZX_r_rm8;
2680 			*inst = MOD_REG | (reg_map[dst] << 3) | reg_map[dst];
2681 			return SLJIT_SUCCESS;
2682 		}
2683 
2684 		/* Low byte is not accessible. */
2685 		if (cpu_has_cmov == -1)
2686 			get_cpu_features();
2687 
2688 		if (cpu_has_cmov) {
2689 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
2690 			/* a xor reg, reg operation would overwrite the flags. */
2691 			EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);
2692 
2693 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 3);
2694 			FAIL_IF(!inst);
2695 			INC_SIZE(3);
2696 
2697 			*inst++ = GROUP_0F;
2698 			/* cmovcc = setcc - 0x50. */
2699 			*inst++ = cond_set - 0x50;
2700 			*inst++ = MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REG1];
2701 			return SLJIT_SUCCESS;
2702 		}
2703 
2704 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
2705 		FAIL_IF(!inst);
2706 		INC_SIZE(1 + 3 + 3 + 1);
2707 		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2708 		/* Set al to conditional flag. */
2709 		*inst++ = GROUP_0F;
2710 		*inst++ = cond_set;
2711 		*inst++ = MOD_REG | 0 /* eax */;
2712 
2713 		*inst++ = GROUP_0F;
2714 		*inst++ = MOVZX_r_rm8;
2715 		*inst++ = MOD_REG | (reg_map[dst] << 3) | 0 /* eax */;
2716 		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2717 		return SLJIT_SUCCESS;
2718 	}
2719 
2720 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src && reg_map[dst] <= 4) {
2721 		SLJIT_ASSERT(reg_map[SLJIT_R0] == 0);
2722 
2723 		if (dst != SLJIT_R0) {
2724 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 2 + 1);
2725 			FAIL_IF(!inst);
2726 			INC_SIZE(1 + 3 + 2 + 1);
2727 			/* Set low register to conditional flag. */
2728 			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2729 			*inst++ = GROUP_0F;
2730 			*inst++ = cond_set;
2731 			*inst++ = MOD_REG | 0 /* eax */;
2732 			*inst++ = OR_rm8_r8;
2733 			*inst++ = MOD_REG | (0 /* eax */ << 3) | reg_map[dst];
2734 			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2735 		}
2736 		else {
2737 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 2 + 3 + 2 + 2);
2738 			FAIL_IF(!inst);
2739 			INC_SIZE(2 + 3 + 2 + 2);
2740 			/* Set low register to conditional flag. */
2741 			*inst++ = XCHG_r_rm;
2742 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
2743 			*inst++ = GROUP_0F;
2744 			*inst++ = cond_set;
2745 			*inst++ = MOD_REG | 1 /* ecx */;
2746 			*inst++ = OR_rm8_r8;
2747 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | 0 /* eax */;
2748 			*inst++ = XCHG_r_rm;
2749 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
2750 		}
2751 		return SLJIT_SUCCESS;
2752 	}
2753 
2754 	/* Set TMP_REG1 to the bit. */
2755 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
2756 	FAIL_IF(!inst);
2757 	INC_SIZE(1 + 3 + 3 + 1);
2758 	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2759 	/* Set al to conditional flag. */
2760 	*inst++ = GROUP_0F;
2761 	*inst++ = cond_set;
2762 	*inst++ = MOD_REG | 0 /* eax */;
2763 
2764 	*inst++ = GROUP_0F;
2765 	*inst++ = MOVZX_r_rm8;
2766 	*inst++ = MOD_REG | (0 << 3) /* eax */ | 0 /* eax */;
2767 
2768 	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2769 
2770 	if (GET_OPCODE(op) < SLJIT_ADD)
2771 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2772 
2773 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
2774 		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
2775 	compiler->skip_checks = 1;
2776 #endif
2777 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
2778 #endif /* SLJIT_CONFIG_X86_64 */
2779 }
2780 
2781 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
2782 {
2783 	CHECK_ERROR();
2784 	CHECK(check_sljit_get_local_base(compiler, dst, dstw, offset));
2785 	ADJUST_LOCAL_OFFSET(dst, dstw);
2786 
2787 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2788 
2789 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2790 	compiler->mode32 = 0;
2791 #endif
2792 
2793 	ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_SP), offset);
2794 
2795 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2796 	if (NOT_HALFWORD(offset)) {
2797 		FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
2798 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2799 		SLJIT_ASSERT(emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
2800 		return compiler->error;
2801 #else
2802 		return emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0);
2803 #endif
2804 	}
2805 #endif
2806 
2807 	if (offset != 0)
2808 		return emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, SLJIT_IMM, offset);
2809 	return emit_mov(compiler, dst, dstw, SLJIT_SP, 0);
2810 }
2811 
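/* Emits a move of init_value whose immediate can later be rewritten in place
   by sljit_set_const; a marker is emitted so that the address of the constant
   can be recorded during code generation. */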
2812 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
2813 {
2814 	sljit_u8 *inst;
2815 	struct sljit_const *const_;
2816 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2817 	sljit_s32 reg;
2818 #endif
2819 
2820 	CHECK_ERROR_PTR();
2821 	CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
2822 	ADJUST_LOCAL_OFFSET(dst, dstw);
2823 
2824 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2825 
2826 	const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
2827 	PTR_FAIL_IF(!const_);
2828 	set_const(const_, compiler);
2829 
2830 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2831 	compiler->mode32 = 0;
2832 	reg = SLOW_IS_REG(dst) ? dst : TMP_REG1;
2833 
2834 	if (emit_load_imm64(compiler, reg, init_value))
2835 		return NULL;
2836 #else
2837 	if (dst == SLJIT_UNUSED)
2838 		dst = TMP_REG1;
2839 
2840 	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
2841 		return NULL;
2842 #endif
2843 
2844 	inst = (sljit_u8*)ensure_buf(compiler, 2);
2845 	PTR_FAIL_IF(!inst);
2846 
2847 	*inst++ = 0;
2848 	*inst++ = 1;
2849 
2850 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2851 	if (dst & SLJIT_MEM)
2852 		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
2853 			return NULL;
2854 #endif
2855 
2856 	return const_;
2857 }
2858 
2859 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
2860 {
2861 	SLJIT_UNUSED_ARG(executable_offset);
2862 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2863 	sljit_unaligned_store_sw((void*)addr, new_target - (addr + 4) - (sljit_uw)executable_offset);
2864 #else
2865 	sljit_unaligned_store_sw((void*)addr, (sljit_sw) new_target);
2866 #endif
2867 }
2868 
2869 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant, sljit_sw executable_offset)
2870 {
2871 	SLJIT_UNUSED_ARG(executable_offset);
2872 	sljit_unaligned_store_sw((void*)addr, new_constant);
2873 }
2874 
2875 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_x86_is_sse2_available(void)
2876 {
2877 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
2878 	if (cpu_has_sse2 == -1)
2879 		get_cpu_features();
2880 	return cpu_has_sse2;
2881 #else
2882 	return 1;
2883 #endif
2884 }
2885 
2886 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_x86_is_cmov_available(void)
2887 {
2888 	if (cpu_has_cmov == -1)
2889 		get_cpu_features();
2890 	return cpu_has_cmov;
2891 }
2892 
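/* x86 specific CMOVcc extension. An immediate source is first loaded into
   TMP_REG1 because CMOV has no immediate form; the opcode is derived from the
   jump condition code (cmovcc = jcc - 0x40). */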
2893 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_x86_emit_cmov(struct sljit_compiler *compiler,
2894 	sljit_s32 type,
2895 	sljit_s32 dst_reg,
2896 	sljit_s32 src, sljit_sw srcw)
2897 {
2898 	sljit_u8* inst;
2899 
2900 	CHECK_ERROR();
2901 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
2902 	CHECK_ARGUMENT(sljit_x86_is_cmov_available());
2903 	CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_I32_OP)));
2904 	CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL && (type & 0xff) <= SLJIT_ORDERED_F64);
2905 	CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg & ~SLJIT_I32_OP));
2906 	FUNCTION_CHECK_SRC(src, srcw);
2907 
2908 	if ((type & 0xff) <= SLJIT_NOT_ZERO)
2909 		CHECK_ARGUMENT(compiler->last_flags & SLJIT_SET_Z);
2910 	else
2911 		CHECK_ARGUMENT((type & 0xff) == (compiler->last_flags & 0xff));
2912 #endif
2913 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
2914 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
2915 		fprintf(compiler->verbose, "  x86_cmov%s %s%s, ",
2916 			!(dst_reg & SLJIT_I32_OP) ? "" : ".i",
2917 			jump_names[type & 0xff], JUMP_POSTFIX(type));
2918 		sljit_verbose_reg(compiler, dst_reg & ~SLJIT_I32_OP);
2919 		fprintf(compiler->verbose, ", ");
2920 		sljit_verbose_param(compiler, src, srcw);
2921 		fprintf(compiler->verbose, "\n");
2922 	}
2923 #endif
2924 
2925 	ADJUST_LOCAL_OFFSET(src, srcw);
2926 	CHECK_EXTRA_REGS(src, srcw, (void)0);
2927 
2928 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2929 	compiler->mode32 = dst_reg & SLJIT_I32_OP;
2930 #endif
2931 	dst_reg &= ~SLJIT_I32_OP;
2932 
2933 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
2934 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
2935 		src = TMP_REG1;
2936 		srcw = 0;
2937 	}
2938 
2939 	inst = emit_x86_instruction(compiler, 2, dst_reg, 0, src, srcw);
2940 	FAIL_IF(!inst);
2941 	*inst++ = GROUP_0F;
2942 	*inst = get_jump_code(type & 0xff) - 0x40;
2943 	return SLJIT_SUCCESS;
2944 }
2945