xref: /netbsd-src/sys/external/bsd/sljit/dist/sljit_src/sljitNativeX86_common.c (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
1 /*	$NetBSD: sljitNativeX86_common.c,v 1.8 2016/05/29 17:09:33 alnsn Exp $	*/
2 
3 /*
4  *    Stack-less Just-In-Time compiler
5  *
6  *    Copyright 2009-2012 Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without modification, are
9  * permitted provided that the following conditions are met:
10  *
11  *   1. Redistributions of source code must retain the above copyright notice, this list of
12  *      conditions and the following disclaimer.
13  *
14  *   2. Redistributions in binary form must reproduce the above copyright notice, this list
15  *      of conditions and the following disclaimer in the documentation and/or other materials
16  *      provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
19  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
21  * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
23  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
24  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
30 {
31 	return "x86" SLJIT_CPUINFO;
32 }
33 
34 /*
35    32b register indexes:
36      0 - EAX
37      1 - ECX
38      2 - EDX
39      3 - EBX
40      4 - none
41      5 - EBP
42      6 - ESI
43      7 - EDI
44 */
45 
46 /*
47    64b register indexes:
48      0 - RAX
49      1 - RCX
50      2 - RDX
51      3 - RBX
52      4 - none
53      5 - RBP
54      6 - RSI
55      7 - RDI
56      8 - R8   - From now on REX prefix is required
57      9 - R9
58     10 - R10
59     11 - R11
60     12 - R12
61     13 - R13
62     14 - R14
63     15 - R15
64 */
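/* Illustrative encoding example (a sketch, not emitted verbatim by this
   file): moving rax into r8 ("mov r8, rax" in Intel syntax) assembles to
   49 89 c0, i.e. REX.W|REX.B (0x49), MOV_rm_r (0x89), and a ModR/M byte
   0xc0 | (0 << 3) | 0, where rax fills the reg field and r8 (index 8,
   low bits 000) fills the r/m field. */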
65 
66 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
67 
68 /* Last register + 1. */
69 #define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
70 
71 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
72 	0, 0, 2, 1, 0, 0, 0, 0, 7, 6, 3, 4, 5
73 };
74 
75 #define CHECK_EXTRA_REGS(p, w, do) \
76 	if (p >= SLJIT_R3 && p <= SLJIT_R6) { \
77 		w = SLJIT_LOCALS_OFFSET + ((p) - (SLJIT_R3 + 4)) * sizeof(sljit_sw); \
78 		p = SLJIT_MEM1(SLJIT_SP); \
79 		do; \
80 	}
81 
82 #else /* SLJIT_CONFIG_X86_32 */
83 
84 /* Last register + 1. */
85 #define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
86 #define TMP_REG2	(SLJIT_NUMBER_OF_REGISTERS + 3)
87 #define TMP_REG3	(SLJIT_NUMBER_OF_REGISTERS + 4)
88 
89 /* Note: r12 & 0x7 == 0b100, which is decoded as "SIB byte present".
90    Note: avoid using r12 and r13 for memory addressing;
91    therefore r12 is better suited for SAVED_EREG than SAVED_REG. */
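/* For illustration: "mov rax, [r12]" requires a SIB byte (49 8b 04 24),
   and "mov rax, [r13]" requires a dummy 8-bit displacement (49 8b 45 00),
   because r13 & 0x7 == 0b101 in mod 00 would otherwise select
   RIP-relative/disp32 addressing. */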
92 #ifndef _WIN64
93 /* 1st argument passed in rdi, 2nd in rsi, 3rd in rdx. */
94 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
95 	0, 0, 6, 1, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 7, 9
96 };
97 /* low-map. reg_map & 0x7. */
98 static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
99 	0, 0, 6, 1, 0, 3,  2,  4,  5,  5,  6,  7, 3, 4, 2, 7, 1
100 };
101 #else
102 /* 1st argument passed in rcx, 2nd in rdx, 3rd in r8. */
103 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
104 	0, 0, 2, 1, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 10, 8, 9
105 };
106 /* low-map. reg_map & 0x7. */
107 static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
108 	0, 0, 2, 1, 3,  4,  5,  5, 6,  7,  7, 6, 3, 4, 2,  0, 1
109 };
110 #endif
111 
112 #define REX_W		0x48
113 #define REX_R		0x44
114 #define REX_X		0x42
115 #define REX_B		0x41
116 #define REX		0x40
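/* The REX prefix has the bit layout 0100WRXB: W selects a 64-bit operand
   size, while R, X and B extend the ModR/M reg field, the SIB index and
   the ModR/M r/m (or SIB base) field, respectively. */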
117 
118 #ifndef _WIN64
119 #define HALFWORD_MAX 0x7fffffffl
120 #define HALFWORD_MIN -0x80000000l
121 #else
122 #define HALFWORD_MAX 0x7fffffffll
123 #define HALFWORD_MIN -0x80000000ll
124 #endif
125 
126 #define IS_HALFWORD(x)		((x) <= HALFWORD_MAX && (x) >= HALFWORD_MIN)
127 #define NOT_HALFWORD(x)		((x) > HALFWORD_MAX || (x) < HALFWORD_MIN)
128 
129 #define CHECK_EXTRA_REGS(p, w, do)
130 
131 #endif /* SLJIT_CONFIG_X86_32 */
132 
133 #define TMP_FREG	(0)
134 
135 /* Size flags for emit_x86_instruction: */
136 #define EX86_BIN_INS		0x0010
137 #define EX86_SHIFT_INS		0x0020
138 #define EX86_REX		0x0040
139 #define EX86_NO_REXW		0x0080
140 #define EX86_BYTE_ARG		0x0100
141 #define EX86_HALF_ARG		0x0200
142 #define EX86_PREF_66		0x0400
143 #define EX86_PREF_F2		0x0800
144 #define EX86_PREF_F3		0x1000
145 #define EX86_SSE2_OP1		0x2000
146 #define EX86_SSE2_OP2		0x4000
147 #define EX86_SSE2		(EX86_SSE2_OP1 | EX86_SSE2_OP2)
148 
149 /* --------------------------------------------------------------------- */
150 /*  Instruction forms                                                    */
151 /* --------------------------------------------------------------------- */
152 
153 #define ADD		(/* BINARY */ 0 << 3)
154 #define ADD_EAX_i32	0x05
155 #define ADD_r_rm	0x03
156 #define ADD_rm_r	0x01
157 #define ADDSD_x_xm	0x58
158 #define ADC		(/* BINARY */ 2 << 3)
159 #define ADC_EAX_i32	0x15
160 #define ADC_r_rm	0x13
161 #define ADC_rm_r	0x11
162 #define AND		(/* BINARY */ 4 << 3)
163 #define AND_EAX_i32	0x25
164 #define AND_r_rm	0x23
165 #define AND_rm_r	0x21
166 #define ANDPD_x_xm	0x54
167 #define BSR_r_rm	(/* GROUP_0F */ 0xbd)
168 #define CALL_i32	0xe8
169 #define CALL_rm		(/* GROUP_FF */ 2 << 3)
170 #define CDQ		0x99
171 #define CMOVNE_r_rm	(/* GROUP_0F */ 0x45)
172 #define CMP		(/* BINARY */ 7 << 3)
173 #define CMP_EAX_i32	0x3d
174 #define CMP_r_rm	0x3b
175 #define CMP_rm_r	0x39
176 #define CVTPD2PS_x_xm	0x5a
177 #define CVTSI2SD_x_rm	0x2a
178 #define CVTTSD2SI_r_xm	0x2c
179 #define DIV		(/* GROUP_F7 */ 6 << 3)
180 #define DIVSD_x_xm	0x5e
181 #define INT3		0xcc
182 #define IDIV		(/* GROUP_F7 */ 7 << 3)
183 #define IMUL		(/* GROUP_F7 */ 5 << 3)
184 #define IMUL_r_rm	(/* GROUP_0F */ 0xaf)
185 #define IMUL_r_rm_i8	0x6b
186 #define IMUL_r_rm_i32	0x69
187 #define JE_i8		0x74
188 #define JNE_i8		0x75
189 #define JMP_i8		0xeb
190 #define JMP_i32		0xe9
191 #define JMP_rm		(/* GROUP_FF */ 4 << 3)
192 #define LEA_r_m		0x8d
193 #define MOV_r_rm	0x8b
194 #define MOV_r_i32	0xb8
195 #define MOV_rm_r	0x89
196 #define MOV_rm_i32	0xc7
197 #define MOV_rm8_i8	0xc6
198 #define MOV_rm8_r8	0x88
199 #define MOVSD_x_xm	0x10
200 #define MOVSD_xm_x	0x11
201 #define MOVSXD_r_rm	0x63
202 #define MOVSX_r_rm8	(/* GROUP_0F */ 0xbe)
203 #define MOVSX_r_rm16	(/* GROUP_0F */ 0xbf)
204 #define MOVZX_r_rm8	(/* GROUP_0F */ 0xb6)
205 #define MOVZX_r_rm16	(/* GROUP_0F */ 0xb7)
206 #define MUL		(/* GROUP_F7 */ 4 << 3)
207 #define MULSD_x_xm	0x59
208 #define NEG_rm		(/* GROUP_F7 */ 3 << 3)
209 #define NOP		0x90
210 #define NOT_rm		(/* GROUP_F7 */ 2 << 3)
211 #define OR		(/* BINARY */ 1 << 3)
212 #define OR_r_rm		0x0b
213 #define OR_EAX_i32	0x0d
214 #define OR_rm_r		0x09
215 #define OR_rm8_r8	0x08
216 #define POP_r		0x58
217 #define POP_rm		0x8f
218 #define POPF		0x9d
219 #define PUSH_i32	0x68
220 #define PUSH_r		0x50
221 #define PUSH_rm		(/* GROUP_FF */ 6 << 3)
222 #define PUSHF		0x9c
223 #define RET_near	0xc3
224 #define RET_i16		0xc2
225 #define SBB		(/* BINARY */ 3 << 3)
226 #define SBB_EAX_i32	0x1d
227 #define SBB_r_rm	0x1b
228 #define SBB_rm_r	0x19
229 #define SAR		(/* SHIFT */ 7 << 3)
230 #define SHL		(/* SHIFT */ 4 << 3)
231 #define SHR		(/* SHIFT */ 5 << 3)
232 #define SUB		(/* BINARY */ 5 << 3)
233 #define SUB_EAX_i32	0x2d
234 #define SUB_r_rm	0x2b
235 #define SUB_rm_r	0x29
236 #define SUBSD_x_xm	0x5c
237 #define TEST_EAX_i32	0xa9
238 #define TEST_rm_r	0x85
239 #define UCOMISD_x_xm	0x2e
240 #define UNPCKLPD_x_xm	0x14
241 #define XCHG_EAX_r	0x90
242 #define XCHG_r_rm	0x87
243 #define XOR		(/* BINARY */ 6 << 3)
244 #define XOR_EAX_i32	0x35
245 #define XOR_r_rm	0x33
246 #define XOR_rm_r	0x31
247 #define XORPD_x_xm	0x57
248 
249 #define GROUP_0F	0x0f
250 #define GROUP_F7	0xf7
251 #define GROUP_FF	0xff
252 #define GROUP_BINARY_81	0x81
253 #define GROUP_BINARY_83	0x83
254 #define GROUP_SHIFT_1	0xd1
255 #define GROUP_SHIFT_N	0xc1
256 #define GROUP_SHIFT_CL	0xd3
257 
258 #define MOD_REG		0xc0
259 #define MOD_DISP8	0x40
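/* A sketch of the "/digit" convention used by the groups above: in the
   single-operand groups the reg field of the ModR/M byte selects the
   operation. For example, "not eax" is GROUP_F7 followed by
   MOD_REG | NOT_rm | 0, i.e. the bytes f7 d0. */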
260 
261 #define INC_SIZE(s)			(*inst++ = (s), compiler->size += (s))
262 
263 #define PUSH_REG(r)			(*inst++ = (PUSH_r + (r)))
264 #define POP_REG(r)			(*inst++ = (POP_r + (r)))
265 #define RET()				(*inst++ = (RET_near))
266 #define RET_I16(n)			(*inst++ = (RET_i16), *inst++ = n, *inst++ = 0)
267 /* r32, r/m32 */
268 #define MOV_RM(mod, reg, rm)		(*inst++ = (MOV_r_rm), *inst++ = (mod) << 6 | (reg) << 3 | (rm))
269 
270 /* Multithreading does not affect these static variables, since they store
271    built-in CPU features. They can therefore safely be overwritten by
272    different threads that detect the CPU features at the same time. */
273 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
274 static sljit_s32 cpu_has_sse2 = -1;
275 #endif
276 static sljit_s32 cpu_has_cmov = -1;
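/* The intended usage pattern is lazy detection (see emit_clz below):

	if (cpu_has_cmov == -1)
		get_cpu_features();
	if (cpu_has_cmov) {
		... emit cmov ...
	}
*/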
277 
278 #ifdef _WIN32_WCE
279 #include <cmnintrin.h>
280 #elif defined(_MSC_VER) && _MSC_VER >= 1400
281 #include <intrin.h>
282 #endif
283 
284 static void get_cpu_features(void)
285 {
286 	sljit_u32 features;
287 
288 #if defined(_MSC_VER) && _MSC_VER >= 1400
289 
290 	int CPUInfo[4];
291 	__cpuid(CPUInfo, 1);
292 	features = (sljit_u32)CPUInfo[3];
293 
294 #elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C)
295 
296 	/* AT&T syntax. */
297 	__asm__ (
298 		"movl $0x1, %%eax\n"
299 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
300 		/* On x86-32, there is no red zone, so this
301 		   should work (no need for a local variable). */
302 		"push %%ebx\n"
303 #endif
304 		"cpuid\n"
305 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
306 		"pop %%ebx\n"
307 #endif
308 		"movl %%edx, %0\n"
309 		: "=g" (features)
310 		:
311 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
312 		: "%eax", "%ecx", "%edx"
313 #else
314 		: "%rax", "%rbx", "%rcx", "%rdx"
315 #endif
316 	);
317 
318 #else /* _MSC_VER && _MSC_VER >= 1400 */
319 
320 	/* Intel syntax. */
321 	__asm {
322 		mov eax, 1
323 		cpuid
324 		mov features, edx
325 	}
326 
327 #endif /* _MSC_VER && _MSC_VER >= 1400 */
328 
329 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
330 	cpu_has_sse2 = (features >> 26) & 0x1;
331 #endif
332 	cpu_has_cmov = (features >> 15) & 0x1;
333 }
334 
335 static sljit_u8 get_jump_code(sljit_s32 type)
336 {
337 	switch (type) {
338 	case SLJIT_EQUAL:
339 	case SLJIT_EQUAL_F64:
340 		return 0x84 /* je */;
341 
342 	case SLJIT_NOT_EQUAL:
343 	case SLJIT_NOT_EQUAL_F64:
344 		return 0x85 /* jne */;
345 
346 	case SLJIT_LESS:
347 	case SLJIT_LESS_F64:
348 		return 0x82 /* jc */;
349 
350 	case SLJIT_GREATER_EQUAL:
351 	case SLJIT_GREATER_EQUAL_F64:
352 		return 0x83 /* jae */;
353 
354 	case SLJIT_GREATER:
355 	case SLJIT_GREATER_F64:
356 		return 0x87 /* jnbe */;
357 
358 	case SLJIT_LESS_EQUAL:
359 	case SLJIT_LESS_EQUAL_F64:
360 		return 0x86 /* jbe */;
361 
362 	case SLJIT_SIG_LESS:
363 		return 0x8c /* jl */;
364 
365 	case SLJIT_SIG_GREATER_EQUAL:
366 		return 0x8d /* jnl */;
367 
368 	case SLJIT_SIG_GREATER:
369 		return 0x8f /* jnle */;
370 
371 	case SLJIT_SIG_LESS_EQUAL:
372 		return 0x8e /* jle */;
373 
374 	case SLJIT_OVERFLOW:
375 	case SLJIT_MUL_OVERFLOW:
376 		return 0x80 /* jo */;
377 
378 	case SLJIT_NOT_OVERFLOW:
379 	case SLJIT_MUL_NOT_OVERFLOW:
380 		return 0x81 /* jno */;
381 
382 	case SLJIT_UNORDERED_F64:
383 		return 0x8a /* jp */;
384 
385 	case SLJIT_ORDERED_F64:
386 		return 0x8b /* jpo */;
387 	}
388 	return 0;
389 }
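/* The values above are the low bytes of the two-byte near forms (0f 80-8f).
   The corresponding short forms (70-7f) are obtained by subtracting 0x10,
   which generate_near_jump_code() below relies on. */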
390 
391 static sljit_u8* generate_far_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_s32 type);
392 
393 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
394 static sljit_u8* generate_fixed_jump(sljit_u8 *code_ptr, sljit_sw addr, sljit_s32 type);
395 #endif
396 
397 static sljit_u8* generate_near_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_u8 *code, sljit_s32 type)
398 {
399 	sljit_s32 short_jump;
400 	sljit_uw label_addr;
401 
402 	if (jump->flags & JUMP_LABEL)
403 		label_addr = (sljit_uw)(code + jump->u.label->size);
404 	else
405 		label_addr = jump->u.target;
406 	short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;
407 
408 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
409 	if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
410 		return generate_far_jump_code(jump, code_ptr, type);
411 #endif
412 
413 	if (type == SLJIT_JUMP) {
414 		if (short_jump)
415 			*code_ptr++ = JMP_i8;
416 		else
417 			*code_ptr++ = JMP_i32;
418 		jump->addr++;
419 	}
420 	else if (type >= SLJIT_FAST_CALL) {
421 		short_jump = 0;
422 		*code_ptr++ = CALL_i32;
423 		jump->addr++;
424 	}
425 	else if (short_jump) {
426 		*code_ptr++ = get_jump_code(type) - 0x10;
427 		jump->addr++;
428 	}
429 	else {
430 		*code_ptr++ = GROUP_0F;
431 		*code_ptr++ = get_jump_code(type);
432 		jump->addr += 2;
433 	}
434 
435 	if (short_jump) {
436 		jump->flags |= PATCH_MB;
437 		code_ptr += sizeof(sljit_s8);
438 	} else {
439 		jump->flags |= PATCH_MW;
440 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
441 		code_ptr += sizeof(sljit_sw);
442 #else
443 		code_ptr += sizeof(sljit_s32);
444 #endif
445 	}
446 
447 	return code_ptr;
448 }
449 
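/* Record format of the intermediate buffer consumed below, as inferred from
   the loop: a non-zero length byte is followed by that many bytes of already
   generated machine code; a zero length byte is followed by a tag byte, where
   0 marks a label, 1 marks a constant, 2 and 3 mark fixed call/jump targets,
   and a value >= 4 encodes a jump of type (tag - 4). */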
450 SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
451 {
452 	struct sljit_memory_fragment *buf;
453 	sljit_u8 *code;
454 	sljit_u8 *code_ptr;
455 	sljit_u8 *buf_ptr;
456 	sljit_u8 *buf_end;
457 	sljit_u8 len;
458 
459 	struct sljit_label *label;
460 	struct sljit_jump *jump;
461 	struct sljit_const *const_;
462 
463 	CHECK_ERROR_PTR();
464 	CHECK_PTR(check_sljit_generate_code(compiler));
465 	reverse_buf(compiler);
466 
467 	/* Second code generation pass. */
468 	code = (sljit_u8*)SLJIT_MALLOC_EXEC(compiler->size);
469 	PTR_FAIL_WITH_EXEC_IF(code);
470 	buf = compiler->buf;
471 
472 	code_ptr = code;
473 	label = compiler->labels;
474 	jump = compiler->jumps;
475 	const_ = compiler->consts;
476 	do {
477 		buf_ptr = buf->memory;
478 		buf_end = buf_ptr + buf->used_size;
479 		do {
480 			len = *buf_ptr++;
481 			if (len > 0) {
482 				/* The code is already generated. */
483 				SLJIT_MEMMOVE(code_ptr, buf_ptr, len);
484 				code_ptr += len;
485 				buf_ptr += len;
486 			}
487 			else {
488 				if (*buf_ptr >= 4) {
489 					jump->addr = (sljit_uw)code_ptr;
490 					if (!(jump->flags & SLJIT_REWRITABLE_JUMP))
491 						code_ptr = generate_near_jump_code(jump, code_ptr, code, *buf_ptr - 4);
492 					else
493 						code_ptr = generate_far_jump_code(jump, code_ptr, *buf_ptr - 4);
494 					jump = jump->next;
495 				}
496 				else if (*buf_ptr == 0) {
497 					label->addr = (sljit_uw)code_ptr;
498 					label->size = code_ptr - code;
499 					label = label->next;
500 				}
501 				else if (*buf_ptr == 1) {
502 					const_->addr = ((sljit_uw)code_ptr) - sizeof(sljit_sw);
503 					const_ = const_->next;
504 				}
505 				else {
506 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
507 					*code_ptr++ = (*buf_ptr == 2) ? CALL_i32 : JMP_i32;
508 					buf_ptr++;
509 					*(sljit_sw*)code_ptr = *(sljit_sw*)buf_ptr - ((sljit_sw)code_ptr + sizeof(sljit_sw));
510 					code_ptr += sizeof(sljit_sw);
511 					buf_ptr += sizeof(sljit_sw) - 1;
512 #else
513 					code_ptr = generate_fixed_jump(code_ptr, *(sljit_sw*)(buf_ptr + 1), *buf_ptr);
514 					buf_ptr += sizeof(sljit_sw);
515 #endif
516 				}
517 				buf_ptr++;
518 			}
519 		} while (buf_ptr < buf_end);
520 		SLJIT_ASSERT(buf_ptr == buf_end);
521 		buf = buf->next;
522 	} while (buf);
523 
524 	SLJIT_ASSERT(!label);
525 	SLJIT_ASSERT(!jump);
526 	SLJIT_ASSERT(!const_);
527 
528 	jump = compiler->jumps;
529 	while (jump) {
530 		if (jump->flags & PATCH_MB) {
531 			SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_s8))) >= -128 && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_s8))) <= 127);
532 			*(sljit_u8*)jump->addr = (sljit_u8)(jump->u.label->addr - (jump->addr + sizeof(sljit_s8)));
533 		} else if (jump->flags & PATCH_MW) {
534 			if (jump->flags & JUMP_LABEL) {
535 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
536 				*(sljit_sw*)jump->addr = (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sw)));
537 #else
538 				SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
539 				*(sljit_s32*)jump->addr = (sljit_s32)(jump->u.label->addr - (jump->addr + sizeof(sljit_s32)));
540 #endif
541 			}
542 			else {
543 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
544 				*(sljit_sw*)jump->addr = (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_sw)));
545 #else
546 				SLJIT_ASSERT((sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
547 				*(sljit_s32*)jump->addr = (sljit_s32)(jump->u.target - (jump->addr + sizeof(sljit_s32)));
548 #endif
549 			}
550 		}
551 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
552 		else if (jump->flags & PATCH_MD)
553 			*(sljit_sw*)jump->addr = jump->u.label->addr;
554 #endif
555 
556 		jump = jump->next;
557 	}
558 
559 	/* We may waste some space because of short jumps. */
560 	SLJIT_ASSERT(code_ptr <= code + compiler->size);
561 	compiler->error = SLJIT_ERR_COMPILED;
562 	compiler->executable_size = code_ptr - code;
563 	return (void*)code;
564 }
565 
566 /* --------------------------------------------------------------------- */
567 /*  Operators                                                            */
568 /* --------------------------------------------------------------------- */
569 
570 static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
571 	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
572 	sljit_s32 dst, sljit_sw dstw,
573 	sljit_s32 src1, sljit_sw src1w,
574 	sljit_s32 src2, sljit_sw src2w);
575 
576 static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
577 	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
578 	sljit_s32 dst, sljit_sw dstw,
579 	sljit_s32 src1, sljit_sw src1w,
580 	sljit_s32 src2, sljit_sw src2w);
581 
582 static sljit_s32 emit_mov(struct sljit_compiler *compiler,
583 	sljit_s32 dst, sljit_sw dstw,
584 	sljit_s32 src, sljit_sw srcw);
585 
586 static SLJIT_INLINE sljit_s32 emit_save_flags(struct sljit_compiler *compiler)
587 {
588 	sljit_u8 *inst;
589 
590 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
591 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 5);
592 	FAIL_IF(!inst);
593 	INC_SIZE(5);
594 #else
595 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 6);
596 	FAIL_IF(!inst);
597 	INC_SIZE(6);
598 	*inst++ = REX_W;
599 #endif
600 	*inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp + sizeof(sljit_sw)] */
601 	*inst++ = 0x64;
602 	*inst++ = 0x24;
603 	*inst++ = (sljit_u8)sizeof(sljit_sw);
604 	*inst++ = PUSHF;
605 	compiler->flags_saved = 1;
606 	return SLJIT_SUCCESS;
607 }
608 
609 static SLJIT_INLINE sljit_s32 emit_restore_flags(struct sljit_compiler *compiler, sljit_s32 keep_flags)
610 {
611 	sljit_u8 *inst;
612 
613 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
614 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 5);
615 	FAIL_IF(!inst);
616 	INC_SIZE(5);
617 	*inst++ = POPF;
618 #else
619 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 6);
620 	FAIL_IF(!inst);
621 	INC_SIZE(6);
622 	*inst++ = POPF;
623 	*inst++ = REX_W;
624 #endif
625 	*inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp - sizeof(sljit_sw)] */
626 	*inst++ = 0x64;
627 	*inst++ = 0x24;
628 	*inst++ = (sljit_u8)(-(sljit_s8)sizeof(sljit_sw));
629 	compiler->flags_saved = keep_flags;
630 	return SLJIT_SUCCESS;
631 }
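/* Both helpers above adjust the stack pointer with LEA rather than ADD/SUB:
   LEA does not modify the status flags, which matters here because the flags
   are exactly what is being saved and restored around it. */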
632 
633 #ifdef _WIN32
634 #include <malloc.h>
635 
636 static void SLJIT_CALL sljit_grow_stack(sljit_sw local_size)
637 {
638 	/* Workaround for calling the internal _chkstk() function on Windows.
639 	This function touches all 4K pages belonging to the requested stack space,
640 	whose size is passed in local_size. This is necessary on Windows, where
641 	the stack can only grow in 4K steps. However, the function just burns
642 	CPU cycles if the stack is already large enough; since that cannot be
643 	known in advance, it must always be called. I think this is a bad design
644 	in general, even if it has its reasons. */
645 	*(volatile sljit_s32*)alloca(local_size) = 0;
646 }
647 
648 #endif
649 
650 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
651 #include "sljitNativeX86_32.c"
652 #else
653 #include "sljitNativeX86_64.c"
654 #endif
655 
656 static sljit_s32 emit_mov(struct sljit_compiler *compiler,
657 	sljit_s32 dst, sljit_sw dstw,
658 	sljit_s32 src, sljit_sw srcw)
659 {
660 	sljit_u8* inst;
661 
662 	if (dst == SLJIT_UNUSED) {
663 		/* No destination; no need to set up flags. */
664 		if (src & SLJIT_MEM) {
665 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
666 			FAIL_IF(!inst);
667 			*inst = MOV_r_rm;
668 		}
669 		return SLJIT_SUCCESS;
670 	}
671 	if (FAST_IS_REG(src)) {
672 		inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
673 		FAIL_IF(!inst);
674 		*inst = MOV_rm_r;
675 		return SLJIT_SUCCESS;
676 	}
677 	if (src & SLJIT_IMM) {
678 		if (FAST_IS_REG(dst)) {
679 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
680 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
681 #else
682 			if (!compiler->mode32) {
683 				if (NOT_HALFWORD(srcw))
684 					return emit_load_imm64(compiler, dst, srcw);
685 			}
686 			else
687 				return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, MOV_r_i32 + reg_lmap[dst], srcw);
688 #endif
689 		}
690 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
691 		if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
692 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, srcw));
693 			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, dst, dstw);
694 			FAIL_IF(!inst);
695 			*inst = MOV_rm_r;
696 			return SLJIT_SUCCESS;
697 		}
698 #endif
699 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
700 		FAIL_IF(!inst);
701 		*inst = MOV_rm_i32;
702 		return SLJIT_SUCCESS;
703 	}
704 	if (FAST_IS_REG(dst)) {
705 		inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
706 		FAIL_IF(!inst);
707 		*inst = MOV_r_rm;
708 		return SLJIT_SUCCESS;
709 	}
710 
711 	/* Memory to memory move. Requires two instructions. */
712 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
713 	FAIL_IF(!inst);
714 	*inst = MOV_r_rm;
715 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
716 	FAIL_IF(!inst);
717 	*inst = MOV_rm_r;
718 	return SLJIT_SUCCESS;
719 }
720 
721 #define EMIT_MOV(compiler, dst, dstw, src, srcw) \
722 	FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
723 
724 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
725 {
726 	sljit_u8 *inst;
727 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
728 	sljit_s32 size;
729 #endif
730 
731 	CHECK_ERROR();
732 	CHECK(check_sljit_emit_op0(compiler, op));
733 
734 	switch (GET_OPCODE(op)) {
735 	case SLJIT_BREAKPOINT:
736 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
737 		FAIL_IF(!inst);
738 		INC_SIZE(1);
739 		*inst = INT3;
740 		break;
741 	case SLJIT_NOP:
742 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
743 		FAIL_IF(!inst);
744 		INC_SIZE(1);
745 		*inst = NOP;
746 		break;
747 	case SLJIT_LMUL_UW:
748 	case SLJIT_LMUL_SW:
749 	case SLJIT_DIVMOD_UW:
750 	case SLJIT_DIVMOD_SW:
751 	case SLJIT_DIV_UW:
752 	case SLJIT_DIV_SW:
753 		compiler->flags_saved = 0;
754 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
755 #ifdef _WIN64
756 		SLJIT_COMPILE_ASSERT(
757 			reg_map[SLJIT_R0] == 0
758 			&& reg_map[SLJIT_R1] == 2
759 			&& reg_map[TMP_REG1] > 7,
760 			invalid_register_assignment_for_div_mul);
761 #else
762 		SLJIT_COMPILE_ASSERT(
763 			reg_map[SLJIT_R0] == 0
764 			&& reg_map[SLJIT_R1] < 7
765 			&& reg_map[TMP_REG1] == 2,
766 			invalid_register_assignment_for_div_mul);
767 #endif
768 		compiler->mode32 = op & SLJIT_I32_OP;
769 #endif
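		/* Background note: the one-operand GROUP_F7 mul/div forms implicitly
		   use (r/e)dx:(r/e)ax, which is why the compile-time asserts above
		   (in the 64-bit build) pin SLJIT_R0 to (r/e)ax and constrain how
		   SLJIT_R1 and TMP_REG1 may map onto (r/e)dx. */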
770 		SLJIT_COMPILE_ASSERT((SLJIT_DIVMOD_UW & 0x2) == 0 && SLJIT_DIV_UW - 0x2 == SLJIT_DIVMOD_UW, bad_div_opcode_assignments);
771 
772 		op = GET_OPCODE(op);
773 		if ((op | 0x2) == SLJIT_DIV_UW) {
774 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
775 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
776 			inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0);
777 #else
778 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
779 #endif
780 			FAIL_IF(!inst);
781 			*inst = XOR_r_rm;
782 		}
783 
784 		if ((op | 0x2) == SLJIT_DIV_SW) {
785 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
786 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
787 #endif
788 
789 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
790 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
791 			FAIL_IF(!inst);
792 			INC_SIZE(1);
793 			*inst = CDQ;
794 #else
795 			if (compiler->mode32) {
796 				inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
797 				FAIL_IF(!inst);
798 				INC_SIZE(1);
799 				*inst = CDQ;
800 			} else {
801 				inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
802 				FAIL_IF(!inst);
803 				INC_SIZE(2);
804 				*inst++ = REX_W;
805 				*inst = CDQ;
806 			}
807 #endif
808 		}
809 
810 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
811 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
812 		FAIL_IF(!inst);
813 		INC_SIZE(2);
814 		*inst++ = GROUP_F7;
815 		*inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
816 #else
817 #ifdef _WIN64
818 		size = (!compiler->mode32 || op >= SLJIT_DIVMOD_UW) ? 3 : 2;
819 #else
820 		size = (!compiler->mode32) ? 3 : 2;
821 #endif
822 		inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
823 		FAIL_IF(!inst);
824 		INC_SIZE(size);
825 #ifdef _WIN64
826 		if (!compiler->mode32)
827 			*inst++ = REX_W | ((op >= SLJIT_DIVMOD_UW) ? REX_B : 0);
828 		else if (op >= SLJIT_DIVMOD_UW)
829 			*inst++ = REX_B;
830 		*inst++ = GROUP_F7;
831 		*inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
832 #else
833 		if (!compiler->mode32)
834 			*inst++ = REX_W;
835 		*inst++ = GROUP_F7;
836 		*inst = MOD_REG | reg_map[SLJIT_R1];
837 #endif
838 #endif
839 		switch (op) {
840 		case SLJIT_LMUL_UW:
841 			*inst |= MUL;
842 			break;
843 		case SLJIT_LMUL_SW:
844 			*inst |= IMUL;
845 			break;
846 		case SLJIT_DIVMOD_UW:
847 		case SLJIT_DIV_UW:
848 			*inst |= DIV;
849 			break;
850 		case SLJIT_DIVMOD_SW:
851 		case SLJIT_DIV_SW:
852 			*inst |= IDIV;
853 			break;
854 		}
855 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
856 		if (op <= SLJIT_DIVMOD_SW)
857 			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
858 #else
859 		if (op >= SLJIT_DIV_UW)
860 			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
861 #endif
862 		break;
863 	}
864 
865 	return SLJIT_SUCCESS;
866 }
867 
868 #define ENCODE_PREFIX(prefix) \
869 	do { \
870 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1); \
871 		FAIL_IF(!inst); \
872 		INC_SIZE(1); \
873 		*inst = (prefix); \
874 	} while (0)
875 
876 static sljit_s32 emit_mov_byte(struct sljit_compiler *compiler, sljit_s32 sign,
877 	sljit_s32 dst, sljit_sw dstw,
878 	sljit_s32 src, sljit_sw srcw)
879 {
880 	sljit_u8* inst;
881 	sljit_s32 dst_r;
882 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
883 	sljit_s32 work_r;
884 #endif
885 
886 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
887 	compiler->mode32 = 0;
888 #endif
889 
890 	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
891 		return SLJIT_SUCCESS; /* Empty instruction. */
892 
893 	if (src & SLJIT_IMM) {
894 		if (FAST_IS_REG(dst)) {
895 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
896 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
897 #else
898 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
899 			FAIL_IF(!inst);
900 			*inst = MOV_rm_i32;
901 			return SLJIT_SUCCESS;
902 #endif
903 		}
904 		inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
905 		FAIL_IF(!inst);
906 		*inst = MOV_rm8_i8;
907 		return SLJIT_SUCCESS;
908 	}
909 
910 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
911 
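	/* On x86-32, only registers encoded as 0-3 (eax, ecx, edx, ebx) have
	   byte-sized forms; encodings 4-7 would select ah/ch/dh/bh instead.
	   Hence the reg_map[...] >= 4 checks below. */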
912 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
913 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
914 		if (reg_map[src] >= 4) {
915 			SLJIT_ASSERT(dst_r == TMP_REG1);
916 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
917 		} else
918 			dst_r = src;
919 #else
920 		dst_r = src;
921 #endif
922 	}
923 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
924 	else if (FAST_IS_REG(src) && reg_map[src] >= 4) {
925 		/* src, dst are registers. */
926 		SLJIT_ASSERT(SLOW_IS_REG(dst));
927 		if (reg_map[dst] < 4) {
928 			if (dst != src)
929 				EMIT_MOV(compiler, dst, 0, src, 0);
930 			inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
931 			FAIL_IF(!inst);
932 			*inst++ = GROUP_0F;
933 			*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
934 		}
935 		else {
936 			if (dst != src)
937 				EMIT_MOV(compiler, dst, 0, src, 0);
938 			if (sign) {
939 				/* shl reg, 24 */
940 				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
941 				FAIL_IF(!inst);
942 				*inst |= SHL;
943 				/* sar reg, 24 */
944 				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
945 				FAIL_IF(!inst);
946 				*inst |= SAR;
947 			}
948 			else {
949 				inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
950 				FAIL_IF(!inst);
951 				*(inst + 1) |= AND;
952 			}
953 		}
954 		return SLJIT_SUCCESS;
955 	}
956 #endif
957 	else {
958 		/* src is either a memory address or, on x86-32, a register with reg_map[src] < 4. */
959 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
960 		FAIL_IF(!inst);
961 		*inst++ = GROUP_0F;
962 		*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
963 	}
964 
965 	if (dst & SLJIT_MEM) {
966 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
967 		if (dst_r == TMP_REG1) {
968 			/* Find an unused register whose reg_map value is < 4. */
969 			if ((dst & REG_MASK) == SLJIT_R0) {
970 				if ((dst & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_R1))
971 					work_r = SLJIT_R2;
972 				else
973 					work_r = SLJIT_R1;
974 			}
975 			else {
976 				if ((dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
977 					work_r = SLJIT_R0;
978 				else if ((dst & REG_MASK) == SLJIT_R1)
979 					work_r = SLJIT_R2;
980 				else
981 					work_r = SLJIT_R1;
982 			}
983 
984 			if (work_r == SLJIT_R0) {
985 				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
986 			}
987 			else {
988 				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
989 				FAIL_IF(!inst);
990 				*inst = XCHG_r_rm;
991 			}
992 
993 			inst = emit_x86_instruction(compiler, 1, work_r, 0, dst, dstw);
994 			FAIL_IF(!inst);
995 			*inst = MOV_rm8_r8;
996 
997 			if (work_r == SLJIT_R0) {
998 				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
999 			}
1000 			else {
1001 				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
1002 				FAIL_IF(!inst);
1003 				*inst = XCHG_r_rm;
1004 			}
1005 		}
1006 		else {
1007 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
1008 			FAIL_IF(!inst);
1009 			*inst = MOV_rm8_r8;
1010 		}
1011 #else
1012 		inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
1013 		FAIL_IF(!inst);
1014 		*inst = MOV_rm8_r8;
1015 #endif
1016 	}
1017 
1018 	return SLJIT_SUCCESS;
1019 }
1020 
1021 static sljit_s32 emit_mov_half(struct sljit_compiler *compiler, sljit_s32 sign,
1022 	sljit_s32 dst, sljit_sw dstw,
1023 	sljit_s32 src, sljit_sw srcw)
1024 {
1025 	sljit_u8* inst;
1026 	sljit_s32 dst_r;
1027 
1028 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1029 	compiler->mode32 = 0;
1030 #endif
1031 
1032 	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
1033 		return SLJIT_SUCCESS; /* Empty instruction. */
1034 
1035 	if (src & SLJIT_IMM) {
1036 		if (FAST_IS_REG(dst)) {
1037 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1038 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
1039 #else
1040 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
1041 			FAIL_IF(!inst);
1042 			*inst = MOV_rm_i32;
1043 			return SLJIT_SUCCESS;
1044 #endif
1045 		}
1046 		inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
1047 		FAIL_IF(!inst);
1048 		*inst = MOV_rm_i32;
1049 		return SLJIT_SUCCESS;
1050 	}
1051 
1052 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1053 
1054 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
1055 		dst_r = src;
1056 	else {
1057 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
1058 		FAIL_IF(!inst);
1059 		*inst++ = GROUP_0F;
1060 		*inst = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
1061 	}
1062 
1063 	if (dst & SLJIT_MEM) {
1064 		inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
1065 		FAIL_IF(!inst);
1066 		*inst = MOV_rm_r;
1067 	}
1068 
1069 	return SLJIT_SUCCESS;
1070 }
1071 
1072 static sljit_s32 emit_unary(struct sljit_compiler *compiler, sljit_u8 opcode,
1073 	sljit_s32 dst, sljit_sw dstw,
1074 	sljit_s32 src, sljit_sw srcw)
1075 {
1076 	sljit_u8* inst;
1077 
1078 	if (dst == SLJIT_UNUSED) {
1079 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1080 		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1081 		FAIL_IF(!inst);
1082 		*inst++ = GROUP_F7;
1083 		*inst |= opcode;
1084 		return SLJIT_SUCCESS;
1085 	}
1086 	if (dst == src && dstw == srcw) {
1087 		/* Same input and output */
1088 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1089 		FAIL_IF(!inst);
1090 		*inst++ = GROUP_F7;
1091 		*inst |= opcode;
1092 		return SLJIT_SUCCESS;
1093 	}
1094 	if (FAST_IS_REG(dst)) {
1095 		EMIT_MOV(compiler, dst, 0, src, srcw);
1096 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1097 		FAIL_IF(!inst);
1098 		*inst++ = GROUP_F7;
1099 		*inst |= opcode;
1100 		return SLJIT_SUCCESS;
1101 	}
1102 	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1103 	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1104 	FAIL_IF(!inst);
1105 	*inst++ = GROUP_F7;
1106 	*inst |= opcode;
1107 	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1108 	return SLJIT_SUCCESS;
1109 }
1110 
1111 static sljit_s32 emit_not_with_flags(struct sljit_compiler *compiler,
1112 	sljit_s32 dst, sljit_sw dstw,
1113 	sljit_s32 src, sljit_sw srcw)
1114 {
1115 	sljit_u8* inst;
1116 
1117 	if (dst == SLJIT_UNUSED) {
1118 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1119 		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1120 		FAIL_IF(!inst);
1121 		*inst++ = GROUP_F7;
1122 		*inst |= NOT_rm;
1123 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
1124 		FAIL_IF(!inst);
1125 		*inst = OR_r_rm;
1126 		return SLJIT_SUCCESS;
1127 	}
1128 	if (FAST_IS_REG(dst)) {
1129 		EMIT_MOV(compiler, dst, 0, src, srcw);
1130 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1131 		FAIL_IF(!inst);
1132 		*inst++ = GROUP_F7;
1133 		*inst |= NOT_rm;
1134 		inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
1135 		FAIL_IF(!inst);
1136 		*inst = OR_r_rm;
1137 		return SLJIT_SUCCESS;
1138 	}
1139 	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1140 	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1141 	FAIL_IF(!inst);
1142 	*inst++ = GROUP_F7;
1143 	*inst |= NOT_rm;
1144 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
1145 	FAIL_IF(!inst);
1146 	*inst = OR_r_rm;
1147 	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1148 	return SLJIT_SUCCESS;
1149 }
1150 
1151 static sljit_s32 emit_clz(struct sljit_compiler *compiler, sljit_s32 op_flags,
1152 	sljit_s32 dst, sljit_sw dstw,
1153 	sljit_s32 src, sljit_sw srcw)
1154 {
1155 	sljit_u8* inst;
1156 	sljit_s32 dst_r;
1157 
1158 	SLJIT_UNUSED_ARG(op_flags);
1159 	if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
1160 		/* Just set the zero flag. */
1161 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1162 		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1163 		FAIL_IF(!inst);
1164 		*inst++ = GROUP_F7;
1165 		*inst |= NOT_rm;
1166 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1167 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 31, TMP_REG1, 0);
1168 #else
1169 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 63 : 31, TMP_REG1, 0);
1170 #endif
1171 		FAIL_IF(!inst);
1172 		*inst |= SHR;
1173 		return SLJIT_SUCCESS;
1174 	}
1175 
1176 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
1177 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
1178 		src = TMP_REG1;
1179 		srcw = 0;
1180 	}
1181 
1182 	inst = emit_x86_instruction(compiler, 2, TMP_REG1, 0, src, srcw);
1183 	FAIL_IF(!inst);
1184 	*inst++ = GROUP_0F;
1185 	*inst = BSR_r_rm;
1186 
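	/* A sketch of the trick used here: for non-zero x, clz(x) == bsr(x) ^ 31
	   (^ 63 in 64-bit mode). The destination is preloaded with 32 + 31
	   (resp. 64 + 63) and only overwritten via cmovne (or the conditional
	   move emulation) when the source was non-zero, so the final xor still
	   yields 32 (resp. 64) for a zero input. */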
1187 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1188 	if (FAST_IS_REG(dst))
1189 		dst_r = dst;
1190 	else {
1191 		/* Find an unused temporary register. */
1192 		if ((dst & REG_MASK) != SLJIT_R0 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
1193 			dst_r = SLJIT_R0;
1194 		else if ((dst & REG_MASK) != SLJIT_R1 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R1))
1195 			dst_r = SLJIT_R1;
1196 		else
1197 			dst_r = SLJIT_R2;
1198 		EMIT_MOV(compiler, dst, dstw, dst_r, 0);
1199 	}
1200 	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, 32 + 31);
1201 #else
1202 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
1203 	compiler->mode32 = 0;
1204 	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 64 + 63 : 32 + 31);
1205 	compiler->mode32 = op_flags & SLJIT_I32_OP;
1206 #endif
1207 
1208 	if (cpu_has_cmov == -1)
1209 		get_cpu_features();
1210 
1211 	if (cpu_has_cmov) {
1212 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
1213 		FAIL_IF(!inst);
1214 		*inst++ = GROUP_0F;
1215 		*inst = CMOVNE_r_rm;
1216 	} else {
1217 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1218 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1219 		FAIL_IF(!inst);
1220 		INC_SIZE(4);
1221 
1222 		*inst++ = JE_i8;
1223 		*inst++ = 2;
1224 		*inst++ = MOV_r_rm;
1225 		*inst++ = MOD_REG | (reg_map[dst_r] << 3) | reg_map[TMP_REG1];
1226 #else
1227 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 5);
1228 		FAIL_IF(!inst);
1229 		INC_SIZE(5);
1230 
1231 		*inst++ = JE_i8;
1232 		*inst++ = 3;
1233 		*inst++ = REX_W | (reg_map[dst_r] >= 8 ? REX_R : 0) | (reg_map[TMP_REG1] >= 8 ? REX_B : 0);
1234 		*inst++ = MOV_r_rm;
1235 		*inst++ = MOD_REG | (reg_lmap[dst_r] << 3) | reg_lmap[TMP_REG1];
1236 #endif
1237 	}
1238 
1239 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1240 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
1241 #else
1242 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 63 : 31, dst_r, 0);
1243 #endif
1244 	FAIL_IF(!inst);
1245 	*(inst + 1) |= XOR;
1246 
1247 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1248 	if (dst & SLJIT_MEM) {
1249 		inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
1250 		FAIL_IF(!inst);
1251 		*inst = XCHG_r_rm;
1252 	}
1253 #else
1254 	if (dst & SLJIT_MEM)
1255 		EMIT_MOV(compiler, dst, dstw, TMP_REG2, 0);
1256 #endif
1257 	return SLJIT_SUCCESS;
1258 }
1259 
1260 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
1261 	sljit_s32 dst, sljit_sw dstw,
1262 	sljit_s32 src, sljit_sw srcw)
1263 {
1264 	sljit_u8* inst;
1265 	sljit_s32 update = 0;
1266 	sljit_s32 op_flags = GET_ALL_FLAGS(op);
1267 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1268 	sljit_s32 dst_is_ereg = 0;
1269 	sljit_s32 src_is_ereg = 0;
1270 #else
1271 #	define src_is_ereg 0
1272 #endif
1273 
1274 	CHECK_ERROR();
1275 	CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
1276 	ADJUST_LOCAL_OFFSET(dst, dstw);
1277 	ADJUST_LOCAL_OFFSET(src, srcw);
1278 
1279 	CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
1280 	CHECK_EXTRA_REGS(src, srcw, src_is_ereg = 1);
1281 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1282 	compiler->mode32 = op_flags & SLJIT_I32_OP;
1283 #endif
1284 
1285 	op = GET_OPCODE(op);
1286 	if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
1287 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1288 		compiler->mode32 = 0;
1289 #endif
1290 
1291 		if (op_flags & SLJIT_I32_OP) {
1292 			if (FAST_IS_REG(src) && src == dst) {
1293 				if (!TYPE_CAST_NEEDED(op))
1294 					return SLJIT_SUCCESS;
1295 			}
1296 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1297 			if (op == SLJIT_MOV_S32 && (src & SLJIT_MEM))
1298 				op = SLJIT_MOV_U32;
1299 			if (op == SLJIT_MOVU_S32 && (src & SLJIT_MEM))
1300 				op = SLJIT_MOVU_U32;
1301 			if (op == SLJIT_MOV_U32 && (src & SLJIT_IMM))
1302 				op = SLJIT_MOV_S32;
1303 			if (op == SLJIT_MOVU_U32 && (src & SLJIT_IMM))
1304 				op = SLJIT_MOVU_S32;
1305 #endif
1306 		}
1307 
1308 		SLJIT_COMPILE_ASSERT(SLJIT_MOV + 8 == SLJIT_MOVU, movu_offset);
1309 		if (op >= SLJIT_MOVU) {
1310 			update = 1;
1311 			op -= 8;
1312 		}
1313 
1314 		if (src & SLJIT_IMM) {
1315 			switch (op) {
1316 			case SLJIT_MOV_U8:
1317 				srcw = (sljit_u8)srcw;
1318 				break;
1319 			case SLJIT_MOV_S8:
1320 				srcw = (sljit_s8)srcw;
1321 				break;
1322 			case SLJIT_MOV_U16:
1323 				srcw = (sljit_u16)srcw;
1324 				break;
1325 			case SLJIT_MOV_S16:
1326 				srcw = (sljit_s16)srcw;
1327 				break;
1328 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1329 			case SLJIT_MOV_U32:
1330 				srcw = (sljit_u32)srcw;
1331 				break;
1332 			case SLJIT_MOV_S32:
1333 				srcw = (sljit_s32)srcw;
1334 				break;
1335 #endif
1336 			}
1337 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1338 			if (SLJIT_UNLIKELY(dst_is_ereg))
1339 				return emit_mov(compiler, dst, dstw, src, srcw);
1340 #endif
1341 		}
1342 
1343 		if (SLJIT_UNLIKELY(update) && (src & SLJIT_MEM) && !src_is_ereg && (src & REG_MASK) && (srcw != 0 || (src & OFFS_REG_MASK) != 0)) {
1344 			inst = emit_x86_instruction(compiler, 1, src & REG_MASK, 0, src, srcw);
1345 			FAIL_IF(!inst);
1346 			*inst = LEA_r_m;
1347 			src &= SLJIT_MEM | 0xf;
1348 			srcw = 0;
1349 		}
1350 
1351 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1352 		if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_U32 || op == SLJIT_MOV_S32 || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
1353 			SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_SP));
1354 			dst = TMP_REG1;
1355 		}
1356 #endif
1357 
1358 		switch (op) {
1359 		case SLJIT_MOV:
1360 		case SLJIT_MOV_P:
1361 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1362 		case SLJIT_MOV_U32:
1363 		case SLJIT_MOV_S32:
1364 #endif
1365 			FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
1366 			break;
1367 		case SLJIT_MOV_U8:
1368 			FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
1369 			break;
1370 		case SLJIT_MOV_S8:
1371 			FAIL_IF(emit_mov_byte(compiler, 1, dst, dstw, src, srcw));
1372 			break;
1373 		case SLJIT_MOV_U16:
1374 			FAIL_IF(emit_mov_half(compiler, 0, dst, dstw, src, srcw));
1375 			break;
1376 		case SLJIT_MOV_S16:
1377 			FAIL_IF(emit_mov_half(compiler, 1, dst, dstw, src, srcw));
1378 			break;
1379 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1380 		case SLJIT_MOV_U32:
1381 			FAIL_IF(emit_mov_int(compiler, 0, dst, dstw, src, srcw));
1382 			break;
1383 		case SLJIT_MOV_S32:
1384 			FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
1385 			break;
1386 #endif
1387 		}
1388 
1389 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1390 		if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
1391 			return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
1392 #endif
1393 
1394 		if (SLJIT_UNLIKELY(update) && (dst & SLJIT_MEM) && (dst & REG_MASK) && (dstw != 0 || (dst & OFFS_REG_MASK) != 0)) {
1395 			inst = emit_x86_instruction(compiler, 1, dst & REG_MASK, 0, dst, dstw);
1396 			FAIL_IF(!inst);
1397 			*inst = LEA_r_m;
1398 		}
1399 		return SLJIT_SUCCESS;
1400 	}
1401 
1402 	if (SLJIT_UNLIKELY(GET_FLAGS(op_flags)))
1403 		compiler->flags_saved = 0;
1404 
1405 	switch (op) {
1406 	case SLJIT_NOT:
1407 		if (SLJIT_UNLIKELY(op_flags & SLJIT_SET_E))
1408 			return emit_not_with_flags(compiler, dst, dstw, src, srcw);
1409 		return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);
1410 
1411 	case SLJIT_NEG:
1412 		if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
1413 			FAIL_IF(emit_save_flags(compiler));
1414 		return emit_unary(compiler, NEG_rm, dst, dstw, src, srcw);
1415 
1416 	case SLJIT_CLZ:
1417 		if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
1418 			FAIL_IF(emit_save_flags(compiler));
1419 		return emit_clz(compiler, op_flags, dst, dstw, src, srcw);
1420 	}
1421 
1422 	return SLJIT_SUCCESS;
1423 
1424 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1425 #	undef src_is_ereg
1426 #endif
1427 }
1428 
1429 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1430 
1431 #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
1432 	if (IS_HALFWORD(immw) || compiler->mode32) { \
1433 		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
1434 		FAIL_IF(!inst); \
1435 		*(inst + 1) |= (op_imm); \
1436 	} \
1437 	else { \
1438 		FAIL_IF(emit_load_imm64(compiler, TMP_REG2, immw)); \
1439 		inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, arg, argw); \
1440 		FAIL_IF(!inst); \
1441 		*inst = (op_mr); \
1442 	}
1443 
1444 #define BINARY_EAX_IMM(op_eax_imm, immw) \
1445 	FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))
1446 
1447 #else
1448 
1449 #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
1450 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
1451 	FAIL_IF(!inst); \
1452 	*(inst + 1) |= (op_imm);
1453 
1454 #define BINARY_EAX_IMM(op_eax_imm, immw) \
1455 	FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))
1456 
1457 #endif
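/* A note on the _EAX_ variants: the accumulator forms (e.g. ADD_EAX_i32,
   opcode 0x05) are one byte shorter than the generic GROUP_BINARY_81
   encoding, so the emitters below prefer them when the destination is
   SLJIT_R0 and the immediate does not fit the even shorter 8-bit
   GROUP_BINARY_83 form. */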
1458 
1459 static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
1460 	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
1461 	sljit_s32 dst, sljit_sw dstw,
1462 	sljit_s32 src1, sljit_sw src1w,
1463 	sljit_s32 src2, sljit_sw src2w)
1464 {
1465 	sljit_u8* inst;
1466 
1467 	if (dst == SLJIT_UNUSED) {
1468 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1469 		if (src2 & SLJIT_IMM) {
1470 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1471 		}
1472 		else {
1473 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1474 			FAIL_IF(!inst);
1475 			*inst = op_rm;
1476 		}
1477 		return SLJIT_SUCCESS;
1478 	}
1479 
1480 	if (dst == src1 && dstw == src1w) {
1481 		if (src2 & SLJIT_IMM) {
1482 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1483 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1484 #else
1485 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1486 #endif
1487 				BINARY_EAX_IMM(op_eax_imm, src2w);
1488 			}
1489 			else {
1490 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1491 			}
1492 		}
1493 		else if (FAST_IS_REG(dst)) {
1494 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
1495 			FAIL_IF(!inst);
1496 			*inst = op_rm;
1497 		}
1498 		else if (FAST_IS_REG(src2)) {
1499 			/* Special exception for sljit_emit_op_flags. */
1500 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
1501 			FAIL_IF(!inst);
1502 			*inst = op_mr;
1503 		}
1504 		else {
1505 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
1506 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1507 			FAIL_IF(!inst);
1508 			*inst = op_mr;
1509 		}
1510 		return SLJIT_SUCCESS;
1511 	}
1512 
1513 	/* Only for cumulative operations. */
1514 	if (dst == src2 && dstw == src2w) {
1515 		if (src1 & SLJIT_IMM) {
1516 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1517 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1518 #else
1519 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128)) {
1520 #endif
1521 				BINARY_EAX_IMM(op_eax_imm, src1w);
1522 			}
1523 			else {
1524 				BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
1525 			}
1526 		}
1527 		else if (FAST_IS_REG(dst)) {
1528 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
1529 			FAIL_IF(!inst);
1530 			*inst = op_rm;
1531 		}
1532 		else if (FAST_IS_REG(src1)) {
1533 			inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
1534 			FAIL_IF(!inst);
1535 			*inst = op_mr;
1536 		}
1537 		else {
1538 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1539 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1540 			FAIL_IF(!inst);
1541 			*inst = op_mr;
1542 		}
1543 		return SLJIT_SUCCESS;
1544 	}
1545 
1546 	/* General version. */
1547 	if (FAST_IS_REG(dst)) {
1548 		EMIT_MOV(compiler, dst, 0, src1, src1w);
1549 		if (src2 & SLJIT_IMM) {
1550 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
1551 		}
1552 		else {
1553 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
1554 			FAIL_IF(!inst);
1555 			*inst = op_rm;
1556 		}
1557 	}
1558 	else {
1559 		/* This version requires fewer memory writes. */
1560 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1561 		if (src2 & SLJIT_IMM) {
1562 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1563 		}
1564 		else {
1565 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1566 			FAIL_IF(!inst);
1567 			*inst = op_rm;
1568 		}
1569 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1570 	}
1571 
1572 	return SLJIT_SUCCESS;
1573 }
1574 
1575 static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
1576 	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
1577 	sljit_s32 dst, sljit_sw dstw,
1578 	sljit_s32 src1, sljit_sw src1w,
1579 	sljit_s32 src2, sljit_sw src2w)
1580 {
1581 	sljit_u8* inst;
1582 
1583 	if (dst == SLJIT_UNUSED) {
1584 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1585 		if (src2 & SLJIT_IMM) {
1586 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1587 		}
1588 		else {
1589 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1590 			FAIL_IF(!inst);
1591 			*inst = op_rm;
1592 		}
1593 		return SLJIT_SUCCESS;
1594 	}
1595 
1596 	if (dst == src1 && dstw == src1w) {
1597 		if (src2 & SLJIT_IMM) {
1598 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1599 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1600 #else
1601 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1602 #endif
1603 				BINARY_EAX_IMM(op_eax_imm, src2w);
1604 			}
1605 			else {
1606 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1607 			}
1608 		}
1609 		else if (FAST_IS_REG(dst)) {
1610 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
1611 			FAIL_IF(!inst);
1612 			*inst = op_rm;
1613 		}
1614 		else if (FAST_IS_REG(src2)) {
1615 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
1616 			FAIL_IF(!inst);
1617 			*inst = op_mr;
1618 		}
1619 		else {
1620 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
1621 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1622 			FAIL_IF(!inst);
1623 			*inst = op_mr;
1624 		}
1625 		return SLJIT_SUCCESS;
1626 	}
1627 
1628 	/* General version. */
1629 	if (FAST_IS_REG(dst) && dst != src2) {
1630 		EMIT_MOV(compiler, dst, 0, src1, src1w);
1631 		if (src2 & SLJIT_IMM) {
1632 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
1633 		}
1634 		else {
1635 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
1636 			FAIL_IF(!inst);
1637 			*inst = op_rm;
1638 		}
1639 	}
1640 	else {
1641 		/* This version requires fewer memory writes. */
1642 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1643 		if (src2 & SLJIT_IMM) {
1644 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1645 		}
1646 		else {
1647 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1648 			FAIL_IF(!inst);
1649 			*inst = op_rm;
1650 		}
1651 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1652 	}
1653 
1654 	return SLJIT_SUCCESS;
1655 }
1656 
1657 static sljit_s32 emit_mul(struct sljit_compiler *compiler,
1658 	sljit_s32 dst, sljit_sw dstw,
1659 	sljit_s32 src1, sljit_sw src1w,
1660 	sljit_s32 src2, sljit_sw src2w)
1661 {
1662 	sljit_u8* inst;
1663 	sljit_s32 dst_r;
1664 
1665 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1666 
1667 	/* Register destination. */
1668 	if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
1669 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1670 		FAIL_IF(!inst);
1671 		*inst++ = GROUP_0F;
1672 		*inst = IMUL_r_rm;
1673 	}
1674 	else if (dst_r == src2 && !(src1 & SLJIT_IMM)) {
1675 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
1676 		FAIL_IF(!inst);
1677 		*inst++ = GROUP_0F;
1678 		*inst = IMUL_r_rm;
1679 	}
1680 	else if (src1 & SLJIT_IMM) {
1681 		if (src2 & SLJIT_IMM) {
1682 			EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
1683 			src2 = dst_r;
1684 			src2w = 0;
1685 		}
1686 
1687 		if (src1w <= 127 && src1w >= -128) {
1688 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1689 			FAIL_IF(!inst);
1690 			*inst = IMUL_r_rm_i8;
1691 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1692 			FAIL_IF(!inst);
1693 			INC_SIZE(1);
1694 			*inst = (sljit_s8)src1w;
1695 		}
1696 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1697 		else {
1698 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1699 			FAIL_IF(!inst);
1700 			*inst = IMUL_r_rm_i32;
1701 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1702 			FAIL_IF(!inst);
1703 			INC_SIZE(4);
1704 			*(sljit_sw*)inst = src1w;
1705 		}
1706 #else
1707 		else if (IS_HALFWORD(src1w)) {
1708 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1709 			FAIL_IF(!inst);
1710 			*inst = IMUL_r_rm_i32;
1711 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1712 			FAIL_IF(!inst);
1713 			INC_SIZE(4);
1714 			*(sljit_s32*)inst = (sljit_s32)src1w;
1715 		}
1716 		else {
1717 			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
1718 			if (dst_r != src2)
1719 				EMIT_MOV(compiler, dst_r, 0, src2, src2w);
1720 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1721 			FAIL_IF(!inst);
1722 			*inst++ = GROUP_0F;
1723 			*inst = IMUL_r_rm;
1724 		}
1725 #endif
1726 	}
1727 	else if (src2 & SLJIT_IMM) {
1728 		/* Note: src1 is NOT immediate. */
1729 
1730 		if (src2w <= 127 && src2w >= -128) {
1731 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1732 			FAIL_IF(!inst);
1733 			*inst = IMUL_r_rm_i8;
1734 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1735 			FAIL_IF(!inst);
1736 			INC_SIZE(1);
1737 			*inst = (sljit_s8)src2w;
1738 		}
1739 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1740 		else {
1741 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1742 			FAIL_IF(!inst);
1743 			*inst = IMUL_r_rm_i32;
1744 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1745 			FAIL_IF(!inst);
1746 			INC_SIZE(4);
1747 			*(sljit_sw*)inst = src2w;
1748 		}
1749 #else
1750 		else if (IS_HALFWORD(src2w)) {
1751 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1752 			FAIL_IF(!inst);
1753 			*inst = IMUL_r_rm_i32;
1754 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1755 			FAIL_IF(!inst);
1756 			INC_SIZE(4);
1757 			*(sljit_s32*)inst = (sljit_s32)src2w;
1758 		}
1759 		else {
1760 			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src2w);
1761 			if (dst_r != src1)
1762 				EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1763 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1764 			FAIL_IF(!inst);
1765 			*inst++ = GROUP_0F;
1766 			*inst = IMUL_r_rm;
1767 		}
1768 #endif
1769 	}
1770 	else {
1771 		/* Neither argument is immediate. */
1772 		if (ADDRESSING_DEPENDS_ON(src2, dst_r))
1773 			dst_r = TMP_REG1;
1774 		EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1775 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1776 		FAIL_IF(!inst);
1777 		*inst++ = GROUP_0F;
1778 		*inst = IMUL_r_rm;
1779 	}
1780 
1781 	if (dst_r == TMP_REG1)
1782 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1783 
1784 	return SLJIT_SUCCESS;
1785 }
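
/* Encoding sketch for the cases above (for reference, not emitted
   verbatim; opcodes from the Intel manuals):
     0F AF /r   imul reg, reg/mem           - two-operand form
     6B /r ib   imul reg, reg/mem, imm8     - immediate fits in a signed byte
     69 /r id   imul reg, reg/mem, imm32    - immediate fits in 32 bits
   x86-64 has no imul form with a 64-bit immediate, which is why a wider
   constant is first materialized in TMP_REG2 and the register form used. */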
1786 
1787 static sljit_s32 emit_lea_binary(struct sljit_compiler *compiler, sljit_s32 keep_flags,
1788 	sljit_s32 dst, sljit_sw dstw,
1789 	sljit_s32 src1, sljit_sw src1w,
1790 	sljit_s32 src2, sljit_sw src2w)
1791 {
1792 	sljit_u8* inst;
1793 	sljit_s32 dst_r, done = 0;
1794 
1795 	/* These cases are better left to be handled the normal way. */
1796 	if (!keep_flags) {
1797 		if (dst == src1 && dstw == src1w)
1798 			return SLJIT_ERR_UNSUPPORTED;
1799 		if (dst == src2 && dstw == src2w)
1800 			return SLJIT_ERR_UNSUPPORTED;
1801 	}
1802 
1803 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1804 
1805 	if (FAST_IS_REG(src1)) {
1806 		if (FAST_IS_REG(src2)) {
1807 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
1808 			FAIL_IF(!inst);
1809 			*inst = LEA_r_m;
1810 			done = 1;
1811 		}
1812 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1813 		if ((src2 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1814 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_s32)src2w);
1815 #else
1816 		if (src2 & SLJIT_IMM) {
1817 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
1818 #endif
1819 			FAIL_IF(!inst);
1820 			*inst = LEA_r_m;
1821 			done = 1;
1822 		}
1823 	}
1824 	else if (FAST_IS_REG(src2)) {
1825 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1826 		if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1827 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_s32)src1w);
1828 #else
1829 		if (src1 & SLJIT_IMM) {
1830 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
1831 #endif
1832 			FAIL_IF(!inst);
1833 			*inst = LEA_r_m;
1834 			done = 1;
1835 		}
1836 	}
1837 
1838 	if (done) {
1839 		if (dst_r == TMP_REG1)
1840 			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
1841 		return SLJIT_SUCCESS;
1842 	}
1843 	return SLJIT_ERR_UNSUPPORTED;
1844 }
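
/* Why LEA here: "lea dst, [src1 + src2]" and "lea dst, [src1 + imm]"
   compute the sum without touching EFLAGS, so they are safe substitutes
   for ADD when the caller asked for SLJIT_KEEP_FLAGS. A sketch of the two
   shapes produced above:
     lea eax, [ebx + ecx]     ; register + register
     lea eax, [ebx + 1234]    ; register + immediate
   When flags need not be kept, the dst == src cases are rejected so the
   shorter plain ADD path handles them instead. */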
1845 
1846 static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler,
1847 	sljit_s32 src1, sljit_sw src1w,
1848 	sljit_s32 src2, sljit_sw src2w)
1849 {
1850 	sljit_u8* inst;
1851 
1852 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1853 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1854 #else
1855 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
1856 #endif
1857 		BINARY_EAX_IMM(CMP_EAX_i32, src2w);
1858 		return SLJIT_SUCCESS;
1859 	}
1860 
1861 	if (FAST_IS_REG(src1)) {
1862 		if (src2 & SLJIT_IMM) {
1863 			BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
1864 		}
1865 		else {
1866 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
1867 			FAIL_IF(!inst);
1868 			*inst = CMP_r_rm;
1869 		}
1870 		return SLJIT_SUCCESS;
1871 	}
1872 
1873 	if (FAST_IS_REG(src2) && !(src1 & SLJIT_IMM)) {
1874 		inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
1875 		FAIL_IF(!inst);
1876 		*inst = CMP_rm_r;
1877 		return SLJIT_SUCCESS;
1878 	}
1879 
1880 	if (src2 & SLJIT_IMM) {
1881 		if (src1 & SLJIT_IMM) {
1882 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1883 			src1 = TMP_REG1;
1884 			src1w = 0;
1885 		}
1886 		BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
1887 	}
1888 	else {
1889 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1890 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1891 		FAIL_IF(!inst);
1892 		*inst = CMP_r_rm;
1893 	}
1894 	return SLJIT_SUCCESS;
1895 }
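
/* The SLJIT_R0 fast path above uses the accumulator form of CMP:
   "cmp eax, imm32" is 3D id (5 bytes) instead of 81 /7 id (6 bytes).
   It only pays off when the immediate does not fit in a signed byte,
   since the sign-extended 83 /7 ib form (3 bytes) beats both. */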
1896 
1897 static sljit_s32 emit_test_binary(struct sljit_compiler *compiler,
1898 	sljit_s32 src1, sljit_sw src1w,
1899 	sljit_s32 src2, sljit_sw src2w)
1900 {
1901 	sljit_u8* inst;
1902 
1903 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1904 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1905 #else
1906 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
1907 #endif
1908 		BINARY_EAX_IMM(TEST_EAX_i32, src2w);
1909 		return SLJIT_SUCCESS;
1910 	}
1911 
1912 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1913 	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1914 #else
1915 	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128)) {
1916 #endif
1917 		BINARY_EAX_IMM(TEST_EAX_i32, src1w);
1918 		return SLJIT_SUCCESS;
1919 	}
1920 
1921 	if (!(src1 & SLJIT_IMM)) {
1922 		if (src2 & SLJIT_IMM) {
1923 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1924 			if (IS_HALFWORD(src2w) || compiler->mode32) {
1925 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
1926 				FAIL_IF(!inst);
1927 				*inst = GROUP_F7;
1928 			}
1929 			else {
1930 				FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1931 				inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, src1w);
1932 				FAIL_IF(!inst);
1933 				*inst = TEST_rm_r;
1934 			}
1935 #else
1936 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
1937 			FAIL_IF(!inst);
1938 			*inst = GROUP_F7;
1939 #endif
1940 			return SLJIT_SUCCESS;
1941 		}
1942 		else if (FAST_IS_REG(src1)) {
1943 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
1944 			FAIL_IF(!inst);
1945 			*inst = TEST_rm_r;
1946 			return SLJIT_SUCCESS;
1947 		}
1948 	}
1949 
1950 	if (!(src2 & SLJIT_IMM)) {
1951 		if (src1 & SLJIT_IMM) {
1952 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1953 			if (IS_HALFWORD(src1w) || compiler->mode32) {
1954 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
1955 				FAIL_IF(!inst);
1956 				*inst = GROUP_F7;
1957 			}
1958 			else {
1959 				FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
1960 				inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, src2w);
1961 				FAIL_IF(!inst);
1962 				*inst = TEST_rm_r;
1963 			}
1964 #else
1965 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
1966 			FAIL_IF(!inst);
1967 			*inst = GROUP_F7;
1968 #endif
1969 			return SLJIT_SUCCESS;
1970 		}
1971 		else if (FAST_IS_REG(src2)) {
1972 			inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
1973 			FAIL_IF(!inst);
1974 			*inst = TEST_rm_r;
1975 			return SLJIT_SUCCESS;
1976 		}
1977 	}
1978 
1979 	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1980 	if (src2 & SLJIT_IMM) {
1981 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1982 		if (IS_HALFWORD(src2w) || compiler->mode32) {
1983 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
1984 			FAIL_IF(!inst);
1985 			*inst = GROUP_F7;
1986 		}
1987 		else {
1988 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1989 			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
1990 			FAIL_IF(!inst);
1991 			*inst = TEST_rm_r;
1992 		}
1993 #else
1994 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
1995 		FAIL_IF(!inst);
1996 		*inst = GROUP_F7;
1997 #endif
1998 	}
1999 	else {
2000 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2001 		FAIL_IF(!inst);
2002 		*inst = TEST_rm_r;
2003 	}
2004 	return SLJIT_SUCCESS;
2005 }
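
/* Unlike CMP, TEST has no sign-extended imm8 form: the only immediate
   encodings are the accumulator shortcut "test eax, imm32" (A9 id) and
   the generic F7 /0 id (GROUP_F7 above). TEST is also commutative, which
   is why both operand orders are tried before falling back to a move
   through TMP_REG1. */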
2006 
2007 static sljit_s32 emit_shift(struct sljit_compiler *compiler,
2008 	sljit_u8 mode,
2009 	sljit_s32 dst, sljit_sw dstw,
2010 	sljit_s32 src1, sljit_sw src1w,
2011 	sljit_s32 src2, sljit_sw src2w)
2012 {
2013 	sljit_u8* inst;
2014 
2015 	if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
2016 		if (dst == src1 && dstw == src1w) {
2017 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
2018 			FAIL_IF(!inst);
2019 			*inst |= mode;
2020 			return SLJIT_SUCCESS;
2021 		}
2022 		if (dst == SLJIT_UNUSED) {
2023 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2024 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2025 			FAIL_IF(!inst);
2026 			*inst |= mode;
2027 			return SLJIT_SUCCESS;
2028 		}
2029 		if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
2030 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2031 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2032 			FAIL_IF(!inst);
2033 			*inst |= mode;
2034 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2035 			return SLJIT_SUCCESS;
2036 		}
2037 		if (FAST_IS_REG(dst)) {
2038 			EMIT_MOV(compiler, dst, 0, src1, src1w);
2039 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
2040 			FAIL_IF(!inst);
2041 			*inst |= mode;
2042 			return SLJIT_SUCCESS;
2043 		}
2044 
2045 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2046 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2047 		FAIL_IF(!inst);
2048 		*inst |= mode;
2049 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2050 		return SLJIT_SUCCESS;
2051 	}
2052 
2053 	if (dst == SLJIT_PREF_SHIFT_REG) {
2054 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2055 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2056 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2057 		FAIL_IF(!inst);
2058 		*inst |= mode;
2059 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2060 	}
2061 	else if (FAST_IS_REG(dst) && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
2062 		if (src1 != dst)
2063 			EMIT_MOV(compiler, dst, 0, src1, src1w);
2064 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2065 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2066 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
2067 		FAIL_IF(!inst);
2068 		*inst |= mode;
2069 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2070 	}
2071 	else {
2072 		/* This case is really difficult, since ecx itself may be used
2073 		   for addressing, and we must ensure it works even in that case. */
2074 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2075 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2076 		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
2077 #else
2078 		/* [esp+0] contains the flags. */
2079 		/* [esp+0] contains the flags, so store ecx one word above it. */
2080 #endif
2081 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2082 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2083 		FAIL_IF(!inst);
2084 		*inst |= mode;
2085 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2086 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
2087 #else
2088 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw));
2089 #endif
2090 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2091 	}
2092 
2093 	return SLJIT_SUCCESS;
2094 }
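
/* x86 requires a variable shift count to be in CL, hence the register
   shuffling above. A sketch of the hardest case (ecx itself may be part
   of the addressing of src2):
     mov  tmp1, src1
     mov  [esp+4], ecx        ; or "mov tmp2, rcx" on x86-64
     mov  ecx, src2
     shl  tmp1, cl
     mov  ecx, [esp+4]        ; restore the original ecx
     mov  dst, tmp1
   This is only a sketch; the real operand forms are built by
   emit_x86_instruction. */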
2095 
2096 static sljit_s32 emit_shift_with_flags(struct sljit_compiler *compiler,
2097 	sljit_u8 mode, sljit_s32 set_flags,
2098 	sljit_s32 dst, sljit_sw dstw,
2099 	sljit_s32 src1, sljit_sw src1w,
2100 	sljit_s32 src2, sljit_sw src2w)
2101 {
2102 	/* The CPU does not set flags if the shift count is 0. */
2103 	if (src2 & SLJIT_IMM) {
2104 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2105 		if ((src2w & 0x3f) != 0 || (compiler->mode32 && (src2w & 0x1f) != 0))
2106 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2107 #else
2108 		if ((src2w & 0x1f) != 0)
2109 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2110 #endif
2111 		if (!set_flags)
2112 			return emit_mov(compiler, dst, dstw, src1, src1w);
2113 		/* OR dst, src, 0 */
2114 		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
2115 			dst, dstw, src1, src1w, SLJIT_IMM, 0);
2116 	}
2117 
2118 	if (!set_flags)
2119 		return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2120 
2121 	if (!FAST_IS_REG(dst))
2122 		FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));
2123 
2124 	FAIL_IF(emit_shift(compiler,mode, dst, dstw, src1, src1w, src2, src2w));
2125 
2126 	if (FAST_IS_REG(dst))
2127 		return emit_cmp_binary(compiler, dst, dstw, SLJIT_IMM, 0);
2128 	return SLJIT_SUCCESS;
2129 }
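
/* Rationale: a shift by zero leaves every arithmetic flag unchanged, so
   when the caller wants flags from a zero-count shift the value is routed
   through "or dst, 0" instead, which sets ZF/SF/PF from the result (an
   approximation of what a nonzero shift would have produced). Likewise,
   the CMP-with-zero before/after emit_shift regenerates the flags from
   the source or the shifted result when the shift itself cannot. */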
2130 
2131 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
2132 	sljit_s32 dst, sljit_sw dstw,
2133 	sljit_s32 src1, sljit_sw src1w,
2134 	sljit_s32 src2, sljit_sw src2w)
2135 {
2136 	CHECK_ERROR();
2137 	CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
2138 	ADJUST_LOCAL_OFFSET(dst, dstw);
2139 	ADJUST_LOCAL_OFFSET(src1, src1w);
2140 	ADJUST_LOCAL_OFFSET(src2, src2w);
2141 
2142 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2143 	CHECK_EXTRA_REGS(src1, src1w, (void)0);
2144 	CHECK_EXTRA_REGS(src2, src2w, (void)0);
2145 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2146 	compiler->mode32 = op & SLJIT_I32_OP;
2147 #endif
2148 
2149 	if (GET_OPCODE(op) >= SLJIT_MUL) {
2150 		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
2151 			compiler->flags_saved = 0;
2152 		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
2153 			FAIL_IF(emit_save_flags(compiler));
2154 	}
2155 
2156 	switch (GET_OPCODE(op)) {
2157 	case SLJIT_ADD:
2158 		if (!GET_FLAGS(op)) {
2159 			if (emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
2160 				return compiler->error;
2161 		}
2162 		else
2163 			compiler->flags_saved = 0;
2164 		if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
2165 			FAIL_IF(emit_save_flags(compiler));
2166 		return emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
2167 			dst, dstw, src1, src1w, src2, src2w);
2168 	case SLJIT_ADDC:
2169 		if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
2170 			FAIL_IF(emit_restore_flags(compiler, 1));
2171 		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
2172 			FAIL_IF(emit_save_flags(compiler));
2173 		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
2174 			compiler->flags_saved = 0;
2175 		return emit_cum_binary(compiler, ADC_r_rm, ADC_rm_r, ADC, ADC_EAX_i32,
2176 			dst, dstw, src1, src1w, src2, src2w);
2177 	case SLJIT_SUB:
2178 		if (!GET_FLAGS(op)) {
2179 			if ((src2 & SLJIT_IMM) && emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
2180 				return compiler->error;
2181 		}
2182 		else
2183 			compiler->flags_saved = 0;
2184 		if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
2185 			FAIL_IF(emit_save_flags(compiler));
2186 		if (dst == SLJIT_UNUSED)
2187 			return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
2188 		return emit_non_cum_binary(compiler, SUB_r_rm, SUB_rm_r, SUB, SUB_EAX_i32,
2189 			dst, dstw, src1, src1w, src2, src2w);
2190 	case SLJIT_SUBC:
2191 		if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
2192 			FAIL_IF(emit_restore_flags(compiler, 1));
2193 		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
2194 			FAIL_IF(emit_save_flags(compiler));
2195 		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
2196 			compiler->flags_saved = 0;
2197 		return emit_non_cum_binary(compiler, SBB_r_rm, SBB_rm_r, SBB, SBB_EAX_i32,
2198 			dst, dstw, src1, src1w, src2, src2w);
2199 	case SLJIT_MUL:
2200 		return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
2201 	case SLJIT_AND:
2202 		if (dst == SLJIT_UNUSED)
2203 			return emit_test_binary(compiler, src1, src1w, src2, src2w);
2204 		return emit_cum_binary(compiler, AND_r_rm, AND_rm_r, AND, AND_EAX_i32,
2205 			dst, dstw, src1, src1w, src2, src2w);
2206 	case SLJIT_OR:
2207 		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
2208 			dst, dstw, src1, src1w, src2, src2w);
2209 	case SLJIT_XOR:
2210 		return emit_cum_binary(compiler, XOR_r_rm, XOR_rm_r, XOR, XOR_EAX_i32,
2211 			dst, dstw, src1, src1w, src2, src2w);
2212 	case SLJIT_SHL:
2213 		return emit_shift_with_flags(compiler, SHL, GET_FLAGS(op),
2214 			dst, dstw, src1, src1w, src2, src2w);
2215 	case SLJIT_LSHR:
2216 		return emit_shift_with_flags(compiler, SHR, GET_FLAGS(op),
2217 			dst, dstw, src1, src1w, src2, src2w);
2218 	case SLJIT_ASHR:
2219 		return emit_shift_with_flags(compiler, SAR, GET_FLAGS(op),
2220 			dst, dstw, src1, src1w, src2, src2w);
2221 	}
2222 
2223 	return SLJIT_SUCCESS;
2224 }
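
/* Usage sketch (hypothetical caller; SLJIT_SET_E is this revision's
   "set zero flag" request bit): a compare-and-discard such as
     sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_E,
         SLJIT_UNUSED, 0, SLJIT_R0, 0, SLJIT_IMM, 42);
   takes the SLJIT_SUB case above and, because dst is SLJIT_UNUSED, is
   emitted as a bare "cmp r0, 42" through emit_cmp_binary. */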
2225 
2226 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
2227 {
2228 	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
2229 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2230 	if (reg >= SLJIT_R3 && reg <= SLJIT_R6)
2231 		return -1;
2232 #endif
2233 	return reg_map[reg];
2234 }
2235 
2236 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
2237 {
2238 	CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
2239 	return reg;
2240 }
2241 
2242 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
2243 	void *instruction, sljit_s32 size)
2244 {
2245 	sljit_u8 *inst;
2246 
2247 	CHECK_ERROR();
2248 	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
2249 
2250 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
2251 	FAIL_IF(!inst);
2252 	INC_SIZE(size);
2253 	SLJIT_MEMMOVE(inst, instruction, size);
2254 	return SLJIT_SUCCESS;
2255 }
2256 
2257 /* --------------------------------------------------------------------- */
2258 /*  Floating point operators                                             */
2259 /* --------------------------------------------------------------------- */
2260 
2261 /* Alignment + 4 * 16 bytes. */
2262 static sljit_s32 sse2_data[3 + (4 + 4) * 2];
2263 static sljit_s32 *sse2_buffer;
2264 
2265 static void init_compiler(void)
2266 {
2267 	sse2_buffer = (sljit_s32*)(((sljit_uw)sse2_data + 15) & ~0xf);
2268 	/* Single precision constants. */
2269 	sse2_buffer[0] = 0x80000000;
2270 	sse2_buffer[4] = 0x7fffffff;
2271 	/* Double precision constants. */
2272 	sse2_buffer[8] = 0;
2273 	sse2_buffer[9] = 0x80000000;
2274 	sse2_buffer[12] = 0xffffffff;
2275 	sse2_buffer[13] = 0x7fffffff;
2276 }
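
/* Resulting buffer layout (each constant occupies one 16-byte slot so the
   ANDPD/XORPD memory operands below are legal; legacy SSE faults on
   unaligned 128-bit memory accesses):
     sse2_buffer[0]       0x80000000           f32 sign mask (negate)
     sse2_buffer[4]       0x7fffffff           f32 abs mask
     sse2_buffer[8..9]    0x8000000000000000   f64 sign mask (negate)
     sse2_buffer[12..13]  0x7fffffffffffffff   f64 abs mask
   The untouched words stay zero (static storage), which is harmless since
   only the low lane of the result is observed. */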
2277 
2278 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_is_fpu_available(void)
2279 {
2280 #ifdef SLJIT_IS_FPU_AVAILABLE
2281 	return SLJIT_IS_FPU_AVAILABLE;
2282 #elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
2283 	if (cpu_has_sse2 == -1)
2284 		get_cpu_features();
2285 	return cpu_has_sse2;
2286 #else /* SLJIT_DETECT_SSE2 */
2287 	return 1;
2288 #endif /* SLJIT_DETECT_SSE2 */
2289 }
2290 
2291 static sljit_s32 emit_sse2(struct sljit_compiler *compiler, sljit_u8 opcode,
2292 	sljit_s32 single, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
2293 {
2294 	sljit_u8 *inst;
2295 
2296 	inst = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2297 	FAIL_IF(!inst);
2298 	*inst++ = GROUP_0F;
2299 	*inst = opcode;
2300 	return SLJIT_SUCCESS;
2301 }
2302 
2303 static sljit_s32 emit_sse2_logic(struct sljit_compiler *compiler, sljit_u8 opcode,
2304 	sljit_s32 pref66, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
2305 {
2306 	sljit_u8 *inst;
2307 
2308 	inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2309 	FAIL_IF(!inst);
2310 	*inst++ = GROUP_0F;
2311 	*inst = opcode;
2312 	return SLJIT_SUCCESS;
2313 }
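
/* Mandatory-prefix cheat sheet for the two helpers above:
     F3 0F xx   scalar single   (e.g. F3 0F 10 = movss)
     F2 0F xx   scalar double   (e.g. F2 0F 10 = movsd)
     66 0F xx   packed double   (e.g. 66 0F 54 = andpd, 66 0F 57 = xorpd)
        0F xx   packed single   (e.g.    0F 54 = andps)
   emit_sse2 selects F3/F2 from "single"; emit_sse2_logic selects the 66
   prefix from "pref66". */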
2314 
2315 static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
2316 	sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
2317 {
2318 	return emit_sse2(compiler, MOVSD_x_xm, single, dst, src, srcw);
2319 }
2320 
2321 static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
2322 	sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src)
2323 {
2324 	return emit_sse2(compiler, MOVSD_xm_x, single, src, dst, dstw);
2325 }
2326 
2327 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
2328 	sljit_s32 dst, sljit_sw dstw,
2329 	sljit_s32 src, sljit_sw srcw)
2330 {
2331 	sljit_s32 dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
2332 	sljit_u8 *inst;
2333 
2334 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2335 	if (GET_OPCODE(op) == SLJIT_CONV_SW_FROM_F64)
2336 		compiler->mode32 = 0;
2337 #endif
2338 
2339 	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP2, dst_r, 0, src, srcw);
2340 	FAIL_IF(!inst);
2341 	*inst++ = GROUP_0F;
2342 	*inst = CVTTSD2SI_r_xm;
2343 
2344 	if (dst_r == TMP_REG1 && dst != SLJIT_UNUSED)
2345 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2346 	return SLJIT_SUCCESS;
2347 }
2348 
2349 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_compiler *compiler, sljit_s32 op,
2350 	sljit_s32 dst, sljit_sw dstw,
2351 	sljit_s32 src, sljit_sw srcw)
2352 {
2353 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2354 	sljit_u8 *inst;
2355 
2356 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2357 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_SW)
2358 		compiler->mode32 = 0;
2359 #endif
2360 
2361 	if (src & SLJIT_IMM) {
2362 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2363 		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
2364 			srcw = (sljit_s32)srcw;
2365 #endif
2366 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
2367 		src = TMP_REG1;
2368 		srcw = 0;
2369 	}
2370 
2371 	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, src, srcw);
2372 	FAIL_IF(!inst);
2373 	*inst++ = GROUP_0F;
2374 	*inst = CVTSI2SD_x_rm;
2375 
2376 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2377 	compiler->mode32 = 1;
2378 #endif
2379 	if (dst_r == TMP_FREG)
2380 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2381 	return SLJIT_SUCCESS;
2382 }
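
/* The two conversions above map to (F3 replaces F2 for the f32 forms):
     F2 0F 2C /r   cvttsd2si reg, xmm/mem   - truncating float-to-int
     F2 0F 2A /r   cvtsi2sd  xmm, reg/mem   - int-to-float
   On x86-64, clearing mode32 adds a REX.W prefix so the integer side is
   treated as 64 bits. */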
2383 
2384 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_s32 op,
2385 	sljit_s32 src1, sljit_sw src1w,
2386 	sljit_s32 src2, sljit_sw src2w)
2387 {
2388 	compiler->flags_saved = 0;
2389 	if (!FAST_IS_REG(src1)) {
2390 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2391 		src1 = TMP_FREG;
2392 	}
2393 	return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_F32_OP), src1, src2, src2w);
2394 }
2395 
2396 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op,
2397 	sljit_s32 dst, sljit_sw dstw,
2398 	sljit_s32 src, sljit_sw srcw)
2399 {
2400 	sljit_s32 dst_r;
2401 
2402 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2403 	compiler->mode32 = 1;
2404 #endif
2405 
2406 	CHECK_ERROR();
2407 	SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);
2408 
2409 	if (GET_OPCODE(op) == SLJIT_MOV_F64) {
2410 		if (FAST_IS_REG(dst))
2411 			return emit_sse2_load(compiler, op & SLJIT_F32_OP, dst, src, srcw);
2412 		if (FAST_IS_REG(src))
2413 			return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, src);
2414 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src, srcw));
2415 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2416 	}
2417 
2418 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32) {
2419 		dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2420 		if (FAST_IS_REG(src)) {
2421 			/* We overwrite the high bits of the source. From the SLJIT point
2422 			   of view, this is not an issue.
2423 			   Note: In SSE3, we could also use MOVDDUP and MOVSLDUP. */
2424 			FAIL_IF(emit_sse2_logic(compiler, UNPCKLPD_x_xm, op & SLJIT_F32_OP, src, src, 0));
2425 		}
2426 		else {
2427 			FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_F32_OP), TMP_FREG, src, srcw));
2428 			src = TMP_FREG;
2429 		}
2430 
2431 		FAIL_IF(emit_sse2_logic(compiler, CVTPD2PS_x_xm, op & SLJIT_F32_OP, dst_r, src, 0));
2432 		if (dst_r == TMP_FREG)
2433 			return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2434 		return SLJIT_SUCCESS;
2435 	}
2436 
2437 	if (SLOW_IS_REG(dst)) {
2438 		dst_r = dst;
2439 		if (dst != src)
2440 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
2441 	}
2442 	else {
2443 		dst_r = TMP_FREG;
2444 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
2445 	}
2446 
2447 	switch (GET_OPCODE(op)) {
2448 	case SLJIT_NEG_F64:
2449 		FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer : sse2_buffer + 8)));
2450 		break;
2451 
2452 	case SLJIT_ABS_F64:
2453 		FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer + 4 : sse2_buffer + 12)));
2454 		break;
2455 	}
2456 
2457 	if (dst_r == TMP_FREG)
2458 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2459 	return SLJIT_SUCCESS;
2460 }
2461 
2462 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compiler, sljit_s32 op,
2463 	sljit_s32 dst, sljit_sw dstw,
2464 	sljit_s32 src1, sljit_sw src1w,
2465 	sljit_s32 src2, sljit_sw src2w)
2466 {
2467 	sljit_s32 dst_r;
2468 
2469 	CHECK_ERROR();
2470 	CHECK(check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
2471 	ADJUST_LOCAL_OFFSET(dst, dstw);
2472 	ADJUST_LOCAL_OFFSET(src1, src1w);
2473 	ADJUST_LOCAL_OFFSET(src2, src2w);
2474 
2475 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2476 	compiler->mode32 = 1;
2477 #endif
2478 
2479 	if (FAST_IS_REG(dst)) {
2480 		dst_r = dst;
2481 		if (dst == src1)
2482 			; /* Do nothing here. */
2483 		else if (dst == src2 && (op == SLJIT_ADD_F64 || op == SLJIT_MUL_F64)) {
2484 			/* Swap arguments. */
2485 			src2 = src1;
2486 			src2w = src1w;
2487 		}
2488 		else if (dst != src2)
2489 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src1, src1w));
2490 		else {
2491 			dst_r = TMP_FREG;
2492 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2493 		}
2494 	}
2495 	else {
2496 		dst_r = TMP_FREG;
2497 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2498 	}
2499 
2500 	switch (GET_OPCODE(op)) {
2501 	case SLJIT_ADD_F64:
2502 		FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2503 		break;
2504 
2505 	case SLJIT_SUB_F64:
2506 		FAIL_IF(emit_sse2(compiler, SUBSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2507 		break;
2508 
2509 	case SLJIT_MUL_F64:
2510 		FAIL_IF(emit_sse2(compiler, MULSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2511 		break;
2512 
2513 	case SLJIT_DIV_F64:
2514 		FAIL_IF(emit_sse2(compiler, DIVSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2515 		break;
2516 	}
2517 
2518 	if (dst_r == TMP_FREG)
2519 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2520 	return SLJIT_SUCCESS;
2521 }
2522 
2523 /* --------------------------------------------------------------------- */
2524 /*  Conditional instructions                                             */
2525 /* --------------------------------------------------------------------- */
2526 
2527 SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
2528 {
2529 	sljit_u8 *inst;
2530 	struct sljit_label *label;
2531 
2532 	CHECK_ERROR_PTR();
2533 	CHECK_PTR(check_sljit_emit_label(compiler));
2534 
2535 	/* We should restore the flags before the label,
2536 	   since other taken jumps have their own flags as well. */
2537 	if (SLJIT_UNLIKELY(compiler->flags_saved))
2538 		PTR_FAIL_IF(emit_restore_flags(compiler, 0));
2539 
2540 	if (compiler->last_label && compiler->last_label->size == compiler->size)
2541 		return compiler->last_label;
2542 
2543 	label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
2544 	PTR_FAIL_IF(!label);
2545 	set_label(label, compiler);
2546 
2547 	inst = (sljit_u8*)ensure_buf(compiler, 2);
2548 	PTR_FAIL_IF(!inst);
2549 
2550 	*inst++ = 0;
2551 	*inst++ = 0;
2552 
2553 	return label;
2554 }
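
/* The two zero bytes are not machine code but a record in the instruction
   buffer: a size byte of 0 escapes a pseudo-instruction, and the next byte
   selects its kind - 0 for a label here, 1 for a constant
   (sljit_emit_const) and type + 4 for a jump (sljit_emit_jump). The code
   generator resolves these records while copying the real bytes. */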
2555 
2556 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
2557 {
2558 	sljit_u8 *inst;
2559 	struct sljit_jump *jump;
2560 
2561 	CHECK_ERROR_PTR();
2562 	CHECK_PTR(check_sljit_emit_jump(compiler, type));
2563 
2564 	if (SLJIT_UNLIKELY(compiler->flags_saved)) {
2565 		if ((type & 0xff) <= SLJIT_JUMP)
2566 			PTR_FAIL_IF(emit_restore_flags(compiler, 0));
2567 		compiler->flags_saved = 0;
2568 	}
2569 
2570 	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2571 	PTR_FAIL_IF_NULL(jump);
2572 	set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
2573 	type &= 0xff;
2574 
2575 	if (type >= SLJIT_CALL1)
2576 		PTR_FAIL_IF(call_with_args(compiler, type));
2577 
2578 	/* Worst case size. */
2579 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2580 	compiler->size += (type >= SLJIT_JUMP) ? 5 : 6;
2581 #else
2582 	compiler->size += (type >= SLJIT_JUMP) ? (10 + 3) : (2 + 10 + 3);
2583 #endif
2584 
2585 	inst = (sljit_u8*)ensure_buf(compiler, 2);
2586 	PTR_FAIL_IF_NULL(inst);
2587 
2588 	*inst++ = 0;
2589 	*inst++ = type + 4;
2590 	return jump;
2591 }
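
/* The reserved worst-case sizes above: on x86-32 "jmp rel32" is E9 + 4
   bytes and "jcc rel32" is 0F 8x + 4 bytes; on x86-64 the far path is
   "mov TMP_REG, imm64" (10 bytes) plus an indirect jump or call through
   the register (up to 3 bytes), the extra 2 bytes covering a short
   inverted-condition branch that skips it. Shorter forms are emitted
   later when the target turns out to be in range. */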
2592 
2593 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
2594 {
2595 	sljit_u8 *inst;
2596 	struct sljit_jump *jump;
2597 
2598 	CHECK_ERROR();
2599 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
2600 	ADJUST_LOCAL_OFFSET(src, srcw);
2601 
2602 	CHECK_EXTRA_REGS(src, srcw, (void)0);
2603 
2604 	if (SLJIT_UNLIKELY(compiler->flags_saved)) {
2605 		if (type <= SLJIT_JUMP)
2606 			FAIL_IF(emit_restore_flags(compiler, 0));
2607 		compiler->flags_saved = 0;
2608 	}
2609 
2610 	if (type >= SLJIT_CALL1) {
2611 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2612 #if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
2613 		if (src == SLJIT_R2) {
2614 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
2615 			src = TMP_REG1;
2616 		}
2617 		if (src == SLJIT_MEM1(SLJIT_SP) && type >= SLJIT_CALL3)
2618 			srcw += sizeof(sljit_sw);
2619 #endif
2620 #endif
2621 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && defined(_WIN64)
2622 		if (src == SLJIT_R2) {
2623 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
2624 			src = TMP_REG1;
2625 		}
2626 #endif
2627 		FAIL_IF(call_with_args(compiler, type));
2628 	}
2629 
2630 	if (src == SLJIT_IMM) {
2631 		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2632 		FAIL_IF_NULL(jump);
2633 		set_jump(jump, compiler, JUMP_ADDR);
2634 		jump->u.target = srcw;
2635 
2636 		/* Worst case size. */
2637 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2638 		compiler->size += 5;
2639 #else
2640 		compiler->size += 10 + 3;
2641 #endif
2642 
2643 		inst = (sljit_u8*)ensure_buf(compiler, 2);
2644 		FAIL_IF_NULL(inst);
2645 
2646 		*inst++ = 0;
2647 		*inst++ = type + 4;
2648 	}
2649 	else {
2650 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2651 		/* REX_W is not necessary (src is not immediate). */
2652 		compiler->mode32 = 1;
2653 #endif
2654 		inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
2655 		FAIL_IF(!inst);
2656 		*inst++ = GROUP_FF;
2657 		*inst |= (type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm;
2658 	}
2659 	return SLJIT_SUCCESS;
2660 }
2661 
2662 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
2663 	sljit_s32 dst, sljit_sw dstw,
2664 	sljit_s32 src, sljit_sw srcw,
2665 	sljit_s32 type)
2666 {
2667 	sljit_u8 *inst;
2668 	sljit_u8 cond_set = 0;
2669 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2670 	sljit_s32 reg;
2671 #else
2672 	/* CHECK_EXTRA_REGS might overwrite these values. */
2673 	sljit_s32 dst_save = dst;
2674 	sljit_sw dstw_save = dstw;
2675 #endif
2676 
2677 	CHECK_ERROR();
2678 	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
2679 	SLJIT_UNUSED_ARG(srcw);
2680 
2681 	if (dst == SLJIT_UNUSED)
2682 		return SLJIT_SUCCESS;
2683 
2684 	ADJUST_LOCAL_OFFSET(dst, dstw);
2685 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2686 	if (SLJIT_UNLIKELY(compiler->flags_saved))
2687 		FAIL_IF(emit_restore_flags(compiler, op & SLJIT_KEEP_FLAGS));
2688 
2689 	type &= 0xff;
2690 	/* setcc = jcc + 0x10. */
2691 	cond_set = get_jump_code(type) + 0x10;
2692 
2693 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2694 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src) {
2695 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 3);
2696 		FAIL_IF(!inst);
2697 		INC_SIZE(4 + 3);
2698 		/* Set low register to conditional flag. */
2699 		*inst++ = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
2700 		*inst++ = GROUP_0F;
2701 		*inst++ = cond_set;
2702 		*inst++ = MOD_REG | reg_lmap[TMP_REG1];
2703 		*inst++ = REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B);
2704 		*inst++ = OR_rm8_r8;
2705 		*inst++ = MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst];
2706 		return SLJIT_SUCCESS;
2707 	}
2708 
2709 	reg = (op == SLJIT_MOV && FAST_IS_REG(dst)) ? dst : TMP_REG1;
2710 
2711 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 4);
2712 	FAIL_IF(!inst);
2713 	INC_SIZE(4 + 4);
2714 	/* Set low register to conditional flag. */
2715 	*inst++ = (reg_map[reg] <= 7) ? REX : REX_B;
2716 	*inst++ = GROUP_0F;
2717 	*inst++ = cond_set;
2718 	*inst++ = MOD_REG | reg_lmap[reg];
2719 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
2720 	*inst++ = GROUP_0F;
2721 	*inst++ = MOVZX_r_rm8;
2722 	*inst = MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg];
2723 
2724 	if (reg != TMP_REG1)
2725 		return SLJIT_SUCCESS;
2726 
2727 	if (GET_OPCODE(op) < SLJIT_ADD) {
2728 		compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
2729 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2730 	}
2731 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
2732 		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
2733 	compiler->skip_checks = 1;
2734 #endif
2735 	return sljit_emit_op2(compiler, op, dst, dstw, dst, dstw, TMP_REG1, 0);
2736 #else /* SLJIT_CONFIG_X86_64 */
2737 	if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) {
2738 		if (reg_map[dst] <= 4) {
2739 			/* Low byte is accessible. */
2740 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3);
2741 			FAIL_IF(!inst);
2742 			INC_SIZE(3 + 3);
2743 			/* Set low byte to conditional flag. */
2744 			*inst++ = GROUP_0F;
2745 			*inst++ = cond_set;
2746 			*inst++ = MOD_REG | reg_map[dst];
2747 
2748 			*inst++ = GROUP_0F;
2749 			*inst++ = MOVZX_r_rm8;
2750 			*inst = MOD_REG | (reg_map[dst] << 3) | reg_map[dst];
2751 			return SLJIT_SUCCESS;
2752 		}
2753 
2754 		/* Low byte is not accessible. */
2755 		if (cpu_has_cmov == -1)
2756 			get_cpu_features();
2757 
2758 		if (cpu_has_cmov) {
2759 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
2760 			/* An "xor reg, reg" operation would overwrite the flags. */
2761 			EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);
2762 
2763 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 3);
2764 			FAIL_IF(!inst);
2765 			INC_SIZE(3);
2766 
2767 			*inst++ = GROUP_0F;
2768 			/* cmovcc = setcc - 0x50. */
2769 			*inst++ = cond_set - 0x50;
2770 			*inst++ = MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REG1];
2771 			return SLJIT_SUCCESS;
2772 		}
2773 
2774 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
2775 		FAIL_IF(!inst);
2776 		INC_SIZE(1 + 3 + 3 + 1);
2777 		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2778 		/* Set al to conditional flag. */
2779 		*inst++ = GROUP_0F;
2780 		*inst++ = cond_set;
2781 		*inst++ = MOD_REG | 0 /* eax */;
2782 
2783 		*inst++ = GROUP_0F;
2784 		*inst++ = MOVZX_r_rm8;
2785 		*inst++ = MOD_REG | (reg_map[dst] << 3) | 0 /* eax */;
2786 		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2787 		return SLJIT_SUCCESS;
2788 	}
2789 
2790 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src && reg_map[dst] <= 4) {
2791 		SLJIT_COMPILE_ASSERT(reg_map[SLJIT_R0] == 0, scratch_reg1_must_be_eax);
2792 		if (dst != SLJIT_R0) {
2793 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 2 + 1);
2794 			FAIL_IF(!inst);
2795 			INC_SIZE(1 + 3 + 2 + 1);
2796 			/* Set low register to conditional flag. */
2797 			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2798 			*inst++ = GROUP_0F;
2799 			*inst++ = cond_set;
2800 			*inst++ = MOD_REG | 0 /* eax */;
2801 			*inst++ = OR_rm8_r8;
2802 			*inst++ = MOD_REG | (0 /* eax */ << 3) | reg_map[dst];
2803 			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2804 		}
2805 		else {
2806 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 2 + 3 + 2 + 2);
2807 			FAIL_IF(!inst);
2808 			INC_SIZE(2 + 3 + 2 + 2);
2809 			/* Set low register to conditional flag. */
2810 			*inst++ = XCHG_r_rm;
2811 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
2812 			*inst++ = GROUP_0F;
2813 			*inst++ = cond_set;
2814 			*inst++ = MOD_REG | 1 /* ecx */;
2815 			*inst++ = OR_rm8_r8;
2816 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | 0 /* eax */;
2817 			*inst++ = XCHG_r_rm;
2818 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
2819 		}
2820 		return SLJIT_SUCCESS;
2821 	}
2822 
2823 	/* Set TMP_REG1 to the bit. */
2824 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
2825 	FAIL_IF(!inst);
2826 	INC_SIZE(1 + 3 + 3 + 1);
2827 	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2828 	/* Set al to conditional flag. */
2829 	*inst++ = GROUP_0F;
2830 	*inst++ = cond_set;
2831 	*inst++ = MOD_REG | 0 /* eax */;
2832 
2833 	*inst++ = GROUP_0F;
2834 	*inst++ = MOVZX_r_rm8;
2835 	*inst++ = MOD_REG | (0 << 3) /* eax */ | 0 /* eax */;
2836 
2837 	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2838 
2839 	if (GET_OPCODE(op) < SLJIT_ADD)
2840 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2841 
2842 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
2843 		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
2844 	compiler->skip_checks = 1;
2845 #endif
2846 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
2847 #endif /* SLJIT_CONFIG_X86_64 */
2848 }
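
/* Typical output of the function above for a register dst with an
   addressable low byte (hypothetical "equal" condition):
     sete  al         ; 0F 94, i.e. setcc = jcc opcode + 0x10
     movzx eax, al    ; 0F B6, zero-extend the 0/1 byte
   The XCHG_EAX_r gymnastics on x86-32 exist because only EAX..EBX have
   byte registers there; the condition byte must be produced in al (or cl)
   first and merged into the destination afterwards. */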
2849 
2850 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
2851 {
2852 	CHECK_ERROR();
2853 	CHECK(check_sljit_get_local_base(compiler, dst, dstw, offset));
2854 	ADJUST_LOCAL_OFFSET(dst, dstw);
2855 
2856 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2857 
2858 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2859 	compiler->mode32 = 0;
2860 #endif
2861 
2862 	ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_SP), offset);
2863 
2864 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2865 	if (NOT_HALFWORD(offset)) {
2866 		FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
2867 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2868 		SLJIT_ASSERT(emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
2869 		return compiler->error;
2870 #else
2871 		return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0);
2872 #endif
2873 	}
2874 #endif
2875 
2876 	if (offset != 0)
2877 		return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, SLJIT_IMM, offset);
2878 	return emit_mov(compiler, dst, dstw, SLJIT_SP, 0);
2879 }
2880 
2881 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
2882 {
2883 	sljit_u8 *inst;
2884 	struct sljit_const *const_;
2885 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2886 	sljit_s32 reg;
2887 #endif
2888 
2889 	CHECK_ERROR_PTR();
2890 	CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
2891 	ADJUST_LOCAL_OFFSET(dst, dstw);
2892 
2893 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2894 
2895 	const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
2896 	PTR_FAIL_IF(!const_);
2897 	set_const(const_, compiler);
2898 
2899 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2900 	compiler->mode32 = 0;
2901 	reg = SLOW_IS_REG(dst) ? dst : TMP_REG1;
2902 
2903 	if (emit_load_imm64(compiler, reg, init_value))
2904 		return NULL;
2905 #else
2906 	if (dst == SLJIT_UNUSED)
2907 		dst = TMP_REG1;
2908 
2909 	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
2910 		return NULL;
2911 #endif
2912 
2913 	inst = (sljit_u8*)ensure_buf(compiler, 2);
2914 	PTR_FAIL_IF(!inst);
2915 
2916 	*inst++ = 0;
2917 	*inst++ = 1;
2918 
2919 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2920 	if (dst & SLJIT_MEM)
2921 		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
2922 			return NULL;
2923 #endif
2924 
2925 	return const_;
2926 }
2927 
2928 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_addr)
2929 {
2930 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2931 	*(sljit_sw*)addr = new_addr - (addr + 4);
2932 #else
2933 	*(sljit_uw*)addr = new_addr;
2934 #endif
2935 }
2936 
2937 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant)
2938 {
2939 	*(sljit_sw*)addr = new_constant;
2940 }
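
/* Usage sketch (hypothetical; assumes the usual sljit_get_const_addr
   accessor): the recorded address points at the immediate field of the
   emitted mov, so the constant can be patched after code generation:
     struct sljit_const *c = sljit_emit_const(compiler, SLJIT_R0, 0, 0);
     ... sljit_generate_code(compiler); ...
     sljit_set_const(sljit_get_const_addr(c), 1234);
   No instruction-cache flush is needed on x86. */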
2941 
2942 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_x86_is_sse2_available(void)
2943 {
2944 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
2945 	if (cpu_has_sse2 == -1)
2946 		get_cpu_features();
2947 	return cpu_has_sse2;
2948 #else
2949 	return 1;
2950 #endif
2951 }
2952 
2953 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_x86_is_cmov_available(void)
2954 {
2955 	if (cpu_has_cmov == -1)
2956 		get_cpu_features();
2957 	return cpu_has_cmov;
2958 }
2959 
2960 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_x86_emit_cmov(struct sljit_compiler *compiler,
2961 	sljit_s32 type,
2962 	sljit_s32 dst_reg,
2963 	sljit_s32 src, sljit_sw srcw)
2964 {
2965 	sljit_u8* inst;
2966 
2967 	CHECK_ERROR();
2968 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
2969 	CHECK_ARGUMENT(sljit_x86_is_cmov_available());
2970 	CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_I32_OP)));
2971 	CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL && (type & 0xff) <= SLJIT_ORDERED_F64);
2972 	CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg & ~SLJIT_I32_OP));
2973 	FUNCTION_CHECK_SRC(src, srcw);
2974 #endif
2975 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
2976 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
2977 		fprintf(compiler->verbose, "  x86_cmov%s %s%s, ",
2978 			!(dst_reg & SLJIT_I32_OP) ? "" : ".i",
2979 			jump_names[type & 0xff], JUMP_POSTFIX(type));
2980 		sljit_verbose_reg(compiler, dst_reg & ~SLJIT_I32_OP);
2981 		fprintf(compiler->verbose, ", ");
2982 		sljit_verbose_param(compiler, src, srcw);
2983 		fprintf(compiler->verbose, "\n");
2984 	}
2985 #endif
2986 
2987 	ADJUST_LOCAL_OFFSET(src, srcw);
2988 	CHECK_EXTRA_REGS(src, srcw, (void)0);
2989 
2990 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2991 	compiler->mode32 = dst_reg & SLJIT_I32_OP;
2992 #endif
2993 	dst_reg &= ~SLJIT_I32_OP;
2994 
2995 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
2996 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
2997 		src = TMP_REG1;
2998 		srcw = 0;
2999 	}
3000 
3001 	inst = emit_x86_instruction(compiler, 2, dst_reg, 0, src, srcw);
3002 	FAIL_IF(!inst);
3003 	*inst++ = GROUP_0F;
3004 	*inst = get_jump_code(type & 0xff) - 0x40;
3005 	return SLJIT_SUCCESS;
3006 }
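
/* The opcode arithmetic mirrors the setcc case: near jcc is 0F 8x and
   cmovcc is 0F 4x, hence get_jump_code(type) - 0x40. A hypothetical call
     sljit_x86_emit_cmov(compiler, SLJIT_EQUAL, SLJIT_R0, SLJIT_R1, 0);
   assembles to "cmove r0, r1" (0F 44 /r), leaving R0 unchanged when the
   condition is false. */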
3007