/*	$NetBSD: sljitNativeX86_common.c,v 1.6 2014/06/17 19:33:20 alnsn Exp $	*/

/*
 *    Stack-less Just-In-Time compiler
 *
 *    Copyright 2009-2012 Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are
 * permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright notice, this list of
 *      conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright notice, this list
 *      of conditions and the following disclaimer in the documentation and/or other materials
 *      provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

SLJIT_API_FUNC_ATTRIBUTE SLJIT_CONST char* sljit_get_platform_name(void)
{
	return "x86" SLJIT_CPUINFO;
}

/*
   32b register indexes:
     0 - EAX
     1 - ECX
     2 - EDX
     3 - EBX
     4 - none
     5 - EBP
     6 - ESI
     7 - EDI
*/

/*
   64b register indexes:
     0 - RAX
     1 - RCX
     2 - RDX
     3 - RBX
     4 - none
     5 - RBP
     6 - RSI
     7 - RDI
     8 - R8   - From now on REX prefix is required
     9 - R9
    10 - R10
    11 - R11
    12 - R12
    13 - R13
    14 - R14
    15 - R15
*/
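
/* Illustrative example: encoding "mov rax, r8" with MOV_rm_r (0x89) needs
   REX.W for the 64 bit operand size and REX.R because the source register
   index (8) does not fit in the 3 bit reg field:
       4c 89 c0  =  REX.W|REX.R, MOV_rm_r, ModR/M(mod=11, reg=000, rm=000)
   On x86-64, reg_map below stores the full register index and reg_lmap its
   low 3 bits; an index >= 8 means one of the REX.B/R/X bits must be set. */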

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)

/* Last register + 1. */
#define TMP_REG1	(SLJIT_NO_REGISTERS + 1)

static SLJIT_CONST sljit_ub reg_map[SLJIT_NO_REGISTERS + 2] = {
	0, 0, 2, 1, 0, 0, 3, 6, 7, 0, 0, 4, 5
};

#define CHECK_EXTRA_REGS(p, w, do) \
	if (p >= SLJIT_TEMPORARY_EREG1 && p <= SLJIT_TEMPORARY_EREG2) { \
		w = compiler->scratches_start + (p - SLJIT_TEMPORARY_EREG1) * sizeof(sljit_sw); \
		p = SLJIT_MEM1(SLJIT_LOCALS_REG); \
		do; \
	} \
	else if (p >= SLJIT_SAVED_EREG1 && p <= SLJIT_SAVED_EREG2) { \
		w = compiler->saveds_start + (p - SLJIT_SAVED_EREG1) * sizeof(sljit_sw); \
		p = SLJIT_MEM1(SLJIT_LOCALS_REG); \
		do; \
	}

#else /* SLJIT_CONFIG_X86_32 */

/* Last register + 1. */
#define TMP_REG1	(SLJIT_NO_REGISTERS + 1)
#define TMP_REG2	(SLJIT_NO_REGISTERS + 2)
#define TMP_REG3	(SLJIT_NO_REGISTERS + 3)

/* Note: r12 & 0x7 == 0b100, which is decoded as "SIB byte present".
   Note: avoid using r12 and r13 for memory addressing;
   therefore r12 is better for SAVED_EREG than SAVED_REG. */
#ifndef _WIN64
/* 1st argument passed in rdi, 2nd in rsi, 3rd in rdx. */
static SLJIT_CONST sljit_ub reg_map[SLJIT_NO_REGISTERS + 4] = {
	0, 0, 6, 1, 8, 11, 3, 15, 14, 13, 12, 4, 2, 7, 9
};
/* Low map: reg_map & 0x7. */
static SLJIT_CONST sljit_ub reg_lmap[SLJIT_NO_REGISTERS + 4] = {
	0, 0, 6, 1, 0, 3,  3, 7,  6,  5,  4,  4, 2, 7, 1
};
#else
/* 1st argument passed in rcx, 2nd in rdx, 3rd in r8. */
static SLJIT_CONST sljit_ub reg_map[SLJIT_NO_REGISTERS + 4] = {
	0, 0, 2, 1, 11, 13, 3, 6, 7, 14, 15, 4, 10, 8, 9
};
/* Low map: reg_map & 0x7. */
static SLJIT_CONST sljit_ub reg_lmap[SLJIT_NO_REGISTERS + 4] = {
	0, 0, 2, 1, 3,  5,  3, 6, 7,  6,  7, 4, 2,  0, 1
};
#endif

#define REX_W		0x48
#define REX_R		0x44
#define REX_X		0x42
#define REX_B		0x41
#define REX		0x40

#ifndef _WIN64
#define HALFWORD_MAX 0x7fffffffl
#define HALFWORD_MIN -0x80000000l
#else
#define HALFWORD_MAX 0x7fffffffll
#define HALFWORD_MIN -0x80000000ll
#endif

#define IS_HALFWORD(x)		((x) <= HALFWORD_MAX && (x) >= HALFWORD_MIN)
#define NOT_HALFWORD(x)		((x) > HALFWORD_MAX || (x) < HALFWORD_MIN)
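
/* Most x86-64 instructions accept only sign-extended 32 bit immediates, so
   a constant that fails IS_HALFWORD() has to be materialized first with a
   separate MOV r64, imm64 (see emit_load_imm64 and the BINARY_IMM macro
   later in this file). */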

#define CHECK_EXTRA_REGS(p, w, do)

#endif /* SLJIT_CONFIG_X86_32 */

#if (defined SLJIT_SSE2 && SLJIT_SSE2)
#define TMP_FREG	(0)
#endif

/* Size flags for emit_x86_instruction: */
#define EX86_BIN_INS		0x0010
#define EX86_SHIFT_INS		0x0020
#define EX86_REX		0x0040
#define EX86_NO_REXW		0x0080
#define EX86_BYTE_ARG		0x0100
#define EX86_HALF_ARG		0x0200
#define EX86_PREF_66		0x0400

#if (defined SLJIT_SSE2 && SLJIT_SSE2)
#define EX86_SSE2		0x0800
#define EX86_PREF_F2		0x1000
#define EX86_PREF_F3		0x2000
#endif

/* --------------------------------------------------------------------- */
/*  Instruction forms                                                    */
/* --------------------------------------------------------------------- */

#define ADD		(/* BINARY */ 0 << 3)
#define ADD_EAX_i32	0x05
#define ADD_r_rm	0x03
#define ADD_rm_r	0x01
#define ADDSD_x_xm	0x58
#define ADC		(/* BINARY */ 2 << 3)
#define ADC_EAX_i32	0x15
#define ADC_r_rm	0x13
#define ADC_rm_r	0x11
#define AND		(/* BINARY */ 4 << 3)
#define AND_EAX_i32	0x25
#define AND_r_rm	0x23
#define AND_rm_r	0x21
#define ANDPD_x_xm	0x54
#define BSR_r_rm	(/* GROUP_0F */ 0xbd)
#define CALL_i32	0xe8
#define CALL_rm		(/* GROUP_FF */ 2 << 3)
#define CDQ		0x99
#define CMOVNE_r_rm	(/* GROUP_0F */ 0x45)
#define CMP		(/* BINARY */ 7 << 3)
#define CMP_EAX_i32	0x3d
#define CMP_r_rm	0x3b
#define CMP_rm_r	0x39
#define DIV		(/* GROUP_F7 */ 6 << 3)
#define DIVSD_x_xm	0x5e
#define INT3		0xcc
#define IDIV		(/* GROUP_F7 */ 7 << 3)
#define IMUL		(/* GROUP_F7 */ 5 << 3)
#define IMUL_r_rm	(/* GROUP_0F */ 0xaf)
#define IMUL_r_rm_i8	0x6b
#define IMUL_r_rm_i32	0x69
#define JE_i8		0x74
#define JMP_i8		0xeb
#define JMP_i32		0xe9
#define JMP_rm		(/* GROUP_FF */ 4 << 3)
#define LEA_r_m		0x8d
#define MOV_r_rm	0x8b
#define MOV_r_i32	0xb8
#define MOV_rm_r	0x89
#define MOV_rm_i32	0xc7
#define MOV_rm8_i8	0xc6
#define MOV_rm8_r8	0x88
#define MOVSD_x_xm	0x10
#define MOVSD_xm_x	0x11
#define MOVSXD_r_rm	0x63
#define MOVSX_r_rm8	(/* GROUP_0F */ 0xbe)
#define MOVSX_r_rm16	(/* GROUP_0F */ 0xbf)
#define MOVZX_r_rm8	(/* GROUP_0F */ 0xb6)
#define MOVZX_r_rm16	(/* GROUP_0F */ 0xb7)
#define MUL		(/* GROUP_F7 */ 4 << 3)
#define MULSD_x_xm	0x59
#define NEG_rm		(/* GROUP_F7 */ 3 << 3)
#define NOP		0x90
#define NOT_rm		(/* GROUP_F7 */ 2 << 3)
#define OR		(/* BINARY */ 1 << 3)
#define OR_r_rm		0x0b
#define OR_EAX_i32	0x0d
#define OR_rm_r		0x09
#define OR_rm8_r8	0x08
#define POP_r		0x58
#define POP_rm		0x8f
#define POPF		0x9d
#define PUSH_i32	0x68
#define PUSH_r		0x50
#define PUSH_rm		(/* GROUP_FF */ 6 << 3)
#define PUSHF		0x9c
#define RET_near	0xc3
#define RET_i16		0xc2
#define SBB		(/* BINARY */ 3 << 3)
#define SBB_EAX_i32	0x1d
#define SBB_r_rm	0x1b
#define SBB_rm_r	0x19
#define SAR		(/* SHIFT */ 7 << 3)
#define SHL		(/* SHIFT */ 4 << 3)
#define SHR		(/* SHIFT */ 5 << 3)
#define SUB		(/* BINARY */ 5 << 3)
#define SUB_EAX_i32	0x2d
#define SUB_r_rm	0x2b
#define SUB_rm_r	0x29
#define SUBSD_x_xm	0x5c
#define TEST_EAX_i32	0xa9
#define TEST_rm_r	0x85
#define UCOMISD_x_xm	0x2e
#define XCHG_EAX_r	0x90
#define XCHG_r_rm	0x87
#define XOR		(/* BINARY */ 6 << 3)
#define XOR_EAX_i32	0x35
#define XOR_r_rm	0x33
#define XOR_rm_r	0x31
#define XORPD_x_xm	0x57

#define GROUP_0F	0x0f
#define GROUP_F7	0xf7
#define GROUP_FF	0xff
#define GROUP_BINARY_81	0x81
#define GROUP_BINARY_83	0x83
#define GROUP_SHIFT_1	0xd1
#define GROUP_SHIFT_N	0xc1
#define GROUP_SHIFT_CL	0xd3

#define MOD_REG		0xc0
#define MOD_DISP8	0x40
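
/* ModR/M byte layout: mod (bits 7-6), reg/opcode (bits 5-3), r/m (bits 2-0).
   MOD_REG (0xc0) selects register-direct operands; MOD_DISP8 (0x40) selects
   a memory operand with an 8 bit displacement. */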

#define INC_SIZE(s)			(*inst++ = (s), compiler->size += (s))

#define PUSH_REG(r)			(*inst++ = (PUSH_r + (r)))
#define POP_REG(r)			(*inst++ = (POP_r + (r)))
#define RET()				(*inst++ = (RET_near))
#define RET_I16(n)			(*inst++ = (RET_i16), *inst++ = n, *inst++ = 0)
/* r32, r/m32 */
#define MOV_RM(mod, reg, rm)		(*inst++ = (MOV_r_rm), *inst++ = (mod) << 6 | (reg) << 3 | (rm))
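/* Illustrative use: MOV_RM(MOD_REG, reg_map[dst], reg_map[src]) emits the
   two byte register-to-register form "mov dst, src" (8b /r). */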

/* These static variables are not harmed by multithreading, since they only
   store built-in CPU features. Even if several threads detect the CPU
   features at the same time and overwrite them, they all store the same
   values. */
#if (defined SLJIT_SSE2 && SLJIT_SSE2) && (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
static sljit_si cpu_has_sse2 = -1;
#endif
static sljit_si cpu_has_cmov = -1;

#if defined(_MSC_VER) && _MSC_VER >= 1400
#include <intrin.h>
#endif

static void get_cpu_features(void)
{
	sljit_ui features;

#if defined(_MSC_VER) && _MSC_VER >= 1400

	int CPUInfo[4];
	__cpuid(CPUInfo, 1);
	features = (sljit_ui)CPUInfo[3];

#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C)

	/* AT&T syntax. */
	__asm__ (
		"movl $0x1, %%eax\n"
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		/* On x86-32, there is no red zone, so this
		   should work (no need for a local variable). */
		"push %%ebx\n"
#endif
		"cpuid\n"
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		"pop %%ebx\n"
#endif
		"movl %%edx, %0\n"
		: "=g" (features)
		:
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		: "%eax", "%ecx", "%edx"
#else
		: "%rax", "%rbx", "%rcx", "%rdx"
#endif
	);

#else /* _MSC_VER && _MSC_VER >= 1400 */

	/* Intel syntax. */
	__asm {
		mov eax, 1
		cpuid
		mov features, edx
	}

#endif /* _MSC_VER && _MSC_VER >= 1400 */

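	/* CPUID.01H:EDX feature bits tested below: bit 15 = CMOV, bit 26 = SSE2. */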
#if (defined SLJIT_SSE2 && SLJIT_SSE2) && (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
	cpu_has_sse2 = (features >> 26) & 0x1;
#endif
	cpu_has_cmov = (features >> 15) & 0x1;
}

static sljit_ub get_jump_code(sljit_si type)
{
	switch (type) {
	case SLJIT_C_EQUAL:
	case SLJIT_C_FLOAT_EQUAL:
		return 0x84 /* je */;

	case SLJIT_C_NOT_EQUAL:
	case SLJIT_C_FLOAT_NOT_EQUAL:
		return 0x85 /* jne */;

	case SLJIT_C_LESS:
	case SLJIT_C_FLOAT_LESS:
		return 0x82 /* jc */;

	case SLJIT_C_GREATER_EQUAL:
	case SLJIT_C_FLOAT_GREATER_EQUAL:
		return 0x83 /* jae */;

	case SLJIT_C_GREATER:
	case SLJIT_C_FLOAT_GREATER:
		return 0x87 /* jnbe */;

	case SLJIT_C_LESS_EQUAL:
	case SLJIT_C_FLOAT_LESS_EQUAL:
		return 0x86 /* jbe */;

	case SLJIT_C_SIG_LESS:
		return 0x8c /* jl */;

	case SLJIT_C_SIG_GREATER_EQUAL:
		return 0x8d /* jnl */;

	case SLJIT_C_SIG_GREATER:
		return 0x8f /* jnle */;

	case SLJIT_C_SIG_LESS_EQUAL:
		return 0x8e /* jle */;

	case SLJIT_C_OVERFLOW:
	case SLJIT_C_MUL_OVERFLOW:
		return 0x80 /* jo */;

	case SLJIT_C_NOT_OVERFLOW:
	case SLJIT_C_MUL_NOT_OVERFLOW:
		return 0x81 /* jno */;

	case SLJIT_C_FLOAT_UNORDERED:
		return 0x8a /* jp */;

	case SLJIT_C_FLOAT_ORDERED:
		return 0x8b /* jpo */;
	}
	return 0;
}
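
/* The values above are the second bytes of the two byte "0F 8x rel32"
   conditional jumps. The short "7x rel8" forms use the same condition
   codes, so the rel8 opcode is simply the rel32 one minus 0x10; this is
   relied upon in generate_near_jump_code() below. */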

static sljit_ub* generate_far_jump_code(struct sljit_jump *jump, sljit_ub *code_ptr, sljit_si type);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
static sljit_ub* generate_fixed_jump(sljit_ub *code_ptr, sljit_sw addr, sljit_si type);
#endif

static sljit_ub* generate_near_jump_code(struct sljit_jump *jump, sljit_ub *code_ptr, sljit_ub *code, sljit_si type)
{
	sljit_si short_jump;
	sljit_uw label_addr;

	if (jump->flags & JUMP_LABEL)
		label_addr = (sljit_uw)(code + jump->u.label->size);
	else
		label_addr = jump->u.target;
	short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
		return generate_far_jump_code(jump, code_ptr, type);
#endif

	if (type == SLJIT_JUMP) {
		if (short_jump)
			*code_ptr++ = JMP_i8;
		else
			*code_ptr++ = JMP_i32;
		jump->addr++;
	}
	else if (type >= SLJIT_FAST_CALL) {
		short_jump = 0;
		*code_ptr++ = CALL_i32;
		jump->addr++;
	}
	else if (short_jump) {
		*code_ptr++ = get_jump_code(type) - 0x10;
		jump->addr++;
	}
	else {
		*code_ptr++ = GROUP_0F;
		*code_ptr++ = get_jump_code(type);
		jump->addr += 2;
	}

	if (short_jump) {
		jump->flags |= PATCH_MB;
		code_ptr += sizeof(sljit_sb);
	} else {
		jump->flags |= PATCH_MW;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		code_ptr += sizeof(sljit_sw);
#else
		code_ptr += sizeof(sljit_si);
#endif
	}

	return code_ptr;
}
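
/* After the opcode bytes are emitted, jump->addr is left pointing at the
   displacement field: PATCH_MB marks an 8 bit and PATCH_MW a 32 bit
   (machine word sized on x86-32) displacement, which sljit_generate_code()
   fills in once all label addresses are known. */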

SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
{
	struct sljit_memory_fragment *buf;
	sljit_ub *code;
	sljit_ub *code_ptr;
	sljit_ub *buf_ptr;
	sljit_ub *buf_end;
	sljit_ub len;

	struct sljit_label *label;
	struct sljit_jump *jump;
	struct sljit_const *const_;

	CHECK_ERROR_PTR();
	check_sljit_generate_code(compiler);
	reverse_buf(compiler);

	/* Second code generation pass. */
	code = (sljit_ub*)SLJIT_MALLOC_EXEC(compiler->size);
	PTR_FAIL_WITH_EXEC_IF(code);
	buf = compiler->buf;

	code_ptr = code;
	label = compiler->labels;
	jump = compiler->jumps;
	const_ = compiler->consts;
	do {
		buf_ptr = buf->memory;
		buf_end = buf_ptr + buf->used_size;
		do {
			len = *buf_ptr++;
			if (len > 0) {
				/* The code is already generated. */
				SLJIT_MEMMOVE(code_ptr, buf_ptr, len);
				code_ptr += len;
				buf_ptr += len;
			}
			else {
				if (*buf_ptr >= 4) {
					jump->addr = (sljit_uw)code_ptr;
					if (!(jump->flags & SLJIT_REWRITABLE_JUMP))
						code_ptr = generate_near_jump_code(jump, code_ptr, code, *buf_ptr - 4);
					else
						code_ptr = generate_far_jump_code(jump, code_ptr, *buf_ptr - 4);
					jump = jump->next;
				}
				else if (*buf_ptr == 0) {
					label->addr = (sljit_uw)code_ptr;
					label->size = code_ptr - code;
					label = label->next;
				}
				else if (*buf_ptr == 1) {
					const_->addr = ((sljit_uw)code_ptr) - sizeof(sljit_sw);
					const_ = const_->next;
				}
				else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
					*code_ptr++ = (*buf_ptr == 2) ? CALL_i32 : JMP_i32;
					buf_ptr++;
					*(sljit_sw*)code_ptr = *(sljit_sw*)buf_ptr - ((sljit_sw)code_ptr + sizeof(sljit_sw));
					code_ptr += sizeof(sljit_sw);
					buf_ptr += sizeof(sljit_sw) - 1;
#else
					code_ptr = generate_fixed_jump(code_ptr, *(sljit_sw*)(buf_ptr + 1), *buf_ptr);
					buf_ptr += sizeof(sljit_sw);
#endif
				}
				buf_ptr++;
			}
		} while (buf_ptr < buf_end);
		SLJIT_ASSERT(buf_ptr == buf_end);
		buf = buf->next;
	} while (buf);

	SLJIT_ASSERT(!label);
	SLJIT_ASSERT(!jump);
	SLJIT_ASSERT(!const_);

	jump = compiler->jumps;
	while (jump) {
		if (jump->flags & PATCH_MB) {
			SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sb))) >= -128 && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sb))) <= 127);
			*(sljit_ub*)jump->addr = (sljit_ub)(jump->u.label->addr - (jump->addr + sizeof(sljit_sb)));
		} else if (jump->flags & PATCH_MW) {
			if (jump->flags & JUMP_LABEL) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
				*(sljit_sw*)jump->addr = (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sw)));
#else
				SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_si))) >= HALFWORD_MIN && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_si))) <= HALFWORD_MAX);
				*(sljit_si*)jump->addr = (sljit_si)(jump->u.label->addr - (jump->addr + sizeof(sljit_si)));
#endif
			}
			else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
				*(sljit_sw*)jump->addr = (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_sw)));
#else
				SLJIT_ASSERT((sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_si))) >= HALFWORD_MIN && (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_si))) <= HALFWORD_MAX);
				*(sljit_si*)jump->addr = (sljit_si)(jump->u.target - (jump->addr + sizeof(sljit_si)));
#endif
			}
		}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		else if (jump->flags & PATCH_MD)
			*(sljit_sw*)jump->addr = jump->u.label->addr;
#endif

		jump = jump->next;
	}

	/* Some space may be wasted because of short jumps. */
	SLJIT_ASSERT(code_ptr <= code + compiler->size);
	compiler->error = SLJIT_ERR_COMPILED;
	compiler->executable_size = code_ptr - code;
	return (void*)code;
}

/* --------------------------------------------------------------------- */
/*  Operators                                                            */
/* --------------------------------------------------------------------- */

static sljit_si emit_cum_binary(struct sljit_compiler *compiler,
	sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
	sljit_si dst, sljit_sw dstw,
	sljit_si src1, sljit_sw src1w,
	sljit_si src2, sljit_sw src2w);

static sljit_si emit_non_cum_binary(struct sljit_compiler *compiler,
	sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
	sljit_si dst, sljit_sw dstw,
	sljit_si src1, sljit_sw src1w,
	sljit_si src2, sljit_sw src2w);

static sljit_si emit_mov(struct sljit_compiler *compiler,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw);

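/* A note on the save/restore pair below: LEA first moves the stack pointer
   up by one word, then PUSHF moves it back down while writing the flags, so
   the stack pointer is unchanged as a whole and the flags land in the word
   at the stack top, a slot the generated code is expected to keep free.
   emit_restore_flags() performs the mirrored POPF/LEA sequence. */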
static SLJIT_INLINE sljit_si emit_save_flags(struct sljit_compiler *compiler)
{
	sljit_ub *inst;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
	FAIL_IF(!inst);
	INC_SIZE(5);
#else
	inst = (sljit_ub*)ensure_buf(compiler, 1 + 6);
	FAIL_IF(!inst);
	INC_SIZE(6);
	*inst++ = REX_W;
#endif
	*inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp + sizeof(sljit_sw)] */
	*inst++ = 0x64;
	*inst++ = 0x24;
	*inst++ = (sljit_ub)sizeof(sljit_sw);
	*inst++ = PUSHF;
	compiler->flags_saved = 1;
	return SLJIT_SUCCESS;
}

static SLJIT_INLINE sljit_si emit_restore_flags(struct sljit_compiler *compiler, sljit_si keep_flags)
{
	sljit_ub *inst;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
	FAIL_IF(!inst);
	INC_SIZE(5);
	*inst++ = POPF;
#else
	inst = (sljit_ub*)ensure_buf(compiler, 1 + 6);
	FAIL_IF(!inst);
	INC_SIZE(6);
	*inst++ = POPF;
	*inst++ = REX_W;
#endif
	*inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp - sizeof(sljit_sw)] */
	*inst++ = 0x64;
	*inst++ = 0x24;
	*inst++ = (sljit_ub)-(sljit_sb)sizeof(sljit_sw);
	compiler->flags_saved = keep_flags;
	return SLJIT_SUCCESS;
}

#ifdef _WIN32
#include <malloc.h>

static void SLJIT_CALL sljit_grow_stack(sljit_sw local_size)
{
	/* Workaround for calling the internal _chkstk() function on Windows.
	This function touches all 4K pages belonging to the requested stack
	space, whose size is passed in local_size. This is necessary on Windows,
	where the stack can only grow in 4K steps. If the stack is already large
	enough, the call just burns CPU cycles; since that cannot be known in
	advance, it must always be made. The volatile store keeps the compiler
	from optimizing the alloca() away, which is what triggers the probing.
	I think this is a bad design in general, even if it has some reasons. */
	*(volatile sljit_si*)alloca(local_size) = 0;
}

#endif

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
#include "sljitNativeX86_32.c"
#else
#include "sljitNativeX86_64.c"
#endif

static sljit_si emit_mov(struct sljit_compiler *compiler,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_ub* inst;

	if (dst == SLJIT_UNUSED) {
		/* No destination: no need to set up the flags. */
		if (src & SLJIT_MEM) {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
			FAIL_IF(!inst);
			*inst = MOV_r_rm;
		}
		return SLJIT_SUCCESS;
	}
	if (FAST_IS_REG(src)) {
		inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm_r;
		return SLJIT_SUCCESS;
	}
	if (src & SLJIT_IMM) {
		if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
			if (!compiler->mode32) {
				if (NOT_HALFWORD(srcw))
					return emit_load_imm64(compiler, dst, srcw);
			}
			else
				return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, MOV_r_i32 + reg_lmap[dst], srcw);
#endif
		}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, srcw));
			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = MOV_rm_r;
			return SLJIT_SUCCESS;
		}
#endif
		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm_i32;
		return SLJIT_SUCCESS;
	}
	if (FAST_IS_REG(dst)) {
		inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
		FAIL_IF(!inst);
		*inst = MOV_r_rm;
		return SLJIT_SUCCESS;
	}

	/* Memory to memory move. Requires two instructions. */
	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
	FAIL_IF(!inst);
	*inst = MOV_r_rm;
	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
	FAIL_IF(!inst);
	*inst = MOV_rm_r;
	return SLJIT_SUCCESS;
}

#define EMIT_MOV(compiler, dst, dstw, src, srcw) \
	FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));

SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler, sljit_si op)
{
	sljit_ub *inst;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	sljit_si size;
#endif

	CHECK_ERROR();
	check_sljit_emit_op0(compiler, op);

	switch (GET_OPCODE(op)) {
	case SLJIT_BREAKPOINT:
		inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
		FAIL_IF(!inst);
		INC_SIZE(1);
		*inst = INT3;
		break;
	case SLJIT_NOP:
		inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
		FAIL_IF(!inst);
		INC_SIZE(1);
		*inst = NOP;
		break;
	case SLJIT_UMUL:
	case SLJIT_SMUL:
	case SLJIT_UDIV:
	case SLJIT_SDIV:
		compiler->flags_saved = 0;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
#ifdef _WIN64
		SLJIT_COMPILE_ASSERT(
			reg_map[SLJIT_SCRATCH_REG1] == 0
			&& reg_map[SLJIT_SCRATCH_REG2] == 2
			&& reg_map[TMP_REG1] > 7,
			invalid_register_assignment_for_div_mul);
#else
		SLJIT_COMPILE_ASSERT(
			reg_map[SLJIT_SCRATCH_REG1] == 0
			&& reg_map[SLJIT_SCRATCH_REG2] < 7
			&& reg_map[TMP_REG1] == 2,
			invalid_register_assignment_for_div_mul);
#endif
		compiler->mode32 = op & SLJIT_INT_OP;
#endif

		op = GET_OPCODE(op);
		if (op == SLJIT_UDIV) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_SCRATCH_REG2, 0);
			inst = emit_x86_instruction(compiler, 1, SLJIT_SCRATCH_REG2, 0, SLJIT_SCRATCH_REG2, 0);
#else
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
#endif
			FAIL_IF(!inst);
			*inst = XOR_r_rm;
		}

		if (op == SLJIT_SDIV) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_SCRATCH_REG2, 0);
#endif

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
			FAIL_IF(!inst);
			INC_SIZE(1);
			*inst = CDQ;
#else
			if (compiler->mode32) {
				inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
				FAIL_IF(!inst);
				INC_SIZE(1);
				*inst = CDQ;
			} else {
				inst = (sljit_ub*)ensure_buf(compiler, 1 + 2);
				FAIL_IF(!inst);
				INC_SIZE(2);
				*inst++ = REX_W;
				*inst = CDQ;
			}
#endif
		}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		inst = (sljit_ub*)ensure_buf(compiler, 1 + 2);
		FAIL_IF(!inst);
		INC_SIZE(2);
		*inst++ = GROUP_F7;
		*inst = MOD_REG | ((op >= SLJIT_UDIV) ? reg_map[TMP_REG1] : reg_map[SLJIT_SCRATCH_REG2]);
#else
#ifdef _WIN64
		size = (!compiler->mode32 || op >= SLJIT_UDIV) ? 3 : 2;
#else
		size = (!compiler->mode32) ? 3 : 2;
#endif
		inst = (sljit_ub*)ensure_buf(compiler, 1 + size);
		FAIL_IF(!inst);
		INC_SIZE(size);
#ifdef _WIN64
		if (!compiler->mode32)
			*inst++ = REX_W | ((op >= SLJIT_UDIV) ? REX_B : 0);
		else if (op >= SLJIT_UDIV)
			*inst++ = REX_B;
		*inst++ = GROUP_F7;
		*inst = MOD_REG | ((op >= SLJIT_UDIV) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_SCRATCH_REG2]);
#else
		if (!compiler->mode32)
			*inst++ = REX_W;
		*inst++ = GROUP_F7;
		*inst = MOD_REG | reg_map[SLJIT_SCRATCH_REG2];
#endif
#endif
		switch (op) {
		case SLJIT_UMUL:
			*inst |= MUL;
			break;
		case SLJIT_SMUL:
			*inst |= IMUL;
			break;
		case SLJIT_UDIV:
			*inst |= DIV;
			break;
		case SLJIT_SDIV:
			*inst |= IDIV;
			break;
		}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
		EMIT_MOV(compiler, SLJIT_SCRATCH_REG2, 0, TMP_REG1, 0);
#endif
		break;
	}

	return SLJIT_SUCCESS;
}

#define ENCODE_PREFIX(prefix) \
	do { \
		inst = (sljit_ub*)ensure_buf(compiler, 1 + 1); \
		FAIL_IF(!inst); \
		INC_SIZE(1); \
		*inst = (prefix); \
	} while (0)

static sljit_si emit_mov_byte(struct sljit_compiler *compiler, sljit_si sign,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_ub* inst;
	sljit_si dst_r;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	sljit_si work_r;
#endif

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 0;
#endif

	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
		return SLJIT_SUCCESS; /* Empty instruction. */

	if (src & SLJIT_IMM) {
		if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
			FAIL_IF(!inst);
			*inst = MOV_rm_i32;
			return SLJIT_SUCCESS;
#endif
		}
		inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm8_i8;
		return SLJIT_SUCCESS;
	}

	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

	if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (reg_map[src] >= 4) {
			SLJIT_ASSERT(dst_r == TMP_REG1);
			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
		} else
			dst_r = src;
#else
		dst_r = src;
#endif
	}
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	else if (FAST_IS_REG(src) && reg_map[src] >= 4) {
		/* Both src and dst are registers. */
		SLJIT_ASSERT(SLOW_IS_REG(dst));
		if (reg_map[dst] < 4) {
			if (dst != src)
				EMIT_MOV(compiler, dst, 0, src, 0);
			inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
			FAIL_IF(!inst);
			*inst++ = GROUP_0F;
			*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
		}
		else {
			if (dst != src)
				EMIT_MOV(compiler, dst, 0, src, 0);
			if (sign) {
				/* shl reg, 24 */
				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
				FAIL_IF(!inst);
				*inst |= SHL;
				/* sar reg, 24 */
				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
				FAIL_IF(!inst);
				*inst |= SAR;
			}
			else {
				inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
				FAIL_IF(!inst);
				*(inst + 1) |= AND;
			}
		}
		return SLJIT_SUCCESS;
	}
#endif
	else {
		/* src is either a memory address or, on x86-32, a register with
		   reg_map[src] < 4. */
		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
		FAIL_IF(!inst);
		*inst++ = GROUP_0F;
		*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
	}

	if (dst & SLJIT_MEM) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (dst_r == TMP_REG1) {
			/* Find an unused register whose reg_map value is < 4. */
			if ((dst & REG_MASK) == SLJIT_SCRATCH_REG1) {
				if ((dst & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_SCRATCH_REG2))
					work_r = SLJIT_SCRATCH_REG3;
				else
					work_r = SLJIT_SCRATCH_REG2;
			}
			else {
				if ((dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_SCRATCH_REG1))
					work_r = SLJIT_SCRATCH_REG1;
				else if ((dst & REG_MASK) == SLJIT_SCRATCH_REG2)
					work_r = SLJIT_SCRATCH_REG3;
				else
					work_r = SLJIT_SCRATCH_REG2;
			}

			if (work_r == SLJIT_SCRATCH_REG1) {
				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
			}
			else {
				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
				FAIL_IF(!inst);
				*inst = XCHG_r_rm;
			}

			inst = emit_x86_instruction(compiler, 1, work_r, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = MOV_rm8_r8;

			if (work_r == SLJIT_SCRATCH_REG1) {
				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
			}
			else {
				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
				FAIL_IF(!inst);
				*inst = XCHG_r_rm;
			}
		}
		else {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = MOV_rm8_r8;
		}
#else
		inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm8_r8;
#endif
	}

	return SLJIT_SUCCESS;
}

static sljit_si emit_mov_half(struct sljit_compiler *compiler, sljit_si sign,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_ub* inst;
	sljit_si dst_r;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 0;
#endif

	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
		return SLJIT_SUCCESS; /* Empty instruction. */

	if (src & SLJIT_IMM) {
		if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
			FAIL_IF(!inst);
			*inst = MOV_rm_i32;
			return SLJIT_SUCCESS;
#endif
		}
		inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm_i32;
		return SLJIT_SUCCESS;
	}

	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

	if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
		dst_r = src;
	else {
		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
		FAIL_IF(!inst);
		*inst++ = GROUP_0F;
		*inst = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
	}

	if (dst & SLJIT_MEM) {
		inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm_r;
	}

	return SLJIT_SUCCESS;
}

static sljit_si emit_unary(struct sljit_compiler *compiler, sljit_ub opcode,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_ub* inst;

	if (dst == SLJIT_UNUSED) {
		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= opcode;
		return SLJIT_SUCCESS;
	}
	if (dst == src && dstw == srcw) {
		/* Same input and output. */
		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= opcode;
		return SLJIT_SUCCESS;
	}
	if (FAST_IS_REG(dst)) {
		EMIT_MOV(compiler, dst, 0, src, srcw);
		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= opcode;
		return SLJIT_SUCCESS;
	}
	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
	FAIL_IF(!inst);
	*inst++ = GROUP_F7;
	*inst |= opcode;
	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
	return SLJIT_SUCCESS;
}

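/* NOT itself does not update the flags, so when the zero flag is requested
   an extra "or reg, reg" on the result is emitted afterwards to set it. */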
static sljit_si emit_not_with_flags(struct sljit_compiler *compiler,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_ub* inst;

	if (dst == SLJIT_UNUSED) {
		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= NOT_rm;
		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst = OR_r_rm;
		return SLJIT_SUCCESS;
	}
	if (FAST_IS_REG(dst)) {
		EMIT_MOV(compiler, dst, 0, src, srcw);
		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= NOT_rm;
		inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
		FAIL_IF(!inst);
		*inst = OR_r_rm;
		return SLJIT_SUCCESS;
	}
	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
	FAIL_IF(!inst);
	*inst++ = GROUP_F7;
	*inst |= NOT_rm;
	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
	FAIL_IF(!inst);
	*inst = OR_r_rm;
	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
	return SLJIT_SUCCESS;
}

static sljit_si emit_clz(struct sljit_compiler *compiler, sljit_si op_flags,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_ub* inst;
	sljit_si dst_r;

	SLJIT_UNUSED_ARG(op_flags);
	if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
		/* Just set the zero flag. */
		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= NOT_rm;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 31, TMP_REG1, 0);
#else
		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 63 : 31, TMP_REG1, 0);
#endif
		FAIL_IF(!inst);
		*inst |= SHR;
		return SLJIT_SUCCESS;
	}

	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
		src = TMP_REG1;
		srcw = 0;
	}

	inst = emit_x86_instruction(compiler, 2, TMP_REG1, 0, src, srcw);
	FAIL_IF(!inst);
	*inst++ = GROUP_0F;
	*inst = BSR_r_rm;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	if (FAST_IS_REG(dst))
		dst_r = dst;
	else {
		/* Find an unused temporary register. */
		if ((dst & REG_MASK) != SLJIT_SCRATCH_REG1 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_SCRATCH_REG1))
			dst_r = SLJIT_SCRATCH_REG1;
		else if ((dst & REG_MASK) != SLJIT_SCRATCH_REG2 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_SCRATCH_REG2))
			dst_r = SLJIT_SCRATCH_REG2;
		else
			dst_r = SLJIT_SCRATCH_REG3;
		EMIT_MOV(compiler, dst, dstw, dst_r, 0);
	}
	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, 32 + 31);
#else
	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
	compiler->mode32 = 0;
	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 64 + 63 : 32 + 31);
	compiler->mode32 = op_flags & SLJIT_INT_OP;
#endif

	if (cpu_has_cmov == -1)
		get_cpu_features();

	if (cpu_has_cmov) {
		inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst++ = GROUP_0F;
		*inst = CMOVNE_r_rm;
	} else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
		FAIL_IF(!inst);
		INC_SIZE(4);

		*inst++ = JE_i8;
		*inst++ = 2;
		*inst++ = MOV_r_rm;
		*inst++ = MOD_REG | (reg_map[dst_r] << 3) | reg_map[TMP_REG1];
#else
		inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
		FAIL_IF(!inst);
		INC_SIZE(5);

		*inst++ = JE_i8;
		*inst++ = 3;
		*inst++ = REX_W | (reg_map[dst_r] >= 8 ? REX_R : 0) | (reg_map[TMP_REG1] >= 8 ? REX_B : 0);
		*inst++ = MOV_r_rm;
		*inst++ = MOD_REG | (reg_lmap[dst_r] << 3) | reg_lmap[TMP_REG1];
#endif
	}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
#else
	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 63 : 31, dst_r, 0);
#endif
	FAIL_IF(!inst);
	*(inst + 1) |= XOR;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	if (dst & SLJIT_MEM) {
		inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst = XCHG_r_rm;
	}
#else
	if (dst & SLJIT_MEM)
		EMIT_MOV(compiler, dst, dstw, TMP_REG2, 0);
#endif
	return SLJIT_SUCCESS;
}
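
/* A sketch of the computation above: BSR returns the index of the highest
   set bit, and "xor reg, 31" (63 in 64 bit mode) turns that index into
   31 - index, i.e. the leading zero count. BSR leaves its destination
   undefined for a zero input, so the destination is preloaded with 32 + 31
   (64 + 63) and only overwritten via CMOVNE (or the JE-guarded MOV
   fallback), which makes the final result 32 (64) for a zero input. */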

SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op1(struct sljit_compiler *compiler, sljit_si op,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_ub* inst;
	sljit_si update = 0;
	sljit_si op_flags = GET_ALL_FLAGS(op);
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	sljit_si dst_is_ereg = 0;
	sljit_si src_is_ereg = 0;
#else
#	define src_is_ereg 0
#endif

	CHECK_ERROR();
	check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw);
	ADJUST_LOCAL_OFFSET(dst, dstw);
	ADJUST_LOCAL_OFFSET(src, srcw);

	CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
	CHECK_EXTRA_REGS(src, srcw, src_is_ereg = 1);
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = op_flags & SLJIT_INT_OP;
#endif

	op = GET_OPCODE(op);
	if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		compiler->mode32 = 0;
#endif

		if (op_flags & SLJIT_INT_OP) {
			if (FAST_IS_REG(src) && src == dst) {
				if (!TYPE_CAST_NEEDED(op))
					return SLJIT_SUCCESS;
			}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			if (op == SLJIT_MOV_SI && (src & SLJIT_MEM))
				op = SLJIT_MOV_UI;
			if (op == SLJIT_MOVU_SI && (src & SLJIT_MEM))
				op = SLJIT_MOVU_UI;
			if (op == SLJIT_MOV_UI && (src & SLJIT_IMM))
				op = SLJIT_MOV_SI;
			if (op == SLJIT_MOVU_UI && (src & SLJIT_IMM))
				op = SLJIT_MOVU_SI;
#endif
		}

		SLJIT_COMPILE_ASSERT(SLJIT_MOV + 8 == SLJIT_MOVU, movu_offset);
		if (op >= SLJIT_MOVU) {
			update = 1;
			op -= 8;
		}

		if (src & SLJIT_IMM) {
			switch (op) {
			case SLJIT_MOV_UB:
				srcw = (sljit_ub)srcw;
				break;
			case SLJIT_MOV_SB:
				srcw = (sljit_sb)srcw;
				break;
			case SLJIT_MOV_UH:
				srcw = (sljit_uh)srcw;
				break;
			case SLJIT_MOV_SH:
				srcw = (sljit_sh)srcw;
				break;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			case SLJIT_MOV_UI:
				srcw = (sljit_ui)srcw;
				break;
			case SLJIT_MOV_SI:
				srcw = (sljit_si)srcw;
				break;
#endif
			}
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			if (SLJIT_UNLIKELY(dst_is_ereg))
				return emit_mov(compiler, dst, dstw, src, srcw);
#endif
		}

		if (SLJIT_UNLIKELY(update) && (src & SLJIT_MEM) && !src_is_ereg && (src & REG_MASK) && (srcw != 0 || (src & OFFS_REG_MASK) != 0)) {
			inst = emit_x86_instruction(compiler, 1, src & REG_MASK, 0, src, srcw);
			FAIL_IF(!inst);
			*inst = LEA_r_m;
			src &= SLJIT_MEM | 0xf;
			srcw = 0;
		}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_UI || op == SLJIT_MOV_SI || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
			SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_LOCALS_REG));
			dst = TMP_REG1;
		}
#endif

		switch (op) {
		case SLJIT_MOV:
		case SLJIT_MOV_P:
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		case SLJIT_MOV_UI:
		case SLJIT_MOV_SI:
#endif
			FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_UB:
			FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_SB:
			FAIL_IF(emit_mov_byte(compiler, 1, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_UH:
			FAIL_IF(emit_mov_half(compiler, 0, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_SH:
			FAIL_IF(emit_mov_half(compiler, 1, dst, dstw, src, srcw));
			break;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		case SLJIT_MOV_UI:
			FAIL_IF(emit_mov_int(compiler, 0, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_SI:
			FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
			break;
#endif
		}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
			return emit_mov(compiler, SLJIT_MEM1(SLJIT_LOCALS_REG), dstw, TMP_REG1, 0);
#endif

		if (SLJIT_UNLIKELY(update) && (dst & SLJIT_MEM) && (dst & REG_MASK) && (dstw != 0 || (dst & OFFS_REG_MASK) != 0)) {
			inst = emit_x86_instruction(compiler, 1, dst & REG_MASK, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = LEA_r_m;
		}
		return SLJIT_SUCCESS;
	}

	if (SLJIT_UNLIKELY(GET_FLAGS(op_flags)))
		compiler->flags_saved = 0;

	switch (op) {
	case SLJIT_NOT:
		if (SLJIT_UNLIKELY(op_flags & SLJIT_SET_E))
			return emit_not_with_flags(compiler, dst, dstw, src, srcw);
		return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);

	case SLJIT_NEG:
		if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
			FAIL_IF(emit_save_flags(compiler));
		return emit_unary(compiler, NEG_rm, dst, dstw, src, srcw);

	case SLJIT_CLZ:
		if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
			FAIL_IF(emit_save_flags(compiler));
		return emit_clz(compiler, op_flags, dst, dstw, src, srcw);
	}

	return SLJIT_SUCCESS;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
#	undef src_is_ereg
#endif
}

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)

#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
	if (IS_HALFWORD(immw) || compiler->mode32) { \
		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
		FAIL_IF(!inst); \
		*(inst + 1) |= (op_imm); \
	} \
	else { \
		FAIL_IF(emit_load_imm64(compiler, TMP_REG2, immw)); \
		inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, arg, argw); \
		FAIL_IF(!inst); \
		*inst = (op_mr); \
	}

#define BINARY_EAX_IMM(op_eax_imm, immw) \
	FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))

#else

#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
	FAIL_IF(!inst); \
	*(inst + 1) |= (op_imm);

#define BINARY_EAX_IMM(op_eax_imm, immw) \
	FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))

#endif
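
/* On x86-64, BINARY_IMM falls back to loading the constant into TMP_REG2
   because the 0x81 instruction group only accepts sign-extended 32 bit
   immediates (see IS_HALFWORD above). */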

static sljit_si emit_cum_binary(struct sljit_compiler *compiler,
	sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
	sljit_si dst, sljit_sw dstw,
	sljit_si src1, sljit_sw src1w,
	sljit_si src2, sljit_sw src2w)
{
	sljit_ub* inst;

	if (dst == SLJIT_UNUSED) {
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		return SLJIT_SUCCESS;
	}

	if (dst == src1 && dstw == src1w) {
		if (src2 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			if ((dst == SLJIT_SCRATCH_REG1) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
#else
			if ((dst == SLJIT_SCRATCH_REG1) && (src2w > 127 || src2w < -128)) {
#endif
				BINARY_EAX_IMM(op_eax_imm, src2w);
			}
			else {
				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
			}
		}
		else if (FAST_IS_REG(dst)) {
			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		else if (FAST_IS_REG(src2)) {
			/* Special exception for sljit_emit_op_flags. */
			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		else {
			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		return SLJIT_SUCCESS;
	}

	/* Only for cumulative operations. */
	if (dst == src2 && dstw == src2w) {
		if (src1 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			if ((dst == SLJIT_SCRATCH_REG1) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
#else
			if ((dst == SLJIT_SCRATCH_REG1) && (src1w > 127 || src1w < -128)) {
#endif
				BINARY_EAX_IMM(op_eax_imm, src1w);
			}
			else {
				BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
			}
		}
		else if (FAST_IS_REG(dst)) {
			inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		else if (FAST_IS_REG(src1)) {
			inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		else {
			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		return SLJIT_SUCCESS;
	}

	/* General version. */
	if (FAST_IS_REG(dst)) {
		EMIT_MOV(compiler, dst, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
	}
	else {
		/* This version requires fewer memory writes. */
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
	}

	return SLJIT_SUCCESS;
}

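/* Same structure as emit_cum_binary() above, but without the dst == src2
   shortcut: SUB and SBB do not commute, so that case has to go through a
   temporary register. */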
static sljit_si emit_non_cum_binary(struct sljit_compiler *compiler,
	sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
	sljit_si dst, sljit_sw dstw,
	sljit_si src1, sljit_sw src1w,
	sljit_si src2, sljit_sw src2w)
{
	sljit_ub* inst;

	if (dst == SLJIT_UNUSED) {
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		return SLJIT_SUCCESS;
	}

	if (dst == src1 && dstw == src1w) {
		if (src2 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			if ((dst == SLJIT_SCRATCH_REG1) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
#else
			if ((dst == SLJIT_SCRATCH_REG1) && (src2w > 127 || src2w < -128)) {
#endif
				BINARY_EAX_IMM(op_eax_imm, src2w);
			}
			else {
				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
			}
		}
		else if (FAST_IS_REG(dst)) {
			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		else if (FAST_IS_REG(src2)) {
			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		else {
			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		return SLJIT_SUCCESS;
	}

	/* General version. */
	if (FAST_IS_REG(dst) && dst != src2) {
		EMIT_MOV(compiler, dst, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
	}
	else {
		/* This version requires fewer memory writes. */
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
	}

	return SLJIT_SUCCESS;
}
1648 
1649 static sljit_si emit_mul(struct sljit_compiler *compiler,
1650 	sljit_si dst, sljit_sw dstw,
1651 	sljit_si src1, sljit_sw src1w,
1652 	sljit_si src2, sljit_sw src2w)
1653 {
1654 	sljit_ub* inst;
1655 	sljit_si dst_r;
1656 
1657 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1658 
1659 	/* Register destination. */
1660 	if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
1661 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1662 		FAIL_IF(!inst);
1663 		*inst++ = GROUP_0F;
1664 		*inst = IMUL_r_rm;
1665 	}
1666 	else if (dst_r == src2 && !(src1 & SLJIT_IMM)) {
1667 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
1668 		FAIL_IF(!inst);
1669 		*inst++ = GROUP_0F;
1670 		*inst = IMUL_r_rm;
1671 	}
1672 	else if (src1 & SLJIT_IMM) {
1673 		if (src2 & SLJIT_IMM) {
1674 			EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
1675 			src2 = dst_r;
1676 			src2w = 0;
1677 		}
1678 
1679 		if (src1w <= 127 && src1w >= -128) {
1680 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1681 			FAIL_IF(!inst);
1682 			*inst = IMUL_r_rm_i8;
1683 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
1684 			FAIL_IF(!inst);
1685 			INC_SIZE(1);
1686 			*inst = (sljit_sb)src1w;
1687 		}
1688 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1689 		else {
1690 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1691 			FAIL_IF(!inst);
1692 			*inst = IMUL_r_rm_i32;
1693 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
1694 			FAIL_IF(!inst);
1695 			INC_SIZE(4);
1696 			*(sljit_sw*)inst = src1w;
1697 		}
1698 #else
1699 		else if (IS_HALFWORD(src1w)) {
1700 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1701 			FAIL_IF(!inst);
1702 			*inst = IMUL_r_rm_i32;
1703 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
1704 			FAIL_IF(!inst);
1705 			INC_SIZE(4);
1706 			*(sljit_si*)inst = (sljit_si)src1w;
1707 		}
1708 		else {
1709 			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
1710 			if (dst_r != src2)
1711 				EMIT_MOV(compiler, dst_r, 0, src2, src2w);
1712 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1713 			FAIL_IF(!inst);
1714 			*inst++ = GROUP_0F;
1715 			*inst = IMUL_r_rm;
1716 		}
1717 #endif
1718 	}
1719 	else if (src2 & SLJIT_IMM) {
1720 		/* Note: src1 is NOT immediate. */
1721 
1722 		if (src2w <= 127 && src2w >= -128) {
1723 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1724 			FAIL_IF(!inst);
1725 			*inst = IMUL_r_rm_i8;
1726 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
1727 			FAIL_IF(!inst);
1728 			INC_SIZE(1);
1729 			*inst = (sljit_sb)src2w;
1730 		}
1731 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1732 		else {
1733 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1734 			FAIL_IF(!inst);
1735 			*inst = IMUL_r_rm_i32;
1736 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
1737 			FAIL_IF(!inst);
1738 			INC_SIZE(4);
1739 			*(sljit_sw*)inst = src2w;
1740 		}
1741 #else
1742 		else if (IS_HALFWORD(src2w)) {
1743 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1744 			FAIL_IF(!inst);
1745 			*inst = IMUL_r_rm_i32;
1746 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
1747 			FAIL_IF(!inst);
1748 			INC_SIZE(4);
1749 			*(sljit_si*)inst = (sljit_si)src2w;
1750 		}
1751 		else {
1752 			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
1753 			if (dst_r != src1)
1754 				EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1755 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1756 			FAIL_IF(!inst);
1757 			*inst++ = GROUP_0F;
1758 			*inst = IMUL_r_rm;
1759 		}
1760 #endif
1761 	}
1762 	else {
1763 		/* Neither argument is immediate. */
1764 		if (ADDRESSING_DEPENDS_ON(src2, dst_r))
1765 			dst_r = TMP_REG1;
1766 		EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1767 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1768 		FAIL_IF(!inst);
1769 		*inst++ = GROUP_0F;
1770 		*inst = IMUL_r_rm;
1771 	}
1772 
1773 	if (dst_r == TMP_REG1)
1774 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1775 
1776 	return SLJIT_SUCCESS;
1777 }
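
/* For reference, the three IMUL encodings selected above are 0F AF /r
   (reg, r/m), 6B /r ib (reg, r/m, imm8) and 69 /r id (reg, r/m, imm32);
   the imm8 form is chosen whenever the constant fits in a signed byte. */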
1778 
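/* emit_lea_binary() maps a flag-free ADD onto LEA, which never modifies
   EFLAGS.  An illustrative mapping (operand names are schematic):

       add dst, src1, src2   ->   lea dst, [src1 + src2]
       add dst, src1, #imm   ->   lea dst, [src1 + imm]

   When the flags need not be kept, the dst == src cases are rejected up
   front, since the normal ADD path handles them at least as well; the
   caller falls back on SLJIT_ERR_UNSUPPORTED. */
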
1779 static sljit_si emit_lea_binary(struct sljit_compiler *compiler, sljit_si keep_flags,
1780 	sljit_si dst, sljit_sw dstw,
1781 	sljit_si src1, sljit_sw src1w,
1782 	sljit_si src2, sljit_sw src2w)
1783 {
1784 	sljit_ub* inst;
1785 	sljit_si dst_r, done = 0;
1786 
1787 	/* These cases are better left to be handled the normal way. */
1788 	if (!keep_flags) {
1789 		if (dst == src1 && dstw == src1w)
1790 			return SLJIT_ERR_UNSUPPORTED;
1791 		if (dst == src2 && dstw == src2w)
1792 			return SLJIT_ERR_UNSUPPORTED;
1793 	}
1794 
1795 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1796 
1797 	if (FAST_IS_REG(src1)) {
1798 		if (FAST_IS_REG(src2)) {
1799 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
1800 			FAIL_IF(!inst);
1801 			*inst = LEA_r_m;
1802 			done = 1;
1803 		}
1804 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1805 		if ((src2 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1806 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_si)src2w);
1807 #else
1808 		if (src2 & SLJIT_IMM) {
1809 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
1810 #endif
1811 			FAIL_IF(!inst);
1812 			*inst = LEA_r_m;
1813 			done = 1;
1814 		}
1815 	}
1816 	else if (FAST_IS_REG(src2)) {
1817 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1818 		if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1819 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_si)src1w);
1820 #else
1821 		if (src1 & SLJIT_IMM) {
1822 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
1823 #endif
1824 			FAIL_IF(!inst);
1825 			*inst = LEA_r_m;
1826 			done = 1;
1827 		}
1828 	}
1829 
1830 	if (done) {
1831 		if (dst_r == TMP_REG1)
1832 			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
1833 		return SLJIT_SUCCESS;
1834 	}
1835 	return SLJIT_ERR_UNSUPPORTED;
1836 }
1837 
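/* CMP with EAX/RAX and a 32-bit immediate has a dedicated one-byte-shorter
   encoding (3D id instead of 81 /7 id), hence the BINARY_EAX_IMM shortcut
   below.  Byte sketch on x86-32:

       cmp eax, 1000   ->   3D E8 03 00 00       (5 bytes)
       cmp ebx, 1000   ->   81 FB E8 03 00 00    (6 bytes) */
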
1838 static sljit_si emit_cmp_binary(struct sljit_compiler *compiler,
1839 	sljit_si src1, sljit_sw src1w,
1840 	sljit_si src2, sljit_sw src2w)
1841 {
1842 	sljit_ub* inst;
1843 
1844 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1845 	if (src1 == SLJIT_SCRATCH_REG1 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1846 #else
1847 	if (src1 == SLJIT_SCRATCH_REG1 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
1848 #endif
1849 		BINARY_EAX_IMM(CMP_EAX_i32, src2w);
1850 		return SLJIT_SUCCESS;
1851 	}
1852 
1853 	if (FAST_IS_REG(src1)) {
1854 		if (src2 & SLJIT_IMM) {
1855 			BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
1856 		}
1857 		else {
1858 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
1859 			FAIL_IF(!inst);
1860 			*inst = CMP_r_rm;
1861 		}
1862 		return SLJIT_SUCCESS;
1863 	}
1864 
1865 	if (FAST_IS_REG(src2) && !(src1 & SLJIT_IMM)) {
1866 		inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
1867 		FAIL_IF(!inst);
1868 		*inst = CMP_rm_r;
1869 		return SLJIT_SUCCESS;
1870 	}
1871 
1872 	if (src2 & SLJIT_IMM) {
1873 		if (src1 & SLJIT_IMM) {
1874 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1875 			src1 = TMP_REG1;
1876 			src1w = 0;
1877 		}
1878 		BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
1879 	}
1880 	else {
1881 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1882 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1883 		FAIL_IF(!inst);
1884 		*inst = CMP_r_rm;
1885 	}
1886 	return SLJIT_SUCCESS;
1887 }
1888 
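/* TEST follows the same pattern: A9 id for the EAX form versus F7 /0 id
   (GROUP_F7) otherwise, hence the analogous BINARY_EAX_IMM shortcuts
   below. */
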
1889 static sljit_si emit_test_binary(struct sljit_compiler *compiler,
1890 	sljit_si src1, sljit_sw src1w,
1891 	sljit_si src2, sljit_sw src2w)
1892 {
1893 	sljit_ub* inst;
1894 
1895 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1896 	if (src1 == SLJIT_SCRATCH_REG1 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1897 #else
1898 	if (src1 == SLJIT_SCRATCH_REG1 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
1899 #endif
1900 		BINARY_EAX_IMM(TEST_EAX_i32, src2w);
1901 		return SLJIT_SUCCESS;
1902 	}
1903 
1904 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1905 	if (src2 == SLJIT_SCRATCH_REG1 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1906 #else
1907 	if (src2 == SLJIT_SCRATCH_REG1 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128)) {
1908 #endif
1909 		BINARY_EAX_IMM(TEST_EAX_i32, src1w);
1910 		return SLJIT_SUCCESS;
1911 	}
1912 
1913 	if (FAST_IS_REG(src1)) {
1914 		if (src2 & SLJIT_IMM) {
1915 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1916 			if (IS_HALFWORD(src2w) || compiler->mode32) {
1917 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, 0);
1918 				FAIL_IF(!inst);
1919 				*inst = GROUP_F7;
1920 			}
1921 			else {
1922 				FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1923 				inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, 0);
1924 				FAIL_IF(!inst);
1925 				*inst = TEST_rm_r;
1926 			}
1927 #else
1928 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, 0);
1929 			FAIL_IF(!inst);
1930 			*inst = GROUP_F7;
1931 #endif
1932 		}
1933 		else {
1934 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
1935 			FAIL_IF(!inst);
1936 			*inst = TEST_rm_r;
1937 		}
1938 		return SLJIT_SUCCESS;
1939 	}
1940 
1941 	if (FAST_IS_REG(src2)) {
1942 		if (src1 & SLJIT_IMM) {
1943 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1944 			if (IS_HALFWORD(src1w) || compiler->mode32) {
1945 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, 0);
1946 				FAIL_IF(!inst);
1947 				*inst = GROUP_F7;
1948 			}
1949 			else {
1950 				FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
1951 				inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, 0);
1952 				FAIL_IF(!inst);
1953 				*inst = TEST_rm_r;
1954 			}
1955 #else
1956 			inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, 0);
1957 			FAIL_IF(!inst);
1958 			*inst = GROUP_F7;
1959 #endif
1960 		}
1961 		else {
1962 			inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
1963 			FAIL_IF(!inst);
1964 			*inst = TEST_rm_r;
1965 		}
1966 		return SLJIT_SUCCESS;
1967 	}
1968 
1969 	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1970 	if (src2 & SLJIT_IMM) {
1971 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1972 		if (IS_HALFWORD(src2w) || compiler->mode32) {
1973 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
1974 			FAIL_IF(!inst);
1975 			*inst = GROUP_F7;
1976 		}
1977 		else {
1978 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1979 			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
1980 			FAIL_IF(!inst);
1981 			*inst = TEST_rm_r;
1982 		}
1983 #else
1984 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
1985 		FAIL_IF(!inst);
1986 		*inst = GROUP_F7;
1987 #endif
1988 	}
1989 	else {
1990 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1991 		FAIL_IF(!inst);
1992 		*inst = TEST_rm_r;
1993 	}
1994 	return SLJIT_SUCCESS;
1995 }
1996 
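/* x86 variable shift counts must be in CL, so when src2 is neither an
   immediate nor already SLJIT_PREF_SHIFT_REG (ecx), emit_shift() has to
   shuffle registers.  An illustrative worst-case sequence:

       mov  tmp1, src1
       save ecx               (register, or [esp + sizeof(sljit_sw)])
       mov  ecx, src2
       shl  tmp1, cl
       restore ecx
       mov  dst, tmp1 */
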
1997 static sljit_si emit_shift(struct sljit_compiler *compiler,
1998 	sljit_ub mode,
1999 	sljit_si dst, sljit_sw dstw,
2000 	sljit_si src1, sljit_sw src1w,
2001 	sljit_si src2, sljit_sw src2w)
2002 {
2003 	sljit_ub* inst;
2004 
2005 	if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
2006 		if (dst == src1 && dstw == src1w) {
2007 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
2008 			FAIL_IF(!inst);
2009 			*inst |= mode;
2010 			return SLJIT_SUCCESS;
2011 		}
2012 		if (dst == SLJIT_UNUSED) {
2013 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2014 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2015 			FAIL_IF(!inst);
2016 			*inst |= mode;
2017 			return SLJIT_SUCCESS;
2018 		}
2019 		if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
2020 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2021 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2022 			FAIL_IF(!inst);
2023 			*inst |= mode;
2024 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2025 			return SLJIT_SUCCESS;
2026 		}
2027 		if (FAST_IS_REG(dst)) {
2028 			EMIT_MOV(compiler, dst, 0, src1, src1w);
2029 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
2030 			FAIL_IF(!inst);
2031 			*inst |= mode;
2032 			return SLJIT_SUCCESS;
2033 		}
2034 
2035 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2036 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2037 		FAIL_IF(!inst);
2038 		*inst |= mode;
2039 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2040 		return SLJIT_SUCCESS;
2041 	}
2042 
2043 	if (dst == SLJIT_PREF_SHIFT_REG) {
2044 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2045 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2046 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2047 		FAIL_IF(!inst);
2048 		*inst |= mode;
2049 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2050 	}
2051 	else if (FAST_IS_REG(dst) && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
2052 		if (src1 != dst)
2053 			EMIT_MOV(compiler, dst, 0, src1, src1w);
2054 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2055 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2056 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
2057 		FAIL_IF(!inst);
2058 		*inst |= mode;
2059 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2060 	}
2061 	else {
2062 		/* This case is really difficult, since ecx itself may be used for
2063 		   addressing, and we must ensure this works even in that case. */
2064 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2065 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2066 		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
2067 #else
2068 		/* [esp+0] contains the flags. */
2069 		EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_LOCALS_REG), sizeof(sljit_sw), SLJIT_PREF_SHIFT_REG, 0);
2070 #endif
2071 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2072 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2073 		FAIL_IF(!inst);
2074 		*inst |= mode;
2075 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2076 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
2077 #else
2078 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), sizeof(sljit_sw));
2079 #endif
2080 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2081 	}
2082 
2083 	return SLJIT_SUCCESS;
2084 }
2085 
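/* A shift by zero leaves the flags untouched, so when flags are requested
   emit_shift_with_flags() materializes them explicitly: for a register
   destination it appends a cmp dst, 0 after the shift; otherwise it
   compares src1 with 0 beforehand, which is equivalent when the count
   turns out to be zero because the value is then unchanged. */
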
2086 static sljit_si emit_shift_with_flags(struct sljit_compiler *compiler,
2087 	sljit_ub mode, sljit_si set_flags,
2088 	sljit_si dst, sljit_sw dstw,
2089 	sljit_si src1, sljit_sw src1w,
2090 	sljit_si src2, sljit_sw src2w)
2091 {
2092 	/* The CPU does not set flags if the shift count is 0. */
2093 	if (src2 & SLJIT_IMM) {
2094 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2095 		if ((src2w & 0x3f) != 0 || (compiler->mode32 && (src2w & 0x1f) != 0))
2096 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2097 #else
2098 		if ((src2w & 0x1f) != 0)
2099 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2100 #endif
2101 		if (!set_flags)
2102 			return emit_mov(compiler, dst, dstw, src1, src1w);
2103 		/* OR dst, src, 0 */
2104 		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
2105 			dst, dstw, src1, src1w, SLJIT_IMM, 0);
2106 	}
2107 
2108 	if (!set_flags)
2109 		return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2110 
2111 	if (!FAST_IS_REG(dst))
2112 		FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));
2113 
2114 	FAIL_IF(emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w));
2115 
2116 	if (FAST_IS_REG(dst))
2117 		return emit_cmp_binary(compiler, dst, dstw, SLJIT_IMM, 0);
2118 	return SLJIT_SUCCESS;
2119 }
2120 
2121 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op2(struct sljit_compiler *compiler, sljit_si op,
2122 	sljit_si dst, sljit_sw dstw,
2123 	sljit_si src1, sljit_sw src1w,
2124 	sljit_si src2, sljit_sw src2w)
2125 {
2126 	CHECK_ERROR();
2127 	check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w);
2128 	ADJUST_LOCAL_OFFSET(dst, dstw);
2129 	ADJUST_LOCAL_OFFSET(src1, src1w);
2130 	ADJUST_LOCAL_OFFSET(src2, src2w);
2131 
2132 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2133 	CHECK_EXTRA_REGS(src1, src1w, (void)0);
2134 	CHECK_EXTRA_REGS(src2, src2w, (void)0);
2135 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2136 	compiler->mode32 = op & SLJIT_INT_OP;
2137 #endif
2138 
2139 	if (GET_OPCODE(op) >= SLJIT_MUL) {
2140 		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
2141 			compiler->flags_saved = 0;
2142 		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
2143 			FAIL_IF(emit_save_flags(compiler));
2144 	}
2145 
2146 	switch (GET_OPCODE(op)) {
2147 	case SLJIT_ADD:
2148 		if (!GET_FLAGS(op)) {
2149 			if (emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
2150 				return compiler->error;
2151 		}
2152 		else
2153 			compiler->flags_saved = 0;
2154 		if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
2155 			FAIL_IF(emit_save_flags(compiler));
2156 		return emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
2157 			dst, dstw, src1, src1w, src2, src2w);
2158 	case SLJIT_ADDC:
2159 		if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
2160 			FAIL_IF(emit_restore_flags(compiler, 1));
2161 		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
2162 			FAIL_IF(emit_save_flags(compiler));
2163 		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
2164 			compiler->flags_saved = 0;
2165 		return emit_cum_binary(compiler, ADC_r_rm, ADC_rm_r, ADC, ADC_EAX_i32,
2166 			dst, dstw, src1, src1w, src2, src2w);
2167 	case SLJIT_SUB:
2168 		if (!GET_FLAGS(op)) {
2169 			if ((src2 & SLJIT_IMM) && emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
2170 				return compiler->error;
2171 		}
2172 		else
2173 			compiler->flags_saved = 0;
2174 		if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
2175 			FAIL_IF(emit_save_flags(compiler));
2176 		if (dst == SLJIT_UNUSED)
2177 			return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
2178 		return emit_non_cum_binary(compiler, SUB_r_rm, SUB_rm_r, SUB, SUB_EAX_i32,
2179 			dst, dstw, src1, src1w, src2, src2w);
2180 	case SLJIT_SUBC:
2181 		if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
2182 			FAIL_IF(emit_restore_flags(compiler, 1));
2183 		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
2184 			FAIL_IF(emit_save_flags(compiler));
2185 		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
2186 			compiler->flags_saved = 0;
2187 		return emit_non_cum_binary(compiler, SBB_r_rm, SBB_rm_r, SBB, SBB_EAX_i32,
2188 			dst, dstw, src1, src1w, src2, src2w);
2189 	case SLJIT_MUL:
2190 		return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
2191 	case SLJIT_AND:
2192 		if (dst == SLJIT_UNUSED)
2193 			return emit_test_binary(compiler, src1, src1w, src2, src2w);
2194 		return emit_cum_binary(compiler, AND_r_rm, AND_rm_r, AND, AND_EAX_i32,
2195 			dst, dstw, src1, src1w, src2, src2w);
2196 	case SLJIT_OR:
2197 		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
2198 			dst, dstw, src1, src1w, src2, src2w);
2199 	case SLJIT_XOR:
2200 		return emit_cum_binary(compiler, XOR_r_rm, XOR_rm_r, XOR, XOR_EAX_i32,
2201 			dst, dstw, src1, src1w, src2, src2w);
2202 	case SLJIT_SHL:
2203 		return emit_shift_with_flags(compiler, SHL, GET_FLAGS(op),
2204 			dst, dstw, src1, src1w, src2, src2w);
2205 	case SLJIT_LSHR:
2206 		return emit_shift_with_flags(compiler, SHR, GET_FLAGS(op),
2207 			dst, dstw, src1, src1w, src2, src2w);
2208 	case SLJIT_ASHR:
2209 		return emit_shift_with_flags(compiler, SAR, GET_FLAGS(op),
2210 			dst, dstw, src1, src1w, src2, src2w);
2211 	}
2212 
2213 	return SLJIT_SUCCESS;
2214 }
2215 
2216 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_register_index(sljit_si reg)
2217 {
2218 	check_sljit_get_register_index(reg);
2219 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2220 	if (reg == SLJIT_TEMPORARY_EREG1 || reg == SLJIT_TEMPORARY_EREG2
2221 			|| reg == SLJIT_SAVED_EREG1 || reg == SLJIT_SAVED_EREG2)
2222 		return -1;
2223 #endif
2224 	return reg_map[reg];
2225 }
2226 
2227 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_float_register_index(sljit_si reg)
2228 {
2229 	check_sljit_get_float_register_index(reg);
2230 	return reg;
2231 }
2232 
2233 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_custom(struct sljit_compiler *compiler,
2234 	void *instruction, sljit_si size)
2235 {
2236 	sljit_ub *inst;
2237 
2238 	CHECK_ERROR();
2239 	check_sljit_emit_op_custom(compiler, instruction, size);
2240 	SLJIT_ASSERT(size > 0 && size < 16);
2241 
2242 	inst = (sljit_ub*)ensure_buf(compiler, 1 + size);
2243 	FAIL_IF(!inst);
2244 	INC_SIZE(size);
2245 	SLJIT_MEMMOVE(inst, instruction, size);
2246 	return SLJIT_SUCCESS;
2247 }
2248 
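/* A usage sketch for the escape hatch above; the instruction bytes are
   only an example (CPUID is 0F A2):

       sljit_ub cpuid_bytes[2] = { 0x0F, 0xA2 };
       sljit_emit_op_custom(compiler, cpuid_bytes, 2);

   The bytes are copied verbatim into the generated instruction stream. */
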
2249 /* --------------------------------------------------------------------- */
2250 /*  Floating point operators                                             */
2251 /* --------------------------------------------------------------------- */
2252 
2253 #if (defined SLJIT_SSE2 && SLJIT_SSE2)
2254 
2255 /* Alignment + 4 * 16 bytes of constants. */
2256 static sljit_si sse2_data[3 + (4 + 4) * 2];
2257 static sljit_si *sse2_buffer;
2258 
2259 static void init_compiler(void)
2260 {
2261 	sse2_buffer = (sljit_si*)(((sljit_uw)sse2_data + 15) & ~0xf);
2262 	/* Single precision constants. */
2263 	sse2_buffer[0] = 0x80000000;
2264 	sse2_buffer[4] = 0x7fffffff;
2265 	/* Double precision constants. */
2266 	sse2_buffer[8] = 0;
2267 	sse2_buffer[9] = 0x80000000;
2268 	sse2_buffer[12] = 0xffffffff;
2269 	sse2_buffer[13] = 0x7fffffff;
2270 }
2271 
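/* The table above supplies the XORPD/ANDPD masks used by SLJIT_NEGD and
   SLJIT_ABSD in sljit_emit_fop1(): flipping the IEEE-754 sign bit negates
   a value, clearing it yields the absolute value.  Layout (16-byte
   aligned slots):

       sse2_buffer + 0    0x80000000           single-precision sign mask
       sse2_buffer + 4    0x7fffffff           single-precision abs mask
       sse2_buffer + 8    0x8000000000000000   double-precision sign mask
       sse2_buffer + 12   0x7fffffffffffffff   double-precision abs mask */
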
2272 #endif
2273 
2274 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_is_fpu_available(void)
2275 {
2276 #ifdef SLJIT_IS_FPU_AVAILABLE
2277 	return SLJIT_IS_FPU_AVAILABLE;
2278 #elif (defined SLJIT_SSE2 && SLJIT_SSE2)
2279 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
2280 	if (cpu_has_sse2 == -1)
2281 		get_cpu_features();
2282 	return cpu_has_sse2;
2283 #else /* SLJIT_DETECT_SSE2 */
2284 	return 1;
2285 #endif /* SLJIT_DETECT_SSE2 */
2286 #else /* SLJIT_SSE2 */
2287 	return 0;
2288 #endif
2289 }
2290 
2291 #if (defined SLJIT_SSE2 && SLJIT_SSE2)
2292 
2293 static sljit_si emit_sse2(struct sljit_compiler *compiler, sljit_ub opcode,
2294 	sljit_si single, sljit_si xmm1, sljit_si xmm2, sljit_sw xmm2w)
2295 {
2296 	sljit_ub *inst;
2297 
2298 	inst = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2299 	FAIL_IF(!inst);
2300 	*inst++ = GROUP_0F;
2301 	*inst = opcode;
2302 	return SLJIT_SUCCESS;
2303 }
2304 
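/* In emit_sse2() the mandatory prefix picks the scalar width: F3 0F xx
   is the single-precision (ss) form and F2 0F xx the double-precision
   (sd) form, e.g. F3 0F 58 = addss vs. F2 0F 58 = addsd.
   emit_sse2_logic() below uses the 66 prefix instead, selecting the
   packed-double forms (andpd, xorpd, ucomisd). */
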
2305 static sljit_si emit_sse2_logic(struct sljit_compiler *compiler, sljit_ub opcode,
2306 	sljit_si pref66, sljit_si xmm1, sljit_si xmm2, sljit_sw xmm2w)
2307 {
2308 	sljit_ub *inst;
2309 
2310 	inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2311 	FAIL_IF(!inst);
2312 	*inst++ = GROUP_0F;
2313 	*inst = opcode;
2314 	return SLJIT_SUCCESS;
2315 }
2316 
2317 static SLJIT_INLINE sljit_si emit_sse2_load(struct sljit_compiler *compiler,
2318 	sljit_si single, sljit_si dst, sljit_si src, sljit_sw srcw)
2319 {
2320 	return emit_sse2(compiler, MOVSD_x_xm, single, dst, src, srcw);
2321 }
2322 
2323 static SLJIT_INLINE sljit_si emit_sse2_store(struct sljit_compiler *compiler,
2324 	sljit_si single, sljit_si dst, sljit_sw dstw, sljit_si src)
2325 {
2326 	return emit_sse2(compiler, MOVSD_xm_x, single, src, dst, dstw);
2327 }
2328 
2329 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop1(struct sljit_compiler *compiler, sljit_si op,
2330 	sljit_si dst, sljit_sw dstw,
2331 	sljit_si src, sljit_sw srcw)
2332 {
2333 	sljit_si dst_r;
2334 
2335 	CHECK_ERROR();
2336 	check_sljit_emit_fop1(compiler, op, dst, dstw, src, srcw);
2337 
2338 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2339 	compiler->mode32 = 1;
2340 #endif
2341 
2342 	if (GET_OPCODE(op) == SLJIT_CMPD) {
2343 		compiler->flags_saved = 0;
2344 		if (FAST_IS_REG(dst))
2345 			dst_r = dst;
2346 		else {
2347 			dst_r = TMP_FREG;
2348 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, dst, dstw));
2349 		}
2350 		return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_SINGLE_OP), dst_r, src, srcw);
2351 	}
2352 
2353 	if (op == SLJIT_MOVD) {
2354 		if (FAST_IS_REG(dst))
2355 			return emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst, src, srcw);
2356 		if (FAST_IS_REG(src))
2357 			return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, src);
2358 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src, srcw));
2359 		return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
2360 	}
2361 
2362 	if (SLOW_IS_REG(dst)) {
2363 		dst_r = dst;
2364 		if (dst != src)
2365 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src, srcw));
2366 	}
2367 	else {
2368 		dst_r = TMP_FREG;
2369 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src, srcw));
2370 	}
2371 
2372 	switch (GET_OPCODE(op)) {
2373 	case SLJIT_NEGD:
2374 		FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_SINGLE_OP ? sse2_buffer : sse2_buffer + 8)));
2375 		break;
2376 
2377 	case SLJIT_ABSD:
2378 		FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_SINGLE_OP ? sse2_buffer + 4 : sse2_buffer + 12)));
2379 		break;
2380 	}
2381 
2382 	if (dst_r == TMP_FREG)
2383 		return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
2384 	return SLJIT_SUCCESS;
2385 }
2386 
2387 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop2(struct sljit_compiler *compiler, sljit_si op,
2388 	sljit_si dst, sljit_sw dstw,
2389 	sljit_si src1, sljit_sw src1w,
2390 	sljit_si src2, sljit_sw src2w)
2391 {
2392 	sljit_si dst_r;
2393 
2394 	CHECK_ERROR();
2395 	check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w);
2396 
2397 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2398 	compiler->mode32 = 1;
2399 #endif
2400 
2401 	if (FAST_IS_REG(dst)) {
2402 		dst_r = dst;
2403 		if (dst == src1)
2404 			; /* Do nothing here. */
2405 		else if (dst == src2 && (op == SLJIT_ADDD || op == SLJIT_MULD)) {
2406 			/* Swap arguments. */
2407 			src2 = src1;
2408 			src2w = src1w;
2409 		}
2410 		else if (dst != src2)
2411 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src1, src1w));
2412 		else {
2413 			dst_r = TMP_FREG;
2414 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src1, src1w));
2415 		}
2416 	}
2417 	else {
2418 		dst_r = TMP_FREG;
2419 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src1, src1w));
2420 	}
2421 
2422 	switch (GET_OPCODE(op)) {
2423 	case SLJIT_ADDD:
2424 		FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
2425 		break;
2426 
2427 	case SLJIT_SUBD:
2428 		FAIL_IF(emit_sse2(compiler, SUBSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
2429 		break;
2430 
2431 	case SLJIT_MULD:
2432 		FAIL_IF(emit_sse2(compiler, MULSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
2433 		break;
2434 
2435 	case SLJIT_DIVD:
2436 		FAIL_IF(emit_sse2(compiler, DIVSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
2437 		break;
2438 	}
2439 
2440 	if (dst_r == TMP_FREG)
2441 		return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
2442 	return SLJIT_SUCCESS;
2443 }
2444 
2445 #else
2446 
2447 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop1(struct sljit_compiler *compiler, sljit_si op,
2448 	sljit_si dst, sljit_sw dstw,
2449 	sljit_si src, sljit_sw srcw)
2450 {
2451 	CHECK_ERROR();
2452 	/* Should cause an assertion failure. */
2453 	check_sljit_emit_fop1(compiler, op, dst, dstw, src, srcw);
2454 	compiler->error = SLJIT_ERR_UNSUPPORTED;
2455 	return SLJIT_ERR_UNSUPPORTED;
2456 }
2457 
2458 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop2(struct sljit_compiler *compiler, sljit_si op,
2459 	sljit_si dst, sljit_sw dstw,
2460 	sljit_si src1, sljit_sw src1w,
2461 	sljit_si src2, sljit_sw src2w)
2462 {
2463 	CHECK_ERROR();
2464 	/* Should cause an assertion failure. */
2465 	check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w);
2466 	compiler->error = SLJIT_ERR_UNSUPPORTED;
2467 	return SLJIT_ERR_UNSUPPORTED;
2468 }
2469 
2470 #endif
2471 
2472 /* --------------------------------------------------------------------- */
2473 /*  Conditional instructions                                             */
2474 /* --------------------------------------------------------------------- */
2475 
2476 SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
2477 {
2478 	sljit_ub *inst;
2479 	struct sljit_label *label;
2480 
2481 	CHECK_ERROR_PTR();
2482 	check_sljit_emit_label(compiler);
2483 
2484 	/* We should restore the flags before the label,
2485 	   since other taken jumps have their own flags as well. */
2486 	if (SLJIT_UNLIKELY(compiler->flags_saved))
2487 		PTR_FAIL_IF(emit_restore_flags(compiler, 0));
2488 
2489 	if (compiler->last_label && compiler->last_label->size == compiler->size)
2490 		return compiler->last_label;
2491 
2492 	label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
2493 	PTR_FAIL_IF(!label);
2494 	set_label(label, compiler);
2495 
2496 	inst = (sljit_ub*)ensure_buf(compiler, 2);
2497 	PTR_FAIL_IF(!inst);
2498 
2499 	*inst++ = 0;
2500 	*inst++ = 0;
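	/* The (0, n) byte pair above is not machine code: it is an in-buffer
	   record resolved when sljit_generate_code() assembles the buffer.
	   As emitted in this file, n == 0 marks a label, n == 1 a constant
	   (see sljit_emit_const) and n == type + 4 a jump. */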
2501 
2502 	return label;
2503 }
2504 
2505 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_si type)
2506 {
2507 	sljit_ub *inst;
2508 	struct sljit_jump *jump;
2509 
2510 	CHECK_ERROR_PTR();
2511 	check_sljit_emit_jump(compiler, type);
2512 
2513 	if (SLJIT_UNLIKELY(compiler->flags_saved)) {
2514 		if ((type & 0xff) <= SLJIT_JUMP)
2515 			PTR_FAIL_IF(emit_restore_flags(compiler, 0));
2516 		compiler->flags_saved = 0;
2517 	}
2518 
2519 	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2520 	PTR_FAIL_IF_NULL(jump);
2521 	set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
2522 	type &= 0xff;
2523 
2524 	if (type >= SLJIT_CALL1)
2525 		PTR_FAIL_IF(call_with_args(compiler, type));
2526 
2527 	/* Worst case size. */
2528 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2529 	compiler->size += (type >= SLJIT_JUMP) ? 5 : 6;
2530 #else
2531 	compiler->size += (type >= SLJIT_JUMP) ? (10 + 3) : (2 + 10 + 3);
2532 #endif
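	/* (On x86-32 these are jmp rel32 = E9 + 4 bytes and jcc rel32 =
	   0F 8x + 4 bytes.  On x86-64 the worst case is assumed to be an
	   absolute form: 10 bytes for a REX.W mov reg, imm64 of the target,
	   3 for an indirect jump through that register, plus 2 for a short
	   jcc skipping it in the conditional case.) */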
2533 
2534 	inst = (sljit_ub*)ensure_buf(compiler, 2);
2535 	PTR_FAIL_IF_NULL(inst);
2536 
2537 	*inst++ = 0;
2538 	*inst++ = type + 4;
2539 	return jump;
2540 }
2541 
2542 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_ijump(struct sljit_compiler *compiler, sljit_si type, sljit_si src, sljit_sw srcw)
2543 {
2544 	sljit_ub *inst;
2545 	struct sljit_jump *jump;
2546 
2547 	CHECK_ERROR();
2548 	check_sljit_emit_ijump(compiler, type, src, srcw);
2549 	ADJUST_LOCAL_OFFSET(src, srcw);
2550 
2551 	CHECK_EXTRA_REGS(src, srcw, (void)0);
2552 
2553 	if (SLJIT_UNLIKELY(compiler->flags_saved)) {
2554 		if (type <= SLJIT_JUMP)
2555 			FAIL_IF(emit_restore_flags(compiler, 0));
2556 		compiler->flags_saved = 0;
2557 	}
2558 
2559 	if (type >= SLJIT_CALL1) {
2560 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2561 #if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
2562 		if (src == SLJIT_SCRATCH_REG3) {
2563 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
2564 			src = TMP_REG1;
2565 		}
2566 		if (src == SLJIT_MEM1(SLJIT_LOCALS_REG) && type >= SLJIT_CALL3)
2567 			srcw += sizeof(sljit_sw);
2568 #endif
2569 #endif
2570 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && defined(_WIN64)
2571 		if (src == SLJIT_SCRATCH_REG3) {
2572 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
2573 			src = TMP_REG1;
2574 		}
2575 #endif
2576 		FAIL_IF(call_with_args(compiler, type));
2577 	}
2578 
2579 	if (src == SLJIT_IMM) {
2580 		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2581 		FAIL_IF_NULL(jump);
2582 		set_jump(jump, compiler, JUMP_ADDR);
2583 		jump->u.target = srcw;
2584 
2585 		/* Worst case size. */
2586 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2587 		compiler->size += 5;
2588 #else
2589 		compiler->size += 10 + 3;
2590 #endif
2591 
2592 		inst = (sljit_ub*)ensure_buf(compiler, 2);
2593 		FAIL_IF_NULL(inst);
2594 
2595 		*inst++ = 0;
2596 		*inst++ = type + 4;
2597 	}
2598 	else {
2599 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2600 		/* REX_W is not necessary (src is not immediate). */
2601 		compiler->mode32 = 1;
2602 #endif
2603 		inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
2604 		FAIL_IF(!inst);
2605 		*inst++ = GROUP_FF;
2606 		*inst |= (type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm;
2607 	}
2608 	return SLJIT_SUCCESS;
2609 }
2610 
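/* setcc writes an 8-bit register.  Without a REX prefix only AL, CL, DL
   and BL are encodable (esp, ebp, esi and edi have no low-byte form on
   x86-32), which explains the reg_map[dst] <= 4 tests and the
   XCHG-with-EAX fallbacks below; x86-64 simply emits a REX prefix. */
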
2611 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_si op,
2612 	sljit_si dst, sljit_sw dstw,
2613 	sljit_si src, sljit_sw srcw,
2614 	sljit_si type)
2615 {
2616 	sljit_ub *inst;
2617 	sljit_ub cond_set = 0;
2618 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2619 	sljit_si reg;
2620 #else
2621 	/* CHECK_EXTRA_REGS might overwrite these values. */
2622 	sljit_si dst_save = dst;
2623 	sljit_sw dstw_save = dstw;
2624 #endif
2625 
2626 	CHECK_ERROR();
2627 	check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type);
2628 
2629 	if (dst == SLJIT_UNUSED)
2630 		return SLJIT_SUCCESS;
2631 
2632 	ADJUST_LOCAL_OFFSET(dst, dstw);
2633 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2634 	if (SLJIT_UNLIKELY(compiler->flags_saved))
2635 		FAIL_IF(emit_restore_flags(compiler, op & SLJIT_KEEP_FLAGS));
2636 
2637 	/* setcc = jcc + 0x10. */
2638 	cond_set = get_jump_code(type) + 0x10;
2639 
2640 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2641 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src) {
2642 		inst = (sljit_ub*)ensure_buf(compiler, 1 + 4 + 3);
2643 		FAIL_IF(!inst);
2644 		INC_SIZE(4 + 3);
2645 		/* Set low register to conditional flag. */
2646 		*inst++ = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
2647 		*inst++ = GROUP_0F;
2648 		*inst++ = cond_set;
2649 		*inst++ = MOD_REG | reg_lmap[TMP_REG1];
2650 		*inst++ = REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B);
2651 		*inst++ = OR_rm8_r8;
2652 		*inst++ = MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst];
2653 		return SLJIT_SUCCESS;
2654 	}
2655 
2656 	reg = (op == SLJIT_MOV && FAST_IS_REG(dst)) ? dst : TMP_REG1;
2657 
2658 	inst = (sljit_ub*)ensure_buf(compiler, 1 + 4 + 4);
2659 	FAIL_IF(!inst);
2660 	INC_SIZE(4 + 4);
2661 	/* Set low register to conditional flag. */
2662 	*inst++ = (reg_map[reg] <= 7) ? REX : REX_B;
2663 	*inst++ = GROUP_0F;
2664 	*inst++ = cond_set;
2665 	*inst++ = MOD_REG | reg_lmap[reg];
2666 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
2667 	*inst++ = GROUP_0F;
2668 	*inst++ = MOVZX_r_rm8;
2669 	*inst = MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg];
2670 
2671 	if (reg != TMP_REG1)
2672 		return SLJIT_SUCCESS;
2673 
2674 	if (GET_OPCODE(op) < SLJIT_ADD) {
2675 		compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
2676 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2677 	}
2678 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) || (defined SLJIT_DEBUG && SLJIT_DEBUG)
2679 	compiler->skip_checks = 1;
2680 #endif
2681 	return sljit_emit_op2(compiler, op, dst, dstw, dst, dstw, TMP_REG1, 0);
2682 #else /* SLJIT_CONFIG_X86_64 */
2683 	if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) {
2684 		if (reg_map[dst] <= 4) {
2685 			/* Low byte is accessible. */
2686 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 3 + 3);
2687 			FAIL_IF(!inst);
2688 			INC_SIZE(3 + 3);
2689 			/* Set low byte to conditional flag. */
2690 			*inst++ = GROUP_0F;
2691 			*inst++ = cond_set;
2692 			*inst++ = MOD_REG | reg_map[dst];
2693 
2694 			*inst++ = GROUP_0F;
2695 			*inst++ = MOVZX_r_rm8;
2696 			*inst = MOD_REG | (reg_map[dst] << 3) | reg_map[dst];
2697 			return SLJIT_SUCCESS;
2698 		}
2699 
2700 		/* Low byte is not accessible. */
2701 		if (cpu_has_cmov == -1)
2702 			get_cpu_features();
2703 
2704 		if (cpu_has_cmov) {
2705 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
2706 			/* An xor reg, reg operation would overwrite the flags. */
2707 			EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);
2708 
2709 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 3);
2710 			FAIL_IF(!inst);
2711 			INC_SIZE(3);
2712 
2713 			*inst++ = GROUP_0F;
2714 			/* cmovcc = setcc - 0x50. */
2715 			*inst++ = cond_set - 0x50;
2716 			*inst++ = MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REG1];
2717 			return SLJIT_SUCCESS;
2718 		}
2719 
2720 		inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
2721 		FAIL_IF(!inst);
2722 		INC_SIZE(1 + 3 + 3 + 1);
2723 		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2724 		/* Set al to conditional flag. */
2725 		*inst++ = GROUP_0F;
2726 		*inst++ = cond_set;
2727 		*inst++ = MOD_REG | 0 /* eax */;
2728 
2729 		*inst++ = GROUP_0F;
2730 		*inst++ = MOVZX_r_rm8;
2731 		*inst++ = MOD_REG | (reg_map[dst] << 3) | 0 /* eax */;
2732 		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2733 		return SLJIT_SUCCESS;
2734 	}
2735 
2736 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src && reg_map[dst] <= 4) {
2737 		SLJIT_COMPILE_ASSERT(reg_map[SLJIT_SCRATCH_REG1] == 0, scratch_reg1_must_be_eax);
2738 		if (dst != SLJIT_SCRATCH_REG1) {
2739 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 2 + 1);
2740 			FAIL_IF(!inst);
2741 			INC_SIZE(1 + 3 + 2 + 1);
2742 			/* Set low register to conditional flag. */
2743 			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2744 			*inst++ = GROUP_0F;
2745 			*inst++ = cond_set;
2746 			*inst++ = MOD_REG | 0 /* eax */;
2747 			*inst++ = OR_rm8_r8;
2748 			*inst++ = MOD_REG | (0 /* eax */ << 3) | reg_map[dst];
2749 			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2750 		}
2751 		else {
2752 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 2 + 3 + 2 + 2);
2753 			FAIL_IF(!inst);
2754 			INC_SIZE(2 + 3 + 2 + 2);
2755 			/* Set low register to conditional flag. */
2756 			*inst++ = XCHG_r_rm;
2757 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
2758 			*inst++ = GROUP_0F;
2759 			*inst++ = cond_set;
2760 			*inst++ = MOD_REG | 1 /* ecx */;
2761 			*inst++ = OR_rm8_r8;
2762 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | 0 /* eax */;
2763 			*inst++ = XCHG_r_rm;
2764 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
2765 		}
2766 		return SLJIT_SUCCESS;
2767 	}
2768 
2769 	/* Set TMP_REG1 to the bit. */
2770 	inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
2771 	FAIL_IF(!inst);
2772 	INC_SIZE(1 + 3 + 3 + 1);
2773 	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2774 	/* Set al to conditional flag. */
2775 	*inst++ = GROUP_0F;
2776 	*inst++ = cond_set;
2777 	*inst++ = MOD_REG | 0 /* eax */;
2778 
2779 	*inst++ = GROUP_0F;
2780 	*inst++ = MOVZX_r_rm8;
2781 	*inst++ = MOD_REG | (0 << 3) /* eax */ | 0 /* eax */;
2782 
2783 	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2784 
2785 	if (GET_OPCODE(op) < SLJIT_ADD)
2786 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2787 
2788 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) || (defined SLJIT_DEBUG && SLJIT_DEBUG)
2789 	compiler->skip_checks = 1;
2790 #endif
2791 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
2792 #endif /* SLJIT_CONFIG_X86_64 */
2793 }
2794 
2795 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_local_base(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw offset)
2796 {
2797 	CHECK_ERROR();
2798 	check_sljit_get_local_base(compiler, dst, dstw, offset);
2799 	ADJUST_LOCAL_OFFSET(dst, dstw);
2800 
2801 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2802 
2803 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2804 	compiler->mode32 = 0;
2805 #endif
2806 
2807 	ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_LOCALS_REG), offset);
2808 
2809 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2810 	if (NOT_HALFWORD(offset)) {
2811 		FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
2812 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2813 		SLJIT_ASSERT(emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_LOCALS_REG, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
2814 		return compiler->error;
2815 #else
2816 		return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_LOCALS_REG, 0, TMP_REG1, 0);
2817 #endif
2818 	}
2819 #endif
2820 
2821 	if (offset != 0)
2822 		return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_LOCALS_REG, 0, SLJIT_IMM, offset);
2823 	return emit_mov(compiler, dst, dstw, SLJIT_LOCALS_REG, 0);
2824 }
2825 
2826 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw init_value)
2827 {
2828 	sljit_ub *inst;
2829 	struct sljit_const *const_;
2830 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2831 	sljit_si reg;
2832 #endif
2833 
2834 	CHECK_ERROR_PTR();
2835 	check_sljit_emit_const(compiler, dst, dstw, init_value);
2836 	ADJUST_LOCAL_OFFSET(dst, dstw);
2837 
2838 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2839 
2840 	const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
2841 	PTR_FAIL_IF(!const_);
2842 	set_const(const_, compiler);
2843 
2844 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2845 	compiler->mode32 = 0;
2846 	reg = SLOW_IS_REG(dst) ? dst : TMP_REG1;
2847 
2848 	if (emit_load_imm64(compiler, reg, init_value))
2849 		return NULL;
2850 #else
2851 	if (dst == SLJIT_UNUSED)
2852 		dst = TMP_REG1;
2853 
2854 	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
2855 		return NULL;
2856 #endif
2857 
2858 	inst = (sljit_ub*)ensure_buf(compiler, 2);
2859 	PTR_FAIL_IF(!inst);
2860 
2861 	*inst++ = 0;
2862 	*inst++ = 1;
2863 
2864 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2865 	if (dst & SLJIT_MEM)
2866 		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
2867 			return NULL;
2868 #endif
2869 
2870 	return const_;
2871 }
2872 
2873 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_addr)
2874 {
2875 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2876 	*(sljit_sw*)addr = new_addr - (addr + 4);
2877 #else
2878 	*(sljit_uw*)addr = new_addr;
2879 #endif
2880 }
2881 
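/* On x86-32 the patched field is the rel32 displacement of the jump or
   call, so the stored value is target-relative: new_addr - (addr + 4),
   where addr + 4 is the address of the following instruction.  On x86-64
   the target is first loaded into a register as a 64-bit immediate, so
   the absolute address is stored instead. */
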
2882 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant)
2883 {
2884 	*(sljit_sw*)addr = new_constant;
2885 }
2886