xref: /netbsd-src/external/gpl3/gdb.old/dist/sim/aarch64/simulator.c (revision 32d1c65c71fbdb65a012e8392a62a757dd6853e9)
1 /* simulator.c -- Interface for the AArch64 simulator.
2 
3    Copyright (C) 2015-2023 Free Software Foundation, Inc.
4 
5    Contributed by Red Hat.
6 
7    This file is part of GDB.
8 
9    This program is free software; you can redistribute it and/or modify
10    it under the terms of the GNU General Public License as published by
11    the Free Software Foundation; either version 3 of the License, or
12    (at your option) any later version.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License for more details.
18 
19    You should have received a copy of the GNU General Public License
20    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
21 
22 /* This must come before any other includes.  */
23 #include "defs.h"
24 
25 #include <stdlib.h>
26 #include <stdio.h>
27 #include <string.h>
28 #include <sys/types.h>
29 #include <math.h>
30 #include <time.h>
31 #include <limits.h>
32 
33 #include "simulator.h"
34 #include "cpustate.h"
35 #include "memory.h"
36 
37 #include "sim-signal.h"
38 
39 #define NO_SP 0
40 #define SP_OK 1
41 
42 #define TST(_flag)   (aarch64_test_CPSR_bit (cpu, _flag))
43 #define IS_SET(_X)   (TST (( _X )) ? 1 : 0)
44 #define IS_CLEAR(_X) (TST (( _X )) ? 0 : 1)
45 
46 /* Space saver macro.  */
47 #define INSTR(HIGH, LOW) uimm (aarch64_get_instr (cpu), (HIGH), (LOW))
48 
49 #define HALT_UNALLOC							\
50   do									\
51     {									\
52       TRACE_DISASM (cpu, aarch64_get_PC (cpu));				\
53       TRACE_INSN (cpu,							\
54 		  "Unallocated instruction detected at sim line %d,"	\
55 		  " exe addr %" PRIx64,					\
56 		  __LINE__, aarch64_get_PC (cpu));			\
57       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
58 		       sim_stopped, SIM_SIGILL);			\
59     }									\
60   while (0)
61 
62 #define HALT_NYI							\
63   do									\
64     {									\
65       TRACE_DISASM (cpu, aarch64_get_PC (cpu));				\
66       TRACE_INSN (cpu,							\
67 		  "Unimplemented instruction detected at sim line %d,"	\
68 		  " exe addr %" PRIx64,					\
69 		  __LINE__, aarch64_get_PC (cpu));			\
70       if (! TRACE_ANY_P (cpu))						\
71         sim_io_eprintf (CPU_STATE (cpu), "SIM Error: Unimplemented instruction: %#08x\n", \
72                         aarch64_get_instr (cpu));			\
73       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
74 		       sim_stopped, SIM_SIGABRT);			\
75     }									\
76   while (0)
77 
78 #define NYI_assert(HI, LO, EXPECTED)					\
79   do									\
80     {									\
81       if (INSTR ((HI), (LO)) != (EXPECTED))				\
82 	HALT_NYI;							\
83     }									\
84   while (0)
85 
86 static uint64_t
87 expand_logical_immediate (uint32_t S, uint32_t R, uint32_t N)
88 {
89   uint64_t mask;
90   uint64_t imm;
91   unsigned simd_size;
92 
93   /* The immediate value is S+1 bits set to 1, left rotated by simd_size - R
94      (in other words, right rotated by R), then replicated.  */
95   if (N != 0)
96     {
97       simd_size = 64;
98       mask = 0xffffffffffffffffull;
99     }
100   else
101     {
102       switch (S)
103 	{
104 	case 0x00 ... 0x1f: /* 0xxxxx */ simd_size = 32;           break;
105 	case 0x20 ... 0x2f: /* 10xxxx */ simd_size = 16; S &= 0xf; break;
106 	case 0x30 ... 0x37: /* 110xxx */ simd_size =  8; S &= 0x7; break;
107 	case 0x38 ... 0x3b: /* 1110xx */ simd_size =  4; S &= 0x3; break;
108 	case 0x3c ... 0x3d: /* 11110x */ simd_size =  2; S &= 0x1; break;
109 	default: return 0;
110 	}
111       mask = (1ull << simd_size) - 1;
112       /* Top bits are IGNORED.  */
113       R &= simd_size - 1;
114     }
115 
116   /* NOTE: if S = simd_size - 1 we get 0xf..f which is rejected.  */
117   if (S == simd_size - 1)
118     return 0;
119 
120   /* S+1 consecutive bits to 1.  */
121   /* NOTE: S can't be 63 due to detection above.  */
122   imm = (1ull << (S + 1)) - 1;
123 
124   /* Rotate to the left by simd_size - R.  */
125   if (R != 0)
126     imm = ((imm << (simd_size - R)) & mask) | (imm >> R);
127 
128   /* Replicate the value according to SIMD size.  */
129   switch (simd_size)
130     {
131     case  2: imm = (imm <<  2) | imm; /* Fall through.  */
132     case  4: imm = (imm <<  4) | imm; /* Fall through.  */
133     case  8: imm = (imm <<  8) | imm; /* Fall through.  */
134     case 16: imm = (imm << 16) | imm; /* Fall through.  */
135     case 32: imm = (imm << 32) | imm; /* Fall through.  */
136     case 64: break;
137     default: return 0;
138     }
139 
140   return imm;
141 }
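
/* A worked example of the expansion above (editorial sketch, not part
   of the original simulator).  With N == 0, S == 3 and R == 2 we have a
   32-bit element containing S+1 == 4 set bits, rotated right by 2 and
   then replicated to fill 64 bits.  */
static uint64_t
example_logical_immediate (void)
{
  uint64_t mask = 0xffffffffull;     /* 32-bit element.  */
  uint64_t imm = (1ull << 4) - 1;    /* S+1 == 4 bits set: 0xf.  */

  /* Rotate right by R == 2 within the element: 0xc0000003.  */
  imm = ((imm << (32 - 2)) & mask) | (imm >> 2);

  /* Replicate the element across the register; this equals
     expand_logical_immediate (3, 2, 0).  */
  return (imm << 32) | imm;          /* 0xc0000003c0000003.  */
}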
142 
143 /* Instr[22,10] encodes N, immr and imms.  We want a lookup table
144    for each possible combination, i.e. 13 bits worth of uint64_t entries.  */
145 #define  LI_TABLE_SIZE  (1 << 13)
146 static uint64_t LITable[LI_TABLE_SIZE];
147 
148 void
149 aarch64_init_LIT_table (void)
150 {
151   unsigned index;
152 
153   for (index = 0; index < LI_TABLE_SIZE; index++)
154     {
155       uint32_t N    = uimm (index, 12, 12);
156       uint32_t immr = uimm (index, 11, 6);
157       uint32_t imms = uimm (index, 5, 0);
158 
159       LITable [index] = expand_logical_immediate (imms, immr, N);
160     }
161 }
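
/* Editorial sketch of how a decode routine consults the table built
   above: instr[22,10] (N:immr:imms) is used directly as the index, and
   a zero entry flags an unallocated encoding.  */
static uint64_t
example_lookup_logical_immediate (uint32_t instr)
{
  uint32_t index = uimm (instr, 22, 10);  /* 13 bits: N, immr, imms.  */

  return LITable [index];                 /* 0 ==> invalid encoding.  */
}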
162 
163 static void
164 dexNotify (sim_cpu *cpu)
165 {
166   /* instr[14,0] == type : 0 ==> method entry, 1 ==> method reentry
167                            2 ==> exit Java, 3 ==> start next bytecode.  */
168   uint32_t type = INSTR (14, 0);
169 
170   TRACE_EVENTS (cpu, "Notify Insn encountered, type = 0x%x", type);
171 
172   switch (type)
173     {
174     case 0:
175       /* aarch64_notifyMethodEntry (aarch64_get_reg_u64 (cpu, R23, 0),
176 	 aarch64_get_reg_u64 (cpu, R22, 0));  */
177       break;
178     case 1:
179       /* aarch64_notifyMethodReentry (aarch64_get_reg_u64 (cpu, R23, 0),
180 	 aarch64_get_reg_u64 (cpu, R22, 0));  */
181       break;
182     case 2:
183       /* aarch64_notifyMethodExit ();  */
184       break;
185     case 3:
186       /* aarch64_notifyBCStart (aarch64_get_reg_u64 (cpu, R23, 0),
187 	 aarch64_get_reg_u64 (cpu, R22, 0));  */
188       break;
189     }
190 }
191 
192 /* Secondary decode within top level groups.  */
193 
194 static void
195 dexPseudo (sim_cpu *cpu)
196 {
197   /* assert instr[28,27] = 00
198 
199      We provide 2 pseudo instructions:
200 
201      HALT stops execution of the simulator causing an immediate
202      return to the x86 code which entered it.
203 
204      CALLOUT initiates recursive entry into x86 code.  A register
205      argument holds the address of the x86 routine.  Immediate
206      values in the instruction identify the number of general
207      purpose and floating point register arguments to be passed
208      and the type of any value to be returned.  */
209 
210   uint32_t PSEUDO_HALT      =  0xE0000000U;
211   uint32_t PSEUDO_CALLOUT   =  0x00018000U;
212   uint32_t PSEUDO_CALLOUTR  =  0x00018001U;
213   uint32_t PSEUDO_NOTIFY    =  0x00014000U;
214   uint32_t dispatch;
215 
216   if (aarch64_get_instr (cpu) == PSEUDO_HALT)
217     {
218       TRACE_EVENTS (cpu, " Pseudo Halt Instruction");
219       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
220 		       sim_stopped, SIM_SIGTRAP);
221     }
222 
223   dispatch = INSTR (31, 15);
224 
225   /* We do not handle callouts at the moment.  */
226   if (dispatch == PSEUDO_CALLOUT || dispatch == PSEUDO_CALLOUTR)
227     {
228       TRACE_EVENTS (cpu, " Callout");
229       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
230 		       sim_stopped, SIM_SIGABRT);
231     }
232 
233   else if (dispatch == PSEUDO_NOTIFY)
234     dexNotify (cpu);
235 
236   else
237     HALT_UNALLOC;
238 }
239 
240 /* Load-store single register (unscaled offset)
241    These instructions employ a base register plus an unscaled signed
242    9 bit offset.
243 
244    N.B. the base register (source) can be Xn or SP.  All other
245    registers may not be SP.  */
246 
247 /* 32 bit load 32 bit unscaled signed 9 bit.  */
248 static void
249 ldur32 (sim_cpu *cpu, int32_t offset)
250 {
251   unsigned rn = INSTR (9, 5);
252   unsigned rt = INSTR (4, 0);
253 
254   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
255   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32
256 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
257 			+ offset));
258 }
259 
260 /* 64 bit load 64 bit unscaled signed 9 bit.  */
261 static void
262 ldur64 (sim_cpu *cpu, int32_t offset)
263 {
264   unsigned rn = INSTR (9, 5);
265   unsigned rt = INSTR (4, 0);
266 
267   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
268   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64
269 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
270 			+ offset));
271 }
272 
273 /* 32 bit load zero-extended byte unscaled signed 9 bit.  */
274 static void
275 ldurb32 (sim_cpu *cpu, int32_t offset)
276 {
277   unsigned rn = INSTR (9, 5);
278   unsigned rt = INSTR (4, 0);
279 
280   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
281   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8
282 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
283 			+ offset));
284 }
285 
286 /* 32 bit load sign-extended byte unscaled signed 9 bit.  */
287 static void
288 ldursb32 (sim_cpu *cpu, int32_t offset)
289 {
290   unsigned rn = INSTR (9, 5);
291   unsigned rt = INSTR (4, 0);
292 
293   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
294   aarch64_set_reg_u64 (cpu, rt, NO_SP, (uint32_t) aarch64_get_mem_s8
295 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
296 			+ offset));
297 }
298 
299 /* 64 bit load sign-extended byte unscaled signed 9 bit.  */
300 static void
301 ldursb64 (sim_cpu *cpu, int32_t offset)
302 {
303   unsigned rn = INSTR (9, 5);
304   unsigned rt = INSTR (4, 0);
305 
306   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
307   aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s8
308 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
309 			+ offset));
310 }
311 
312 /* 32 bit load zero-extended short unscaled signed 9 bit  */
313 static void
314 ldurh32 (sim_cpu *cpu, int32_t offset)
315 {
316   unsigned rn = INSTR (9, 5);
317   unsigned rd = INSTR (4, 0);
318 
319   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
320   aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_mem_u16
321 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
322 			+ offset));
323 }
324 
325 /* 32 bit load sign-extended short unscaled signed 9 bit  */
326 static void
327 ldursh32 (sim_cpu *cpu, int32_t offset)
328 {
329   unsigned rn = INSTR (9, 5);
330   unsigned rd = INSTR (4, 0);
331 
332   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
333   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) aarch64_get_mem_s16
334 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
335 			+ offset));
336 }
337 
338 /* 64 bit load sign-extended short unscaled signed 9 bit  */
339 static void
340 ldursh64 (sim_cpu *cpu, int32_t offset)
341 {
342   unsigned rn = INSTR (9, 5);
343   unsigned rt = INSTR (4, 0);
344 
345   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
346   aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s16
347 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
348 			+ offset));
349 }
350 
351 /* 64 bit load sign-extended word unscaled signed 9 bit  */
352 static void
353 ldursw (sim_cpu *cpu, int32_t offset)
354 {
355   unsigned rn = INSTR (9, 5);
356   unsigned rd = INSTR (4, 0);
357 
358   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
359   aarch64_set_reg_s64 (cpu, rd, NO_SP, aarch64_get_mem_s32
360 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
361 			+ offset));
362 }
363 
364 /* N.B. with stores the value in source is written to the address
365    identified by source2 modified by offset.  */
366 
367 /* 32 bit store 32 bit unscaled signed 9 bit.  */
368 static void
369 stur32 (sim_cpu *cpu, int32_t offset)
370 {
371   unsigned rn = INSTR (9, 5);
372   unsigned rd = INSTR (4, 0);
373 
374   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
375   aarch64_set_mem_u32 (cpu,
376 		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
377 		       aarch64_get_reg_u32 (cpu, rd, NO_SP));
378 }
379 
380 /* 64 bit store 64 bit unscaled signed 9 bit  */
381 static void
382 stur64 (sim_cpu *cpu, int32_t offset)
383 {
384   unsigned rn = INSTR (9, 5);
385   unsigned rd = INSTR (4, 0);
386 
387   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
388   aarch64_set_mem_u64 (cpu,
389 		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
390 		       aarch64_get_reg_u64 (cpu, rd, NO_SP));
391 }
392 
393 /* 32 bit store byte unscaled signed 9 bit  */
394 static void
395 sturb (sim_cpu *cpu, int32_t offset)
396 {
397   unsigned rn = INSTR (9, 5);
398   unsigned rd = INSTR (4, 0);
399 
400   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
401   aarch64_set_mem_u8 (cpu,
402 		      aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
403 		      aarch64_get_reg_u8 (cpu, rd, NO_SP));
404 }
405 
406 /* 32 bit store short unscaled signed 9 bit  */
407 static void
408 sturh (sim_cpu *cpu, int32_t offset)
409 {
410   unsigned rn = INSTR (9, 5);
411   unsigned rd = INSTR (4, 0);
412 
413   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
414   aarch64_set_mem_u16 (cpu,
415 		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
416 		       aarch64_get_reg_u16 (cpu, rd, NO_SP));
417 }
418 
419 /* Load single register pc-relative label
420    Offset is a signed 19 bit immediate count in words.
421    rt may not be SP.  */
422 
423 /* 32 bit pc-relative load  */
424 static void
425 ldr32_pcrel (sim_cpu *cpu, int32_t offset)
426 {
427   unsigned rd = INSTR (4, 0);
428 
429   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
430   aarch64_set_reg_u64 (cpu, rd, NO_SP,
431 		       aarch64_get_mem_u32
432 		       (cpu, aarch64_get_PC (cpu) + offset * 4));
433 }
434 
435 /* 64 bit pc-relative load  */
436 static void
437 ldr_pcrel (sim_cpu *cpu, int32_t offset)
438 {
439   unsigned rd = INSTR (4, 0);
440 
441   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
442   aarch64_set_reg_u64 (cpu, rd, NO_SP,
443 		       aarch64_get_mem_u64
444 		       (cpu, aarch64_get_PC (cpu) + offset * 4));
445 }
446 
447 /* sign extended 32 bit pc-relative load  */
448 static void
449 ldrsw_pcrel (sim_cpu *cpu, int32_t offset)
450 {
451   unsigned rd = INSTR (4, 0);
452 
453   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
454   aarch64_set_reg_u64 (cpu, rd, NO_SP,
455 		       aarch64_get_mem_s32
456 		       (cpu, aarch64_get_PC (cpu) + offset * 4));
457 }
458 
459 /* float pc-relative load  */
460 static void
461 fldrs_pcrel (sim_cpu *cpu, int32_t offset)
462 {
463   unsigned int rd = INSTR (4, 0);
464 
465   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
466   aarch64_set_vec_u32 (cpu, rd, 0,
467 		       aarch64_get_mem_u32
468 		       (cpu, aarch64_get_PC (cpu) + offset * 4));
469 }
470 
471 /* double pc-relative load  */
472 static void
473 fldrd_pcrel (sim_cpu *cpu, int32_t offset)
474 {
475   unsigned int st = INSTR (4, 0);
476 
477   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
478   aarch64_set_vec_u64 (cpu, st, 0,
479 		       aarch64_get_mem_u64
480 		       (cpu, aarch64_get_PC (cpu) + offset * 4));
481 }
482 
483 /* long double pc-relative load.  */
484 static void
485 fldrq_pcrel (sim_cpu *cpu, int32_t offset)
486 {
487   unsigned int st = INSTR (4, 0);
488   uint64_t addr = aarch64_get_PC (cpu) + offset * 4;
489   FRegister a;
490 
491   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
492   aarch64_get_mem_long_double (cpu, addr, & a);
493   aarch64_set_FP_long_double (cpu, st, a);
494 }
495 
496 /* This can be used to scale an offset by applying
497    the requisite shift.  The second argument is either
498    16, 32, 64 or 128.  */
499 
500 #define SCALE(_offset, _elementSize) \
501     ((_offset) << ScaleShift ## _elementSize)
502 
503 /* This can be used to optionally scale a register derived offset
504    by applying the requisite shift as indicated by the Scaling
505    argument.  The second argument is either 16, 32, 64 or 128.
506    The third argument is either Scaled or Unscaled.
507    N.B. when _Scaling is Scaled the offset is shifted by the
508    element's scale shift; when Unscaled no shift is applied.  */
509 
510 #define OPT_SCALE(_offset, _elementType, _Scaling) \
511   ((_offset) << (_Scaling ? ScaleShift ## _elementType : 0))
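
/* Editorial example of the two macros, assuming the ScaleShift values
   defined by the decoder (a 64-bit access scales by the access size of
   8 bytes, i.e. a shift of 3).  SCALE always shifts; OPT_SCALE shifts
   only when the instruction requested a scaled register offset.  */
static uint64_t
example_scaled_displacement (uint32_t imm12, int64_t regoff, Scaling scaling)
{
  uint64_t imm_disp = SCALE (imm12, 64);               /* imm12 << 3.  */
  uint64_t reg_disp = OPT_SCALE (regoff, 64, scaling); /* Shift iff Scaled.  */

  return imm_disp + reg_disp;
}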
512 
513 /* This can be used to zero or sign extend a 32 bit register derived
514    value to a 64 bit value.  the first argument must be the value as
515    a uint32_t and the second must be either UXTW or SXTW. The result
516    is returned as an int64_t.  */
517 
518 static inline int64_t
519 extend (uint32_t value, Extension extension)
520 {
521   union
522   {
523     uint32_t u;
524     int32_t   n;
525   } x;
526 
527   /* A branchless variant of this ought to be possible.  */
528   if (extension == UXTW || extension == NoExtension)
529     return value;
530 
531   x.u = value;
532   return x.n;
533 }
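
/* An editorial sketch of the branchless variant alluded to above:
   build an all-ones mask iff the extension is SXTW and use it to
   select between the sign- and zero-extended forms.  */
static inline int64_t
extend_branchless (uint32_t value, Extension extension)
{
  int64_t signed_wide = (int64_t) (int32_t) value;    /* Sign-extended.  */
  int64_t mask = - (int64_t) (extension == SXTW);     /* -1 iff SXTW.  */

  return (signed_wide & mask) | ((int64_t) value & ~mask);
}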
534 
535 /* Scalar Floating Point
536 
537    FP load/store single register (4 addressing modes)
538 
539    N.B. the base register (source) can be the stack pointer.
540    The secondary source register (source2) can only be an Xn register.  */
541 
542 /* Load 32 bit unscaled signed 9 bit with pre- or post-writeback.  */
543 static void
544 fldrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
545 {
546   unsigned rn = INSTR (9, 5);
547   unsigned st = INSTR (4, 0);
548   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
549 
550   if (wb != Post)
551     address += offset;
552 
553   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
554   aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32 (cpu, address));
555   if (wb == Post)
556     address += offset;
557 
558   if (wb != NoWriteBack)
559     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
560 }
561 
562 /* Load 8 bit with unsigned 12 bit offset.  */
563 static void
564 fldrb_abs (sim_cpu *cpu, uint32_t offset)
565 {
566   unsigned rd = INSTR (4, 0);
567   unsigned rn = INSTR (9, 5);
568   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
569 
570   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
571   aarch64_set_vec_u8 (cpu, rd, 0, aarch64_get_mem_u8 (cpu, addr));
572 }
573 
574 /* Load 16 bit scaled unsigned 12 bit.  */
575 static void
576 fldrh_abs (sim_cpu *cpu, uint32_t offset)
577 {
578   unsigned rd = INSTR (4, 0);
579   unsigned rn = INSTR (9, 5);
580   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16);
581 
582   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
583   aarch64_set_vec_u16 (cpu, rd, 0, aarch64_get_mem_u16 (cpu, addr));
584 }
585 
586 /* Load 32 bit scaled unsigned 12 bit.  */
587 static void
588 fldrs_abs (sim_cpu *cpu, uint32_t offset)
589 {
590   unsigned rd = INSTR (4, 0);
591   unsigned rn = INSTR (9, 5);
592   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32);
593 
594   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
595   aarch64_set_vec_u32 (cpu, rd, 0, aarch64_get_mem_u32 (cpu, addr));
596 }
597 
598 /* Load 64 bit scaled unsigned 12 bit.  */
599 static void
600 fldrd_abs (sim_cpu *cpu, uint32_t offset)
601 {
602   unsigned rd = INSTR (4, 0);
603   unsigned rn = INSTR (9, 5);
604   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64);
605 
606   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
607   aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr));
608 }
609 
610 /* Load 128 bit scaled unsigned 12 bit.  */
611 static void
612 fldrq_abs (sim_cpu *cpu, uint32_t offset)
613 {
614   unsigned rd = INSTR (4, 0);
615   unsigned rn = INSTR (9, 5);
616   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128);
617 
618   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
619   aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr));
620   aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_mem_u64 (cpu, addr + 8));
621 }
622 
623 /* Load 32 bit scaled or unscaled zero- or sign-extended
624    32-bit register offset.  */
625 static void
626 fldrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
627 {
628   unsigned rm = INSTR (20, 16);
629   unsigned rn = INSTR (9, 5);
630   unsigned st = INSTR (4, 0);
631   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
632   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
633   uint64_t displacement = OPT_SCALE (extended, 32, scaling);
634 
635   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
636   aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
637 		       (cpu, address + displacement));
638 }
639 
640 /* Load 64 bit unscaled signed 9 bit with pre- or post-writeback.  */
641 static void
642 fldrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
643 {
644   unsigned rn = INSTR (9, 5);
645   unsigned st = INSTR (4, 0);
646   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
647 
648   if (wb != Post)
649     address += offset;
650 
651   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
652   aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64 (cpu, address));
653 
654   if (wb == Post)
655     address += offset;
656 
657   if (wb != NoWriteBack)
658     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
659 }
660 
661 /* Load 64 bit scaled or unscaled zero- or sign-extended 32-bit register offset.  */
662 static void
663 fldrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
664 {
665   unsigned rm = INSTR (20, 16);
666   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
667   uint64_t displacement = OPT_SCALE (extended, 64, scaling);
668 
669   fldrd_wb (cpu, displacement, NoWriteBack);
670 }
671 
672 /* Load 128 bit unscaled signed 9 bit with pre- or post-writeback.  */
673 static void
674 fldrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
675 {
676   FRegister a;
677   unsigned rn = INSTR (9, 5);
678   unsigned st = INSTR (4, 0);
679   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
680 
681   if (wb != Post)
682     address += offset;
683 
684   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
685   aarch64_get_mem_long_double (cpu, address, & a);
686   aarch64_set_FP_long_double (cpu, st, a);
687 
688   if (wb == Post)
689     address += offset;
690 
691   if (wb != NoWriteBack)
692     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
693 }
694 
695 /* Load 128 bit scaled or unscaled zero- or sign-extended 32-bit register offset  */
696 static void
697 fldrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
698 {
699   unsigned rm = INSTR (20, 16);
700   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
701   uint64_t displacement = OPT_SCALE (extended, 128, scaling);
702 
703   fldrq_wb (cpu, displacement, NoWriteBack);
704 }
705 
706 /* Memory Access
707 
708    load-store single register
709    There are four addressing modes available here which all employ a
710    64 bit source (base) register.
711 
712    N.B. the base register (source) can be the stack pointer.
713    The secondary source register (source2) can only be an Xn register.
714 
715    Scaled, 12-bit, unsigned immediate offset, without pre- and
716    post-index options.
717    Unscaled, 9-bit, signed immediate offset with pre- or post-index
718    writeback.
719    scaled or unscaled 64-bit register offset.
720    scaled or unscaled 32-bit extended register offset.
721 
722    All offsets are assumed to be raw from the decode, i.e. the
723    simulator is expected to adjust scaled offsets based on the
724    accessed data size.  With register or extended register offset
725    versions the same applies, except that in the latter case the
726    operation may also require a sign extend.
727 
728    A separate method is provided for each possible addressing mode.  */
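
/* Editorial sketch of the pre-/post-index pattern shared by the _wb
   loads and stores below: the offset is applied before the access for
   pre-indexing, after it for post-indexing, and the adjusted address
   is written back to the base register unless writeback is disabled.  */
static uint64_t
example_wb_address (sim_cpu *cpu, unsigned rn, int32_t offset, WriteBack wb)
{
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;             /* Pre-index and plain offset forms.  */

  /* ... the memory access happens here ...  */

  if (wb == Post)
    address += offset;             /* Post-index adjusts after the access.  */

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);

  return address;
}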
729 
730 /* 32 bit load 32 bit scaled unsigned 12 bit  */
731 static void
732 ldr32_abs (sim_cpu *cpu, uint32_t offset)
733 {
734   unsigned rn = INSTR (9, 5);
735   unsigned rt = INSTR (4, 0);
736 
737   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
738   /* The target register may not be SP but the source may be.  */
739   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32
740 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
741 			+ SCALE (offset, 32)));
742 }
743 
744 /* 32 bit load 32 bit unscaled signed 9 bit with pre- or post-writeback.  */
745 static void
746 ldr32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
747 {
748   unsigned rn = INSTR (9, 5);
749   unsigned rt = INSTR (4, 0);
750   uint64_t address;
751 
752   if (rn == rt && wb != NoWriteBack)
753     HALT_UNALLOC;
754 
755   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
756 
757   if (wb != Post)
758     address += offset;
759 
760   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
761   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address));
762 
763   if (wb == Post)
764     address += offset;
765 
766   if (wb != NoWriteBack)
767     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
768 }
769 
770 /* 32 bit load 32 bit scaled or unscaled
771    zero- or sign-extended 32-bit register offset  */
772 static void
773 ldr32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
774 {
775   unsigned rm = INSTR (20, 16);
776   unsigned rn = INSTR (9, 5);
777   unsigned rt = INSTR (4, 0);
778   /* rn may reference SP, rm and rt must reference ZR  */
779 
780   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
781   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
782   uint64_t displacement =  OPT_SCALE (extended, 32, scaling);
783 
784   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
785   aarch64_set_reg_u64 (cpu, rt, NO_SP,
786 		       aarch64_get_mem_u32 (cpu, address + displacement));
787 }
788 
789 /* 64 bit load 64 bit scaled unsigned 12 bit  */
790 static void
791 ldr_abs (sim_cpu *cpu, uint32_t offset)
792 {
793   unsigned rn = INSTR (9, 5);
794   unsigned rt = INSTR (4, 0);
795 
796   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
797   /* The target register may not be SP but the source may be.  */
798   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64
799 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
800 			+ SCALE (offset, 64)));
801 }
802 
803 /* 64 bit load 64 bit unscaled signed 9 bit with pre- or post-writeback.  */
804 static void
805 ldr_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
806 {
807   unsigned rn = INSTR (9, 5);
808   unsigned rt = INSTR (4, 0);
809   uint64_t address;
810 
811   if (rn == rt && wb != NoWriteBack)
812     HALT_UNALLOC;
813 
814   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
815 
816   if (wb != Post)
817     address += offset;
818 
819   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
820   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address));
821 
822   if (wb == Post)
823     address += offset;
824 
825   if (wb != NoWriteBack)
826     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
827 }
828 
829 /* 64 bit load 64 bit scaled or unscaled zero-
830    or sign-extended 32-bit register offset.  */
831 static void
832 ldr_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
833 {
834   unsigned rm = INSTR (20, 16);
835   unsigned rn = INSTR (9, 5);
836   unsigned rt = INSTR (4, 0);
837   /* rn may reference SP, rm and rt must reference ZR  */
838 
839   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
840   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
841   uint64_t displacement =  OPT_SCALE (extended, 64, scaling);
842 
843   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
844   aarch64_set_reg_u64 (cpu, rt, NO_SP,
845 		       aarch64_get_mem_u64 (cpu, address + displacement));
846 }
847 
848 /* 32 bit load zero-extended byte scaled unsigned 12 bit.  */
849 static void
850 ldrb32_abs (sim_cpu *cpu, uint32_t offset)
851 {
852   unsigned rn = INSTR (9, 5);
853   unsigned rt = INSTR (4, 0);
854 
855   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
856   /* The target register may not be SP but the source may be.
857      There is no scaling required for a byte load.  */
858   aarch64_set_reg_u64 (cpu, rt, NO_SP,
859 		       aarch64_get_mem_u8
860 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
861 }
862 
863 /* 32 bit load zero-extended byte unscaled signed 9 bit with pre- or post-writeback.  */
864 static void
865 ldrb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
866 {
867   unsigned rn = INSTR (9, 5);
868   unsigned rt = INSTR (4, 0);
869   uint64_t address;
870 
871   if (rn == rt && wb != NoWriteBack)
872     HALT_UNALLOC;
873 
874   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
875 
876   if (wb != Post)
877     address += offset;
878 
879   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
880   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address));
881 
882   if (wb == Post)
883     address += offset;
884 
885   if (wb != NoWriteBack)
886     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
887 }
888 
889 /* 32 bit load zero-extended byte scaled or unscaled zero-
890    or sign-extended 32-bit register offset.  */
891 static void
892 ldrb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
893 {
894   unsigned rm = INSTR (20, 16);
895   unsigned rn = INSTR (9, 5);
896   unsigned rt = INSTR (4, 0);
897   /* rn may reference SP, rm and rt must reference ZR  */
898 
899   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
900   int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
901 				 extension);
902 
903   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
904   /* There is no scaling required for a byte load.  */
905   aarch64_set_reg_u64 (cpu, rt, NO_SP,
906 		       aarch64_get_mem_u8 (cpu, address + displacement));
907 }
908 
909 /* 64 bit load sign-extended byte unscaled signed 9 bit
910    with pre- or post-writeback.  */
911 static void
912 ldrsb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
913 {
914   unsigned rn = INSTR (9, 5);
915   unsigned rt = INSTR (4, 0);
916   uint64_t address;
917   int64_t val;
918 
919   if (rn == rt && wb != NoWriteBack)
920     HALT_UNALLOC;
921 
922   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
923 
924   if (wb != Post)
925     address += offset;
926 
927   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
928   val = aarch64_get_mem_s8 (cpu, address);
929   aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
930 
931   if (wb == Post)
932     address += offset;
933 
934   if (wb != NoWriteBack)
935     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
936 }
937 
938 /* 64 bit load sign-extended byte scaled unsigned 12 bit.  */
939 static void
940 ldrsb_abs (sim_cpu *cpu, uint32_t offset)
941 {
942   ldrsb_wb (cpu, offset, NoWriteBack);
943 }
944 
945 /* 64 bit load sign-extended byte scaled or unscaled zero-
946    or sign-extended 32-bit register offset.  */
947 static void
948 ldrsb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
949 {
950   unsigned rm = INSTR (20, 16);
951   unsigned rn = INSTR (9, 5);
952   unsigned rt = INSTR (4, 0);
953   /* rn may reference SP, rm and rt must reference ZR  */
954 
955   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
956   int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
957 				 extension);
958   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
959   /* There is no scaling required for a byte load.  */
960   aarch64_set_reg_s64 (cpu, rt, NO_SP,
961 		       aarch64_get_mem_s8 (cpu, address + displacement));
962 }
963 
964 /* 32 bit load zero-extended short scaled unsigned 12 bit.  */
965 static void
966 ldrh32_abs (sim_cpu *cpu, uint32_t offset)
967 {
968   unsigned rn = INSTR (9, 5);
969   unsigned rt = INSTR (4, 0);
970   uint32_t val;
971 
972   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
973   /* The target register may not be SP but the source may be.  */
974   val = aarch64_get_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
975 			     + SCALE (offset, 16));
976   aarch64_set_reg_u32 (cpu, rt, NO_SP, val);
977 }
978 
979 /* 32 bit load zero-extended short unscaled signed 9 bit
980    with pre- or post-writeback.  */
981 static void
982 ldrh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
983 {
984   unsigned rn = INSTR (9, 5);
985   unsigned rt = INSTR (4, 0);
986   uint64_t address;
987 
988   if (rn == rt && wb != NoWriteBack)
989     HALT_UNALLOC;
990 
991   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
992 
993   if (wb != Post)
994     address += offset;
995 
996   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
997   aarch64_set_reg_u32 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address));
998 
999   if (wb == Post)
1000     address += offset;
1001 
1002   if (wb != NoWriteBack)
1003     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1004 }
1005 
1006 /* 32 bit load zero-extended short scaled or unscaled zero-
1007    or sign-extended 32-bit register offset.  */
1008 static void
1009 ldrh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1010 {
1011   unsigned rm = INSTR (20, 16);
1012   unsigned rn = INSTR (9, 5);
1013   unsigned rt = INSTR (4, 0);
1014   /* rn may reference SP, rm and rt must reference ZR  */
1015 
1016   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1017   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1018   uint64_t displacement =  OPT_SCALE (extended, 16, scaling);
1019 
1020   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1021   aarch64_set_reg_u32 (cpu, rt, NO_SP,
1022 		       aarch64_get_mem_u16 (cpu, address + displacement));
1023 }
1024 
1025 /* 32 bit load sign-extended short scaled unsigned 12 bit.  */
1026 static void
1027 ldrsh32_abs (sim_cpu *cpu, uint32_t offset)
1028 {
1029   unsigned rn = INSTR (9, 5);
1030   unsigned rt = INSTR (4, 0);
1031   int32_t val;
1032 
1033   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1034   /* The target register may not be SP but the source may be.  */
1035   val = aarch64_get_mem_s16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
1036 			     + SCALE (offset, 16));
1037   aarch64_set_reg_s32 (cpu, rt, NO_SP, val);
1038 }
1039 
1040 /* 32 bit load sign-extended short unscaled signed 9 bit
1041    with pre- or post-writeback.  */
1042 static void
1043 ldrsh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1044 {
1045   unsigned rn = INSTR (9, 5);
1046   unsigned rt = INSTR (4, 0);
1047   uint64_t address;
1048 
1049   if (rn == rt && wb != NoWriteBack)
1050     HALT_UNALLOC;
1051 
1052   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1053 
1054   if (wb != Post)
1055     address += offset;
1056 
1057   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1058   aarch64_set_reg_s32 (cpu, rt, NO_SP,
1059 		       (int32_t) aarch64_get_mem_s16 (cpu, address));
1060 
1061   if (wb == Post)
1062     address += offset;
1063 
1064   if (wb != NoWriteBack)
1065     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1066 }
1067 
1068 /* 32 bit load sign-extended short scaled or unscaled zero-
1069    or sign-extended 32-bit register offset.  */
1070 static void
1071 ldrsh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1072 {
1073   unsigned rm = INSTR (20, 16);
1074   unsigned rn = INSTR (9, 5);
1075   unsigned rt = INSTR (4, 0);
1076   /* rn may reference SP, rm and rt must reference ZR  */
1077 
1078   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1079   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1080   uint64_t displacement =  OPT_SCALE (extended, 16, scaling);
1081 
1082   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1083   aarch64_set_reg_s32 (cpu, rt, NO_SP,
1084 		       (int32_t) aarch64_get_mem_s16
1085 		       (cpu, address + displacement));
1086 }
1087 
1088 /* 64 bit load sign-extended short scaled unsigned 12 bit.  */
1089 static void
1090 ldrsh_abs (sim_cpu *cpu, uint32_t offset)
1091 {
1092   unsigned rn = INSTR (9, 5);
1093   unsigned rt = INSTR (4, 0);
1094   int64_t val;
1095 
1096   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1097   /* The target register may not be SP but the source may be.  */
1098   val = aarch64_get_mem_s16  (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
1099 			      + SCALE (offset, 16));
1100   aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
1101 }
1102 
1103 /* 64 bit load sign-extended short unscaled signed 9 bit
1104    with pre- or post-writeback.  */
1105 static void
1106 ldrsh64_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1107 {
1108   unsigned rn = INSTR (9, 5);
1109   unsigned rt = INSTR (4, 0);
1110   uint64_t address;
1111   int64_t val;
1112 
1113   if (rn == rt && wb != NoWriteBack)
1114     HALT_UNALLOC;
1115 
1116   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1117   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1118 
1119   if (wb != Post)
1120     address += offset;
1121 
1122   val = aarch64_get_mem_s16 (cpu, address);
1123   aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
1124 
1125   if (wb == Post)
1126     address += offset;
1127 
1128   if (wb != NoWriteBack)
1129     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1130 }
1131 
1132 /* 64 bit load sign-extended short scaled or unscaled zero-
1133    or sign-extended 32-bit register offset.  */
1134 static void
1135 ldrsh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1136 {
1137   unsigned rm = INSTR (20, 16);
1138   unsigned rn = INSTR (9, 5);
1139   unsigned rt = INSTR (4, 0);
1140 
1141   /* rn may reference SP, rm and rt must reference ZR  */
1142 
1143   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1144   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1145   uint64_t displacement = OPT_SCALE (extended, 16, scaling);
1146   int64_t val;
1147 
1148   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1149   val = aarch64_get_mem_s16 (cpu, address + displacement);
1150   aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
1151 }
1152 
1153 /* 64 bit load sign-extended 32 bit scaled unsigned 12 bit.  */
1154 static void
1155 ldrsw_abs (sim_cpu *cpu, uint32_t offset)
1156 {
1157   unsigned rn = INSTR (9, 5);
1158   unsigned rt = INSTR (4, 0);
1159   int64_t val;
1160 
1161   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1162   val = aarch64_get_mem_s32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
1163 			     + SCALE (offset, 32));
1164   /* The target register may not be SP but the source may be.  */
1165   aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
1166 }
1167 
1168 /* 64 bit load sign-extended 32 bit unscaled signed 9 bit
1169    with pre- or post-writeback.  */
1170 static void
1171 ldrsw_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1172 {
1173   unsigned rn = INSTR (9, 5);
1174   unsigned rt = INSTR (4, 0);
1175   uint64_t address;
1176 
1177   if (rn == rt && wb != NoWriteBack)
1178     HALT_UNALLOC;
1179 
1180   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1181 
1182   if (wb != Post)
1183     address += offset;
1184 
1185   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1186   aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s32 (cpu, address));
1187 
1188   if (wb == Post)
1189     address += offset;
1190 
1191   if (wb != NoWriteBack)
1192     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1193 }
1194 
1195 /* 64 bit load sign-extended 32 bit scaled or unscaled zero-
1196    or sign-extended 32-bit register offset.  */
1197 static void
1198 ldrsw_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1199 {
1200   unsigned rm = INSTR (20, 16);
1201   unsigned rn = INSTR (9, 5);
1202   unsigned rt = INSTR (4, 0);
1203   /* rn may reference SP, rm and rt must reference ZR  */
1204 
1205   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1206   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1207   uint64_t displacement =  OPT_SCALE (extended, 32, scaling);
1208 
1209   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1210   aarch64_set_reg_s64 (cpu, rt, NO_SP,
1211 		       aarch64_get_mem_s32 (cpu, address + displacement));
1212 }
1213 
1214 /* N.B. with stores the value in source is written to the
1215    address identified by source2 modified by source3/offset.  */
1216 
1217 /* 32 bit store scaled unsigned 12 bit.  */
1218 static void
1219 str32_abs (sim_cpu *cpu, uint32_t offset)
1220 {
1221   unsigned rn = INSTR (9, 5);
1222   unsigned rt = INSTR (4, 0);
1223 
1224   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1225   /* The target register may not be SP but the source may be.  */
1226   aarch64_set_mem_u32 (cpu, (aarch64_get_reg_u64 (cpu, rn, SP_OK)
1227 			     + SCALE (offset, 32)),
1228 		       aarch64_get_reg_u32 (cpu, rt, NO_SP));
1229 }
1230 
1231 /* 32 bit store unscaled signed 9 bit with pre- or post-writeback.  */
1232 static void
1233 str32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1234 {
1235   unsigned rn = INSTR (9, 5);
1236   unsigned rt = INSTR (4, 0);
1237   uint64_t address;
1238 
1239   if (rn == rt && wb != NoWriteBack)
1240     HALT_UNALLOC;
1241 
1242   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1243   if (wb != Post)
1244     address += offset;
1245 
1246   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1247   aarch64_set_mem_u32 (cpu, address, aarch64_get_reg_u32 (cpu, rt, NO_SP));
1248 
1249   if (wb == Post)
1250     address += offset;
1251 
1252   if (wb != NoWriteBack)
1253     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1254 }
1255 
1256 /* 32 bit store scaled or unscaled zero- or
1257    sign-extended 32-bit register offset.  */
1258 static void
1259 str32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1260 {
1261   unsigned rm = INSTR (20, 16);
1262   unsigned rn = INSTR (9, 5);
1263   unsigned rt = INSTR (4, 0);
1264 
1265   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1266   int64_t  extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1267   uint64_t displacement = OPT_SCALE (extended, 32, scaling);
1268 
1269   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1270   aarch64_set_mem_u32 (cpu, address + displacement,
1271 		       aarch64_get_reg_u32 (cpu, rt, NO_SP));
1272 }
1273 
1274 /* 64 bit store scaled unsigned 12 bit.  */
1275 static void
1276 str_abs (sim_cpu *cpu, uint32_t offset)
1277 {
1278   unsigned rn = INSTR (9, 5);
1279   unsigned rt = INSTR (4, 0);
1280 
1281   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1282   aarch64_set_mem_u64 (cpu,
1283 		       aarch64_get_reg_u64 (cpu, rn, SP_OK)
1284 		       + SCALE (offset, 64),
1285 		       aarch64_get_reg_u64 (cpu, rt, NO_SP));
1286 }
1287 
1288 /* 64 bit store unscaled signed 9 bit with pre- or post-writeback.  */
1289 static void
1290 str_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1291 {
1292   unsigned rn = INSTR (9, 5);
1293   unsigned rt = INSTR (4, 0);
1294   uint64_t address;
1295 
1296   if (rn == rt && wb != NoWriteBack)
1297     HALT_UNALLOC;
1298 
1299   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1300 
1301   if (wb != Post)
1302     address += offset;
1303 
1304   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1305   aarch64_set_mem_u64 (cpu, address, aarch64_get_reg_u64 (cpu, rt, NO_SP));
1306 
1307   if (wb == Post)
1308     address += offset;
1309 
1310   if (wb != NoWriteBack)
1311     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1312 }
1313 
1314 /* 64 bit store scaled or unscaled zero-
1315    or sign-extended 32-bit register offset.  */
1316 static void
1317 str_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1318 {
1319   unsigned rm = INSTR (20, 16);
1320   unsigned rn = INSTR (9, 5);
1321   unsigned rt = INSTR (4, 0);
1322   /* rn may reference SP, rm and rt must reference ZR  */
1323 
1324   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1325   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1326 			       extension);
1327   uint64_t displacement = OPT_SCALE (extended, 64, scaling);
1328 
1329   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1330   aarch64_set_mem_u64 (cpu, address + displacement,
1331 		       aarch64_get_reg_u64 (cpu, rt, NO_SP));
1332 }
1333 
1334 /* 32 bit store byte scaled unsigned 12 bit.  */
1335 static void
1336 strb_abs (sim_cpu *cpu, uint32_t offset)
1337 {
1338   unsigned rn = INSTR (9, 5);
1339   unsigned rt = INSTR (4, 0);
1340 
1341   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1342   /* The target register may not be SP but the source may be.
1343      There is no scaling required for a byte store.  */
1344   aarch64_set_mem_u8 (cpu,
1345 		      aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
1346 		      aarch64_get_reg_u8 (cpu, rt, NO_SP));
1347 }
1348 
1349 /* 32 bit store byte unscaled signed 9 bit with pre- or post-writeback.  */
1350 static void
1351 strb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1352 {
1353   unsigned rn = INSTR (9, 5);
1354   unsigned rt = INSTR (4, 0);
1355   uint64_t address;
1356 
1357   if (rn == rt && wb != NoWriteBack)
1358     HALT_UNALLOC;
1359 
1360   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1361 
1362   if (wb != Post)
1363     address += offset;
1364 
1365   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1366   aarch64_set_mem_u8 (cpu, address, aarch64_get_reg_u8 (cpu, rt, NO_SP));
1367 
1368   if (wb == Post)
1369     address += offset;
1370 
1371   if (wb != NoWriteBack)
1372     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1373 }
1374 
1375 /* 32 bit store byte scaled or unscaled zero-
1376    or sign-extended 32-bit register offset.  */
1377 static void
1378 strb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1379 {
1380   unsigned rm = INSTR (20, 16);
1381   unsigned rn = INSTR (9, 5);
1382   unsigned rt = INSTR (4, 0);
1383   /* rn may reference SP, rm and rt must reference ZR  */
1384 
1385   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1386   int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1387 				 extension);
1388 
1389   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1390   /* There is no scaling required for a byte store.  */
1391   aarch64_set_mem_u8 (cpu, address + displacement,
1392 		      aarch64_get_reg_u8 (cpu, rt, NO_SP));
1393 }
1394 
1395 /* 32 bit store short scaled unsigned 12 bit.  */
1396 static void
1397 strh_abs (sim_cpu *cpu, uint32_t offset)
1398 {
1399   unsigned rn = INSTR (9, 5);
1400   unsigned rt = INSTR (4, 0);
1401 
1402   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1403   /* The target register may not be SP but the source may be.  */
1404   aarch64_set_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
1405 		       + SCALE (offset, 16),
1406 		       aarch64_get_reg_u16 (cpu, rt, NO_SP));
1407 }
1408 
1409 /* 32 bit store short unscaled signed 9 bit with pre- or post-writeback.  */
1410 static void
1411 strh_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1412 {
1413   unsigned rn = INSTR (9, 5);
1414   unsigned rt = INSTR (4, 0);
1415   uint64_t address;
1416 
1417   if (rn == rt && wb != NoWriteBack)
1418     HALT_UNALLOC;
1419 
1420   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1421 
1422   if (wb != Post)
1423     address += offset;
1424 
1425   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1426   aarch64_set_mem_u16 (cpu, address, aarch64_get_reg_u16 (cpu, rt, NO_SP));
1427 
1428   if (wb == Post)
1429     address += offset;
1430 
1431   if (wb != NoWriteBack)
1432     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1433 }
1434 
1435 /* 32 bit store short scaled or unscaled zero-
1436    or sign-extended 32-bit register offset.  */
1437 static void
1438 strh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1439 {
1440   unsigned rm = INSTR (20, 16);
1441   unsigned rn = INSTR (9, 5);
1442   unsigned rt = INSTR (4, 0);
1443   /* rn may reference SP, rm and rt must reference ZR  */
1444 
1445   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1446   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1447   uint64_t displacement =  OPT_SCALE (extended, 16, scaling);
1448 
1449   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1450   aarch64_set_mem_u16 (cpu, address + displacement,
1451 		       aarch64_get_reg_u16 (cpu, rt, NO_SP));
1452 }
1453 
1454 /* Prefetch unsigned 12 bit.  */
1455 static void
1456 prfm_abs (sim_cpu *cpu, uint32_t offset)
1457 {
1458   /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
1459                           00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
1460                           00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
1461                           10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
1462                           10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
1463                           10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
1464                           ow ==> UNALLOC
1465      PrfOp prfop = prfop (instr, 4, 0);
1466      uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK)
1467      + SCALE (offset, 64).  */
1468 
1469   /* TODO : implement prefetch of address.  */
1470 }
1471 
1472 /* Prefetch scaled or unscaled zero- or sign-extended 32-bit register offset.  */
1473 static void
1474 prfm_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1475 {
1476   /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
1477                           00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
1478                           00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
1479                           10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
1480                           10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
1481                           10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
1482                           ow ==> UNALLOC
1483      rn may reference SP, rm may only reference ZR
1484      PrfOp prfop = prfop (instr, 4, 0);
1485      uint64_t base = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1486      int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1487                                 extension);
1488      uint64_t displacement =  OPT_SCALE (extended, 64, scaling);
1489      uint64_t address = base + displacement.  */
1490 
1491   /* TODO : implement prefetch of address  */
1492 }
1493 
1494 /* 64 bit pc-relative prefetch.  */
1495 static void
1496 prfm_pcrel (sim_cpu *cpu, int32_t offset)
1497 {
1498   /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
1499                           00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
1500                           00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
1501                           10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
1502                           10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
1503                           10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
1504                           ow ==> UNALLOC
1505      PrfOp prfop = prfop (instr, 4, 0);
1506      uint64_t address = aarch64_get_PC (cpu) + offset.  */
1507 
1508   /* TODO : implement this  */
1509 }
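
/* Editorial sketch of how the prfop field documented above would be
   unpacked if prefetching were implemented: bit 4 selects load (PLDL*)
   versus store (PSTL*) hints, bits [2,1] the target cache level, and
   bit 0 KEEP versus STRM.  */
static int
example_prfop_is_store_hint (uint32_t instr)
{
  uint32_t prfop = uimm (instr, 4, 0);

  return (prfop & 0x10) != 0;   /* 1xxxx ==> PSTL*, 0xxxx ==> PLDL*.  */
}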
1510 
1511 /* Load-store exclusive.  */
1512 
1513 static void
1514 ldxr (sim_cpu *cpu)
1515 {
1516   unsigned rn = INSTR (9, 5);
1517   unsigned rt = INSTR (4, 0);
1518   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1519   int size = INSTR (31, 30);
1520   /* int ordered = INSTR (15, 15);  */
1521   /* int exclusive = ! INSTR (23, 23);  */
1522 
1523   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1524   switch (size)
1525     {
1526     case 0:
1527       aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address));
1528       break;
1529     case 1:
1530       aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address));
1531       break;
1532     case 2:
1533       aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address));
1534       break;
1535     case 3:
1536       aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address));
1537       break;
1538     }
1539 }
1540 
1541 static void
1542 stxr (sim_cpu *cpu)
1543 {
1544   unsigned rn = INSTR (9, 5);
1545   unsigned rt = INSTR (4, 0);
1546   unsigned rs = INSTR (20, 16);
1547   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1548   int      size = INSTR (31, 30);
1549   uint64_t data = aarch64_get_reg_u64 (cpu, rt, NO_SP);
1550 
1551   switch (size)
1552     {
1553     case 0: aarch64_set_mem_u8 (cpu, address, data); break;
1554     case 1: aarch64_set_mem_u16 (cpu, address, data); break;
1555     case 2: aarch64_set_mem_u32 (cpu, address, data); break;
1556     case 3: aarch64_set_mem_u64 (cpu, address, data); break;
1557     }
1558 
1559   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1560   aarch64_set_reg_u64 (cpu, rs, NO_SP, 0); /* The exclusive store always succeeds.  */
1561 }
1562 
1563 static void
1564 dexLoadLiteral (sim_cpu *cpu)
1565 {
1566   /* instr[29,27] == 011
1567      instr[25,24] == 00
1568      instr[31,30:26] = opc: 000 ==> LDRW,  001 ==> FLDRS
1569                             010 ==> LDRX,  011 ==> FLDRD
1570                             100 ==> LDRSW, 101 ==> FLDRQ
1571                             110 ==> PRFM, 111 ==> UNALLOC
1572      instr[26] ==> V : 0 ==> GReg, 1 ==> FReg
1573      instr[23, 5] == simm19  */
1574 
1575   /* unsigned rt = INSTR (4, 0);  */
1576   uint32_t dispatch = (INSTR (31, 30) << 1) | INSTR (26, 26);
1577   int32_t imm = simm32 (aarch64_get_instr (cpu), 23, 5);
1578 
1579   switch (dispatch)
1580     {
1581     case 0: ldr32_pcrel (cpu, imm); break;
1582     case 1: fldrs_pcrel (cpu, imm); break;
1583     case 2: ldr_pcrel   (cpu, imm); break;
1584     case 3: fldrd_pcrel (cpu, imm); break;
1585     case 4: ldrsw_pcrel (cpu, imm); break;
1586     case 5: fldrq_pcrel (cpu, imm); break;
1587     case 6: prfm_pcrel  (cpu, imm); break;
1588     case 7:
1589     default:
1590       HALT_UNALLOC;
1591     }
1592 }
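
/* Editorial example of the dispatch computation above: for LDRSW
   (literal) the opc field instr[31,30] is 10 and the V bit instr[26]
   is 0, so dispatch == (2 << 1) | 0 == 4, selecting ldrsw_pcrel.  */
static uint32_t
example_literal_dispatch (uint32_t instr)
{
  uint32_t opc = uimm (instr, 31, 30);
  uint32_t v   = uimm (instr, 26, 26);

  return (opc << 1) | v;
}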
1593 
1594 /* Immediate arithmetic
1595    The aimm argument is a 12 bit unsigned value or a 12 bit unsigned
1596    value left shifted by 12 bits (done at decode).
1597 
1598    N.B. the register args (dest, source) can normally be Xn or SP.
1599    The exception occurs for flag setting instructions, which may
1600    only use Xn for the output (dest).  */
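
/* Editorial sketch of the "done at decode" step mentioned above: the
   decoder hands these helpers either the raw 12-bit immediate or the
   same value pre-shifted by 12, so e.g. ADD X0, X1, #1, LSL #12
   arrives here as aimm == 0x1000.  */
static uint32_t
example_decode_aimm (uint32_t imm12, uint32_t shifted)
{
  return shifted ? (imm12 << 12) : imm12;
}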
1601 
1602 /* 32 bit add immediate.  */
1603 static void
1604 add32 (sim_cpu *cpu, uint32_t aimm)
1605 {
1606   unsigned rn = INSTR (9, 5);
1607   unsigned rd = INSTR (4, 0);
1608 
1609   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1610   aarch64_set_reg_u64 (cpu, rd, SP_OK,
1611 		       aarch64_get_reg_u32 (cpu, rn, SP_OK) + aimm);
1612 }
1613 
1614 /* 64 bit add immediate.  */
1615 static void
1616 add64 (sim_cpu *cpu, uint32_t aimm)
1617 {
1618   unsigned rn = INSTR (9, 5);
1619   unsigned rd = INSTR (4, 0);
1620 
1621   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1622   aarch64_set_reg_u64 (cpu, rd, SP_OK,
1623 		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + aimm);
1624 }
1625 
1626 static void
1627 set_flags_for_add32 (sim_cpu *cpu, int32_t value1, int32_t value2)
1628 {
1629   int32_t   result = value1 + value2;
1630   int64_t   sresult = (int64_t) value1 + (int64_t) value2;
1631   uint64_t  uresult = (uint64_t)(uint32_t) value1
1632     + (uint64_t)(uint32_t) value2;
1633   uint32_t  flags = 0;
1634 
1635   if (result == 0)
1636     flags |= Z;
1637 
1638   if (result & (1 << 31))
1639     flags |= N;
1640 
1641   if (uresult != (uint32_t)uresult)
1642     flags |= C;
1643 
1644   if (sresult != (int32_t)sresult)
1645     flags |= V;
1646 
1647   aarch64_set_CPSR (cpu, flags);
1648 }
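
/* Editorial worked example of the rules above: 0x7fffffff + 1 gives a
   32-bit result of 0x80000000, so N and V are set (negative result,
   signed overflow) while C and Z stay clear (no unsigned carry,
   non-zero result).  */
static uint32_t
example_add32_flags (void)
{
  int64_t  sresult = (int64_t) 0x7fffffff + 1;      /* 0x80000000.  */
  uint64_t uresult = (uint64_t) 0x7fffffffu + 1u;   /* Fits in 32 bits.  */
  uint32_t flags = 0;

  if ((uint32_t) sresult == 0)
    flags |= Z;                   /* Not taken: result is non-zero.  */
  if ((uint32_t) sresult & (1u << 31))
    flags |= N;                   /* Taken: bit 31 of the result is set.  */
  if (uresult != (uint32_t) uresult)
    flags |= C;                   /* Not taken: no unsigned carry out.  */
  if (sresult != (int32_t) sresult)
    flags |= V;                   /* Taken: signed 32-bit overflow.  */

  return flags;                   /* N | V.  */
}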
1649 
1650 #define NEG(a) (((a) & signbit) == signbit)
1651 #define POS(a) (((a) & signbit) == 0)
1652 
1653 static void
1654 set_flags_for_add64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
1655 {
1656   uint64_t result = value1 + value2;
1657   uint32_t flags = 0;
1658   uint64_t signbit = 1ULL << 63;
1659 
1660   if (result == 0)
1661     flags |= Z;
1662 
1663   if (NEG (result))
1664     flags |= N;
1665 
1666   if (   (NEG (value1) && NEG (value2))
1667       || (NEG (value1) && POS (result))
1668       || (NEG (value2) && POS (result)))
1669     flags |= C;
1670 
1671   if (   (NEG (value1) && NEG (value2) && POS (result))
1672       || (POS (value1) && POS (value2) && NEG (result)))
1673     flags |= V;
1674 
1675   aarch64_set_CPSR (cpu, flags);
1676 }
1677 
1678 static void
1679 set_flags_for_sub32 (sim_cpu *cpu, uint32_t value1, uint32_t value2)
1680 {
1681   uint32_t result = value1 - value2;
1682   uint32_t flags = 0;
1683   uint32_t signbit = 1U << 31;
1684 
1685   if (result == 0)
1686     flags |= Z;
1687 
1688   if (NEG (result))
1689     flags |= N;
1690 
1691   if (   (NEG (value1) && POS (value2))
1692       || (NEG (value1) && POS (result))
1693       || (POS (value2) && POS (result)))
1694     flags |= C;
1695 
1696   if (   (NEG (value1) && POS (value2) && POS (result))
1697       || (POS (value1) && NEG (value2) && NEG (result)))
1698     flags |= V;
1699 
1700   aarch64_set_CPSR (cpu, flags);
1701 }
1702 
1703 static void
1704 set_flags_for_sub64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
1705 {
1706   uint64_t result = value1 - value2;
1707   uint32_t flags = 0;
1708   uint64_t signbit = 1ULL << 63;
1709 
1710   if (result == 0)
1711     flags |= Z;
1712 
1713   if (NEG (result))
1714     flags |= N;
1715 
1716   if (   (NEG (value1) && POS (value2))
1717       || (NEG (value1) && POS (result))
1718       || (POS (value2) && POS (result)))
1719     flags |= C;
1720 
1721   if (   (NEG (value1) && POS (value2) && POS (result))
1722       || (POS (value1) && NEG (value2) && NEG (result)))
1723     flags |= V;
1724 
1725   aarch64_set_CPSR (cpu, flags);
1726 }
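
/* Editorial note: for subtraction the AArch64 C flag means "no
   borrow", so the three-term test above is equivalent to an unsigned
   comparison.  A minimal sketch of that equivalence (the helper name
   is hypothetical):  */
#if 0
static uint32_t
example_sub64_carry (uint64_t value1, uint64_t value2)
{
  return value1 >= value2;	/* C == 1 iff no borrow occurs.  */
}
#endif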
1727 
1728 static void
1729 set_flags_for_binop32 (sim_cpu *cpu, uint32_t result)
1730 {
1731   uint32_t flags = 0;
1732 
1733   if (result == 0)
1734     flags |= Z;
1735   else
1736     flags &= ~ Z;
1737 
1738   if (result & (1u << 31))
1739     flags |= N;
1740   else
1741     flags &= ~ N;
1742 
1743   aarch64_set_CPSR (cpu, flags);
1744 }
1745 
1746 static void
1747 set_flags_for_binop64 (sim_cpu *cpu, uint64_t result)
1748 {
1749   uint32_t flags = 0;
1750 
1751   if (result == 0)
1752     flags |= Z;
1753   else
1754     flags &= ~ Z;
1755 
1756   if (result & (1ULL << 63))
1757     flags |= N;
1758   else
1759     flags &= ~ N;
1760 
1761   aarch64_set_CPSR (cpu, flags);
1762 }
1763 
1764 /* 32 bit add immediate set flags.  */
1765 static void
1766 adds32 (sim_cpu *cpu, uint32_t aimm)
1767 {
1768   unsigned rn = INSTR (9, 5);
1769   unsigned rd = INSTR (4, 0);
1770   /* TODO : do we need to worry about signs here?  */
1771   int32_t value1 = aarch64_get_reg_s32 (cpu, rn, SP_OK);
1772 
1773   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1774   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + aimm);
1775   set_flags_for_add32 (cpu, value1, aimm);
1776 }
1777 
1778 /* 64 bit add immediate set flags.  */
1779 static void
1780 adds64 (sim_cpu *cpu, uint32_t aimm)
1781 {
1782   unsigned rn = INSTR (9, 5);
1783   unsigned rd = INSTR (4, 0);
1784   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1785   uint64_t value2 = aimm;
1786 
1787   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1788   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1789   set_flags_for_add64 (cpu, value1, value2);
1790 }
1791 
1792 /* 32 bit sub immediate.  */
1793 static void
1794 sub32 (sim_cpu *cpu, uint32_t aimm)
1795 {
1796   unsigned rn = INSTR (9, 5);
1797   unsigned rd = INSTR (4, 0);
1798 
1799   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1800   aarch64_set_reg_u64 (cpu, rd, SP_OK,
1801 		       aarch64_get_reg_u32 (cpu, rn, SP_OK) - aimm);
1802 }
1803 
1804 /* 64 bit sub immediate.  */
1805 static void
1806 sub64 (sim_cpu *cpu, uint32_t aimm)
1807 {
1808   unsigned rn = INSTR (9, 5);
1809   unsigned rd = INSTR (4, 0);
1810 
1811   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1812   aarch64_set_reg_u64 (cpu, rd, SP_OK,
1813 		       aarch64_get_reg_u64 (cpu, rn, SP_OK) - aimm);
1814 }
1815 
1816 /* 32 bit sub immediate set flags.  */
1817 static void
1818 subs32 (sim_cpu *cpu, uint32_t aimm)
1819 {
1820   unsigned rn = INSTR (9, 5);
1821   unsigned rd = INSTR (4, 0);
1822   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
1823   uint32_t value2 = aimm;
1824 
1825   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1826   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
1827   set_flags_for_sub32 (cpu, value1, value2);
1828 }
1829 
1830 /* 64 bit sub immediate set flags.  */
1831 static void
1832 subs64 (sim_cpu *cpu, uint32_t aimm)
1833 {
1834   unsigned rn = INSTR (9, 5);
1835   unsigned rd = INSTR (4, 0);
1836   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1837   uint64_t value2 = aimm;
1838 
1839   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1840   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
1841   set_flags_for_sub64 (cpu, value1, value2);
1842 }
1843 
1844 /* Data Processing Register.  */
1845 
1846 /* First two helpers to perform the shift operations.  */
1847 
1848 static inline uint32_t
1849 shifted32 (uint32_t value, Shift shift, uint32_t count)
1850 {
1851   switch (shift)
1852     {
1853     default:
1854     case LSL:
1855       return (value << count);
1856     case LSR:
1857       return (value >> count);
1858     case ASR:
1859       {
1860 	int32_t svalue = value;
1861 	return (svalue >> count);
1862       }
1863     case ROR:
1864       {
1865 	uint32_t top = value >> count;
1866 	uint32_t bottom = value << ((32 - count) & 31);
1867 	return (bottom | top);
1868       }
1869     }
1870 }
1871 
1872 static inline uint64_t
1873 shifted64 (uint64_t value, Shift shift, uint32_t count)
1874 {
1875   switch (shift)
1876     {
1877     default:
1878     case LSL:
1879       return (value << count);
1880     case LSR:
1881       return (value >> count);
1882     case ASR:
1883       {
1884 	int64_t svalue = value;
1885 	return (svalue >> count);
1886       }
1887     case ROR:
1888       {
1889 	uint64_t top = value >> count;
1890 	uint64_t bottom = value << ((64 - count) & 63);
1891 	return (bottom | top);
1892       }
1893     }
1894 }
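
/* Editorial example: with the helpers above an instruction such as
   ADD W0, W1, W2, LSL #3 is emulated as w1 + shifted32 (w2, LSL, 3),
   i.e. w1 + (w2 << 3).  The ROR cases mask the reverse shift count
   so that a rotate by zero does not perform an (undefined) shift by
   the full register width.  */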
1895 
1896 /* Arithmetic shifted register.
1897    These allow an optional LSL, ASR or LSR to the second source
1898    register with a count up to the register bit count.
1899 
1900    N.B register args may not be SP.  */
1901 
1902 /* 32 bit ADD shifted register.  */
1903 static void
1904 add32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1905 {
1906   unsigned rm = INSTR (20, 16);
1907   unsigned rn = INSTR (9, 5);
1908   unsigned rd = INSTR (4, 0);
1909 
1910   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1911   aarch64_set_reg_u64 (cpu, rd, NO_SP,
1912 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
1913 		       + shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1914 				    shift, count));
1915 }
1916 
1917 /* 64 bit ADD shifted register.  */
1918 static void
1919 add64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1920 {
1921   unsigned rm = INSTR (20, 16);
1922   unsigned rn = INSTR (9, 5);
1923   unsigned rd = INSTR (4, 0);
1924 
1925   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1926   aarch64_set_reg_u64 (cpu, rd, NO_SP,
1927 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
1928 		       + shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
1929 				    shift, count));
1930 }
1931 
1932 /* 32 bit ADD shifted register setting flags.  */
1933 static void
1934 adds32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1935 {
1936   unsigned rm = INSTR (20, 16);
1937   unsigned rn = INSTR (9, 5);
1938   unsigned rd = INSTR (4, 0);
1939 
1940   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
1941   uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1942 			       shift, count);
1943 
1944   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1945   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1946   set_flags_for_add32 (cpu, value1, value2);
1947 }
1948 
1949 /* 64 bit ADD shifted register setting flags.  */
1950 static void
1951 adds64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1952 {
1953   unsigned rm = INSTR (20, 16);
1954   unsigned rn = INSTR (9, 5);
1955   unsigned rd = INSTR (4, 0);
1956 
1957   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
1958   uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
1959 			       shift, count);
1960 
1961   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1962   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1963   set_flags_for_add64 (cpu, value1, value2);
1964 }
1965 
1966 /* 32 bit SUB shifted register.  */
1967 static void
1968 sub32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1969 {
1970   unsigned rm = INSTR (20, 16);
1971   unsigned rn = INSTR (9, 5);
1972   unsigned rd = INSTR (4, 0);
1973 
1974   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1975   aarch64_set_reg_u64 (cpu, rd, NO_SP,
1976 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
1977 		       - shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1978 				    shift, count));
1979 }
1980 
1981 /* 64 bit SUB shifted register.  */
1982 static void
1983 sub64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1984 {
1985   unsigned rm = INSTR (20, 16);
1986   unsigned rn = INSTR (9, 5);
1987   unsigned rd = INSTR (4, 0);
1988 
1989   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1990   aarch64_set_reg_u64 (cpu, rd, NO_SP,
1991 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
1992 		       - shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
1993 				    shift, count));
1994 }
1995 
1996 /* 32 bit SUB shifted register setting flags.  */
1997 static void
1998 subs32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1999 {
2000   unsigned rm = INSTR (20, 16);
2001   unsigned rn = INSTR (9, 5);
2002   unsigned rd = INSTR (4, 0);
2003 
2004   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2005   uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
2006 			      shift, count);
2007 
2008   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2009   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2010   set_flags_for_sub32 (cpu, value1, value2);
2011 }
2012 
2013 /* 64 bit SUB shifted register setting flags.  */
2014 static void
2015 subs64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
2016 {
2017   unsigned rm = INSTR (20, 16);
2018   unsigned rn = INSTR (9, 5);
2019   unsigned rd = INSTR (4, 0);
2020 
2021   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2022   uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
2023 			       shift, count);
2024 
2025   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2026   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2027   set_flags_for_sub64 (cpu, value1, value2);
2028 }
2029 
2030 /* First a couple more helpers to fetch the
2031    relevant source register element either
2032    sign or zero extended as required by the
2033    extension value.  */
2034 
2035 static uint32_t
2036 extreg32 (sim_cpu *cpu, unsigned int lo, Extension extension)
2037 {
2038   switch (extension)
2039     {
2040     case UXTB: return aarch64_get_reg_u8  (cpu, lo, NO_SP);
2041     case UXTH: return aarch64_get_reg_u16 (cpu, lo, NO_SP);
2042     case UXTW: /* Fall through.  */
2043     case UXTX: return aarch64_get_reg_u32 (cpu, lo, NO_SP);
2044     case SXTB: return aarch64_get_reg_s8  (cpu, lo, NO_SP);
2045     case SXTH: return aarch64_get_reg_s16 (cpu, lo, NO_SP);
2046     case SXTW: /* Fall through.  */
2047     case SXTX: /* Fall through.  */
2048     default:   return aarch64_get_reg_s32 (cpu, lo, NO_SP);
2049     }
2050 }
2051 
2052 static uint64_t
2053 extreg64 (sim_cpu *cpu, unsigned int lo, Extension extension)
2054 {
2055   switch (extension)
2056     {
2057     case UXTB: return aarch64_get_reg_u8  (cpu, lo, NO_SP);
2058     case UXTH: return aarch64_get_reg_u16 (cpu, lo, NO_SP);
2059     case UXTW: return aarch64_get_reg_u32 (cpu, lo, NO_SP);
2060     case UXTX: return aarch64_get_reg_u64 (cpu, lo, NO_SP);
2061     case SXTB: return aarch64_get_reg_s8  (cpu, lo, NO_SP);
2062     case SXTH: return aarch64_get_reg_s16 (cpu, lo, NO_SP);
2063     case SXTW: return aarch64_get_reg_s32 (cpu, lo, NO_SP);
2064     case SXTX:
2065     default:   return aarch64_get_reg_s64 (cpu, lo, NO_SP);
2066     }
2067 }
2068 
2069 /* Arithmetic extending register
2070    These allow an optional sign extension of some portion of the
2071    second source register followed by an optional left shift of
2072    between 0 and 4 bits.
2073 
2074    N.B output (dest) and first input arg (source) may normally be Xn
2075    or SP. However, for flag setting operations dest can only be
2076    Xn. Second input registers are always Xn.  */
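
/* Editorial example: ADD X0, SP, W1, UXTB #2 is emulated by
   add64_ext below as sp + (extreg64 (w1, UXTB) << 2), i.e. the low
   byte of W1 zero extended and then shifted left by two bits.  */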
2077 
2078 /* 32 bit ADD extending register.  */
2079 static void
2080 add32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2081 {
2082   unsigned rm = INSTR (20, 16);
2083   unsigned rn = INSTR (9, 5);
2084   unsigned rd = INSTR (4, 0);
2085 
2086   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2087   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2088 		       aarch64_get_reg_u32 (cpu, rn, SP_OK)
2089 		       + (extreg32 (cpu, rm, extension) << shift));
2090 }
2091 
2092 /* 64 bit ADD extending register.
2093    N.B. This subsumes the case with 64 bit source2 and UXTX #n or LSL #0.  */
2094 static void
2095 add64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2096 {
2097   unsigned rm = INSTR (20, 16);
2098   unsigned rn = INSTR (9, 5);
2099   unsigned rd = INSTR (4, 0);
2100 
2101   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2102   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2103 		       aarch64_get_reg_u64 (cpu, rn, SP_OK)
2104 		       + (extreg64 (cpu, rm, extension) << shift));
2105 }
2106 
2107 /* 32 bit ADD extending register setting flags.  */
2108 static void
2109 adds32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2110 {
2111   unsigned rm = INSTR (20, 16);
2112   unsigned rn = INSTR (9, 5);
2113   unsigned rd = INSTR (4, 0);
2114 
2115   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
2116   uint32_t value2 = extreg32 (cpu, rm, extension) << shift;
2117 
2118   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2119   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
2120   set_flags_for_add32 (cpu, value1, value2);
2121 }
2122 
2123 /* 64 bit ADD extending register setting flags  */
2124 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0  */
2125 static void
2126 adds64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2127 {
2128   unsigned rm = INSTR (20, 16);
2129   unsigned rn = INSTR (9, 5);
2130   unsigned rd = INSTR (4, 0);
2131 
2132   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
2133   uint64_t value2 = extreg64 (cpu, rm, extension) << shift;
2134 
2135   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2136   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
2137   set_flags_for_add64 (cpu, value1, value2);
2138 }
2139 
2140 /* 32 bit SUB extending register.  */
2141 static void
2142 sub32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2143 {
2144   unsigned rm = INSTR (20, 16);
2145   unsigned rn = INSTR (9, 5);
2146   unsigned rd = INSTR (4, 0);
2147 
2148   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2149   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2150 		       aarch64_get_reg_u32 (cpu, rn, SP_OK)
2151 		       - (extreg32 (cpu, rm, extension) << shift));
2152 }
2153 
2154 /* 64 bit SUB extending register.  */
2155 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0.  */
2156 static void
2157 sub64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2158 {
2159   unsigned rm = INSTR (20, 16);
2160   unsigned rn = INSTR (9, 5);
2161   unsigned rd = INSTR (4, 0);
2162 
2163   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2164   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2165 		       aarch64_get_reg_u64 (cpu, rn, SP_OK)
2166 		       - (extreg64 (cpu, rm, extension) << shift));
2167 }
2168 
2169 /* 32 bit SUB extending register setting flags.  */
2170 static void
2171 subs32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2172 {
2173   unsigned rm = INSTR (20, 16);
2174   unsigned rn = INSTR (9, 5);
2175   unsigned rd = INSTR (4, 0);
2176 
2177   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
2178   uint32_t value2 = extreg32 (cpu, rm, extension) << shift;
2179 
2180   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2181   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2182   set_flags_for_sub32 (cpu, value1, value2);
2183 }
2184 
2185 /* 64 bit SUB extending register setting flags  */
2186 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0  */
2187 static void
2188 subs64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2189 {
2190   unsigned rm = INSTR (20, 16);
2191   unsigned rn = INSTR (9, 5);
2192   unsigned rd = INSTR (4, 0);
2193 
2194   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
2195   uint64_t value2 = extreg64 (cpu, rm, extension) << shift;
2196 
2197   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2198   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2199   set_flags_for_sub64 (cpu, value1, value2);
2200 }
2201 
2202 static void
2203 dexAddSubtractImmediate (sim_cpu *cpu)
2204 {
2205   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2206      instr[30]    = op : 0 ==> ADD, 1 ==> SUB
2207      instr[29]    = set : 0 ==> no flags, 1 ==> set flags
2208      instr[28,24] = 10001
2209      instr[23,22] = shift : 00 ==> LSL#0, 01 ==> LSL#12, 1x ==> UNALLOC
2210      instr[21,10] = uimm12
2211      instr[9,5]   = Rn
2212      instr[4,0]   = Rd  */
2213 
2214   /* N.B. the shift is applied at decode before calling the add/sub routine.  */
2215   uint32_t shift = INSTR (23, 22);
2216   uint32_t imm = INSTR (21, 10);
2217   uint32_t dispatch = INSTR (31, 29);
2218 
2219   NYI_assert (28, 24, 0x11);
2220 
2221   if (shift > 1)
2222     HALT_UNALLOC;
2223 
2224   if (shift)
2225     imm <<= 12;
2226 
2227   switch (dispatch)
2228     {
2229     case 0: add32 (cpu, imm); break;
2230     case 1: adds32 (cpu, imm); break;
2231     case 2: sub32 (cpu, imm); break;
2232     case 3: subs32 (cpu, imm); break;
2233     case 4: add64 (cpu, imm); break;
2234     case 5: adds64 (cpu, imm); break;
2235     case 6: sub64 (cpu, imm); break;
2236     case 7: subs64 (cpu, imm); break;
2237     }
2238 }
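
/* Editorial worked example: for SUBS X1, X2, #16, LSL #12 the
   size:op:set bits are 1:1:1, so dispatch == 7 selects subs64, and
   imm is widened at decode to 16 << 12 == 0x10000.  */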
2239 
2240 static void
2241 dexAddSubtractShiftedRegister (sim_cpu *cpu)
2242 {
2243   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2244      instr[30,29] = op : 00 ==> ADD, 01 ==> ADDS, 10 ==> SUB, 11 ==> SUBS
2245      instr[28,24] = 01011
2246      instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> UNALLOC
2247      instr[21]    = 0
2248      instr[20,16] = Rm
2249      instr[15,10] = count : must be 0xxxxx for 32 bit
2250      instr[9,5]   = Rn
2251      instr[4,0]   = Rd  */
2252 
2253   uint32_t size = INSTR (31, 31);
2254   uint32_t count = INSTR (15, 10);
2255   Shift shiftType = INSTR (23, 22);
2256 
2257   NYI_assert (28, 24, 0x0B);
2258   NYI_assert (21, 21, 0);
2259 
2260   /* Shift encoded as ROR is unallocated.  */
2261   if (shiftType == ROR)
2262     HALT_UNALLOC;
2263 
2264   /* 32 bit operations must have count[5] = 0
2265      or else we have an UNALLOC.  */
2266   if (size == 0 && uimm (count, 5, 5))
2267     HALT_UNALLOC;
2268 
2269   /* Dispatch on size:op, i.e. instr[31,29].  */
2270   switch (INSTR (31, 29))
2271     {
2272     case 0: add32_shift  (cpu, shiftType, count); break;
2273     case 1: adds32_shift (cpu, shiftType, count); break;
2274     case 2: sub32_shift  (cpu, shiftType, count); break;
2275     case 3: subs32_shift (cpu, shiftType, count); break;
2276     case 4: add64_shift  (cpu, shiftType, count); break;
2277     case 5: adds64_shift (cpu, shiftType, count); break;
2278     case 6: sub64_shift  (cpu, shiftType, count); break;
2279     case 7: subs64_shift (cpu, shiftType, count); break;
2280     }
2281 }
2282 
2283 static void
2284 dexAddSubtractExtendedRegister (sim_cpu *cpu)
2285 {
2286   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2287      instr[30]    = op : 0 ==> ADD, 1 ==> SUB
2288      instr[29]    = set? : 0 ==> no flags, 1 ==> set flags
2289      instr[28,24] = 01011
2290      instr[23,22] = opt : 0 ==> ok, 1,2,3 ==> UNALLOC
2291      instr[21]    = 1
2292      instr[20,16] = Rm
2293      instr[15,13] = option : 000 ==> UXTB, 001 ==> UXTH,
2294                              010 ==> UXTW|LSL, 011 ==> UXTX,
2295                              100 ==> SXTB, 101 ==> SXTH,
2296                              110 ==> SXTW, 111 ==> SXTX
2297      instr[12,10] = shift : 0,1,2,3,4 ==> ok, 5,6,7 ==> UNALLOC
2298      instr[9,5]   = Rn
2299      instr[4,0]   = Rd  */
2300 
2301   Extension extensionType = INSTR (15, 13);
2302   uint32_t shift = INSTR (12, 10);
2303 
2304   NYI_assert (28, 24, 0x0B);
2305   NYI_assert (21, 21, 1);
2306 
2307   /* Shift may not exceed 4; instr[23,22] must be zero.  */
2308   if (shift > 4 || INSTR (23, 22) != 0)
2309     HALT_UNALLOC;
2310 
2311   /* Dispatch on size:op:set?.  */
2312   switch (INSTR (31, 29))
2313     {
2314     case 0: add32_ext  (cpu, extensionType, shift); break;
2315     case 1: adds32_ext (cpu, extensionType, shift); break;
2316     case 2: sub32_ext  (cpu, extensionType, shift); break;
2317     case 3: subs32_ext (cpu, extensionType, shift); break;
2318     case 4: add64_ext  (cpu, extensionType, shift); break;
2319     case 5: adds64_ext (cpu, extensionType, shift); break;
2320     case 6: sub64_ext  (cpu, extensionType, shift); break;
2321     case 7: subs64_ext (cpu, extensionType, shift); break;
2322     }
2323 }
2324 
2325 /* Conditional data processing
2326    Condition register is implicit 3rd source.  */
2327 
2328 /* 32 bit add with carry.  */
2329 /* N.B register args may not be SP.  */
2330 
2331 static void
2332 adc32 (sim_cpu *cpu)
2333 {
2334   unsigned rm = INSTR (20, 16);
2335   unsigned rn = INSTR (9, 5);
2336   unsigned rd = INSTR (4, 0);
2337 
2338   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2339   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2340 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
2341 		       + aarch64_get_reg_u32 (cpu, rm, NO_SP)
2342 		       + IS_SET (C));
2343 }
2344 
2345 /* 64 bit add with carry  */
2346 static void
2347 adc64 (sim_cpu *cpu)
2348 {
2349   unsigned rm = INSTR (20, 16);
2350   unsigned rn = INSTR (9, 5);
2351   unsigned rd = INSTR (4, 0);
2352 
2353   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2354   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2355 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
2356 		       + aarch64_get_reg_u64 (cpu, rm, NO_SP)
2357 		       + IS_SET (C));
2358 }
2359 
2360 /* 32 bit add with carry setting flags.  */
2361 static void
2362 adcs32 (sim_cpu *cpu)
2363 {
2364   unsigned rm = INSTR (20, 16);
2365   unsigned rn = INSTR (9, 5);
2366   unsigned rd = INSTR (4, 0);
2367 
2368   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2369   uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
2370   uint32_t carry = IS_SET (C);
2371 
2372   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2373   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry);
2374   set_flags_for_add32 (cpu, value1, value2 + carry);
2375 }
2376 
2377 /* 64 bit add with carry setting flags.  */
2378 static void
2379 adcs64 (sim_cpu *cpu)
2380 {
2381   unsigned rm = INSTR (20, 16);
2382   unsigned rn = INSTR (9, 5);
2383   unsigned rd = INSTR (4, 0);
2384 
2385   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2386   uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
2387   uint64_t carry = IS_SET (C);
2388 
2389   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2390   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry);
2391   set_flags_for_add64 (cpu, value1, value2 + carry);
2392 }
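
/* Editorial sketch (not in the original source): the ADDS/ADC pair
   above is what makes guest multi-precision arithmetic work.  A
   128-bit add of (hi:lo) pairs follows the same pattern, with the
   comparison standing in for the C flag.  The helper name is
   hypothetical.  */
#if 0
static void
example_add128 (uint64_t a_hi, uint64_t a_lo,
		uint64_t b_hi, uint64_t b_lo,
		uint64_t *r_hi, uint64_t *r_lo)
{
  uint64_t lo = a_lo + b_lo;	/* ADDS: C set on unsigned overflow.  */
  uint64_t carry = lo < a_lo;	/* The C flag.  */

  *r_lo = lo;
  *r_hi = a_hi + b_hi + carry;	/* ADC: consumes C.  */
}
#endif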
2393 
2394 /* 32 bit sub with carry.  */
2395 static void
2396 sbc32 (sim_cpu *cpu)
2397 {
2398   unsigned rm = INSTR (20, 16);
2399   unsigned rn = INSTR (9, 5); /* ngc iff rn == 31.  */
2400   unsigned rd = INSTR (4, 0);
2401 
2402   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2403   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2404 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
2405 		       - aarch64_get_reg_u32 (cpu, rm, NO_SP)
2406 		       - 1 + IS_SET (C));
2407 }
2408 
2409 /* 64 bit sub with carry  */
2410 static void
2411 sbc64 (sim_cpu *cpu)
2412 {
2413   unsigned rm = INSTR (20, 16);
2414   unsigned rn = INSTR (9, 5);
2415   unsigned rd = INSTR (4, 0);
2416 
2417   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2418   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2419 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
2420 		       - aarch64_get_reg_u64 (cpu, rm, NO_SP)
2421 		       - 1 + IS_SET (C));
2422 }
2423 
2424 /* 32 bit sub with carry setting flags  */
2425 static void
2426 sbcs32 (sim_cpu *cpu)
2427 {
2428   unsigned rm = INSTR (20, 16);
2429   unsigned rn = INSTR (9, 5);
2430   unsigned rd = INSTR (4, 0);
2431 
2432   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2433   uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
2434   uint32_t carry  = IS_SET (C);
2435   uint32_t result = value1 - (value2 + 1 - carry);
2436 
2437   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2438   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
2439   set_flags_for_sub32 (cpu, value1, value2 + 1 - carry);
2440 }
2441 
2442 /* 64 bit sub with carry setting flags  */
2443 static void
2444 sbcs64 (sim_cpu *cpu)
2445 {
2446   unsigned rm = INSTR (20, 16);
2447   unsigned rn = INSTR (9, 5);
2448   unsigned rd = INSTR (4, 0);
2449 
2450   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2451   uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
2452   uint64_t carry  = IS_SET (C);
2453   uint64_t result = value1 - (value2 + 1 - carry);
2454 
2455   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2456   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
2457   set_flags_for_sub64 (cpu, value1, value2 + 1 - carry);
2458 }
2459 
2460 static void
2461 dexAddSubtractWithCarry (sim_cpu *cpu)
2462 {
2463   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2464      instr[30]    = op : 0 ==> ADC, 1 ==> SBC
2465      instr[29]    = set? : 0 ==> no flags, 1 ==> set flags
2466      instr[28,21] = 1 1010 000
2467      instr[20,16] = Rm
2468      instr[15,10] = op2 : 000000 ==> ok, otherwise ==> UNALLOC
2469      instr[9,5]   = Rn
2470      instr[4,0]   = Rd  */
2471 
2472   uint32_t op2 = INSTR (15, 10);
2473 
2474   NYI_assert (28, 21, 0xD0);
2475 
2476   if (op2 != 0)
2477     HALT_UNALLOC;
2478 
2479   /* Dispatch on size:op:set?.  */
2480   switch (INSTR (31, 29))
2481     {
2482     case 0: adc32 (cpu); break;
2483     case 1: adcs32 (cpu); break;
2484     case 2: sbc32 (cpu); break;
2485     case 3: sbcs32 (cpu); break;
2486     case 4: adc64 (cpu); break;
2487     case 5: adcs64 (cpu); break;
2488     case 6: sbc64 (cpu); break;
2489     case 7: sbcs64 (cpu); break;
2490     }
2491 }
2492 
2493 static uint32_t
2494 testConditionCode (sim_cpu *cpu, CondCode cc)
2495 {
2496   /* This should be reducible to branchless logic
2497      by some careful testing of bits in CC followed
2498      by the requisite masking and combining of bits
2499      from the flag register.
2500 
2501      For now we do it with a switch.  */
2502   int res;
2503 
2504   switch (cc)
2505     {
2506     case EQ:  res = IS_SET (Z);    break;
2507     case NE:  res = IS_CLEAR (Z);  break;
2508     case CS:  res = IS_SET (C);    break;
2509     case CC:  res = IS_CLEAR (C);  break;
2510     case MI:  res = IS_SET (N);    break;
2511     case PL:  res = IS_CLEAR (N);  break;
2512     case VS:  res = IS_SET (V);    break;
2513     case VC:  res = IS_CLEAR (V);  break;
2514     case HI:  res = IS_SET (C) && IS_CLEAR (Z);  break;
2515     case LS:  res = IS_CLEAR (C) || IS_SET (Z);  break;
2516     case GE:  res = IS_SET (N) == IS_SET (V);    break;
2517     case LT:  res = IS_SET (N) != IS_SET (V);    break;
2518     case GT:  res = IS_CLEAR (Z) && (IS_SET (N) == IS_SET (V));  break;
2519     case LE:  res = IS_SET (Z) || (IS_SET (N) != IS_SET (V));    break;
2520     case AL:
2521     case NV:
2522     default:
2523       res = 1;
2524       break;
2525     }
2526   return res;
2527 }
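
/* Editorial worked example: after a compare (SUBS) of 3 - 5 the
   result is -2, so N == 1, Z == 0, C == 0 (a borrow occurred) and
   V == 0.  GE (N == V) is then false and LT (N != V) is true, as
   expected for signed 3 < 5, while CC (C clear) holds for the
   unsigned comparison 3 < 5.  */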
2528 
2529 static void
2530 CondCompare (sim_cpu *cpu) /* aka: ccmp and ccmn  */
2531 {
2532   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2533      instr[30]    = compare with positive (1) or negative value (0)
2534      instr[29,21] = 1 1101 0010
2535      instr[20,16] = Rm or const
2536      instr[15,12] = cond
2537      instr[11]    = compare reg (0) or const (1)
2538      instr[10]    = 0
2539      instr[9,5]   = Rn
2540      instr[4]     = 0
2541      instr[3,0]   = value for CPSR bits if the comparison does not take place.  */
2542   signed int negate;
2543   unsigned rm;
2544   unsigned rn;
2545 
2546   NYI_assert (29, 21, 0x1d2);
2547   NYI_assert (10, 10, 0);
2548   NYI_assert (4, 4, 0);
2549 
2550   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2551   if (! testConditionCode (cpu, INSTR (15, 12)))
2552     {
2553       aarch64_set_CPSR (cpu, INSTR (3, 0));
2554       return;
2555     }
2556 
2557   negate = INSTR (30, 30) ? 1 : -1;
2558   rm = INSTR (20, 16);
2559   rn = INSTR ( 9,  5);
2560 
2561   if (INSTR (31, 31))
2562     {
2563       if (INSTR (11, 11))
2564 	set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK),
2565 			     negate * (uint64_t) rm);
2566       else
2567 	set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK),
2568 			     negate * aarch64_get_reg_u64 (cpu, rm, SP_OK));
2569     }
2570   else
2571     {
2572       if (INSTR (11, 11))
2573 	set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK),
2574 			     negate * rm);
2575       else
2576 	set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK),
2577 			     negate * aarch64_get_reg_u32 (cpu, rm, SP_OK));
2578     }
2579 }
2580 
2581 static void
2582 do_vec_MOV_whole_vector (sim_cpu *cpu)
2583 {
2584   /* MOV Vd.T, Vs.T  (alias for ORR Vd.T, Vn.T, Vm.T where Vn == Vm)
2585 
2586      instr[31]    = 0
2587      instr[30]    = half(0)/full(1)
2588      instr[29,21] = 001110101
2589      instr[20,16] = Vs
2590      instr[15,10] = 000111
2591      instr[9,5]   = Vs
2592      instr[4,0]   = Vd  */
2593 
2594   unsigned vs = INSTR (9, 5);
2595   unsigned vd = INSTR (4, 0);
2596 
2597   NYI_assert (29, 21, 0x075);
2598   NYI_assert (15, 10, 0x07);
2599 
2600   if (INSTR (20, 16) != vs)
2601     HALT_NYI;
2602 
2603   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2604   if (INSTR (30, 30))
2605     aarch64_set_vec_u64 (cpu, vd, 1, aarch64_get_vec_u64 (cpu, vs, 1));
2606 
2607   aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vs, 0));
2608 }
2609 
2610 static void
2611 do_vec_SMOV_into_scalar (sim_cpu *cpu)
2612 {
2613   /* instr[31]    = 0
2614      instr[30]    = word(0)/long(1)
2615      instr[29,21] = 00 1110 000
2616      instr[20,16] = element size and index
2617      instr[15,10] = 00 0010 11
2618      instr[9,5]   = V source
2619      instr[4,0]   = R dest  */
2620 
2621   unsigned vs = INSTR (9, 5);
2622   unsigned rd = INSTR (4, 0);
2623   unsigned imm5 = INSTR (20, 16);
2624   unsigned full = INSTR (30, 30);
2625   int size, index;
2626 
2627   NYI_assert (29, 21, 0x070);
2628   NYI_assert (15, 10, 0x0B);
2629 
2630   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2631 
2632   if (imm5 & 0x1)
2633     {
2634       size = 0;
2635       index = (imm5 >> 1) & 0xF;
2636     }
2637   else if (imm5 & 0x2)
2638     {
2639       size = 1;
2640       index = (imm5 >> 2) & 0x7;
2641     }
2642   else if (full && (imm5 & 0x4))
2643     {
2644       size = 2;
2645       index = (imm5 >> 3) & 0x3;
2646     }
2647   else
2648     HALT_UNALLOC;
2649 
2650   switch (size)
2651     {
2652     case 0:
2653       if (full)
2654 	aarch64_set_reg_s64 (cpu, rd, NO_SP,
2655 			     aarch64_get_vec_s8 (cpu, vs, index));
2656       else
2657 	aarch64_set_reg_s32 (cpu, rd, NO_SP,
2658 			     aarch64_get_vec_s8 (cpu, vs, index));
2659       break;
2660 
2661     case 1:
2662       if (full)
2663 	aarch64_set_reg_s64 (cpu, rd, NO_SP,
2664 			     aarch64_get_vec_s16 (cpu, vs, index));
2665       else
2666 	aarch64_set_reg_s32 (cpu, rd, NO_SP,
2667 			     aarch64_get_vec_s16 (cpu, vs, index));
2668       break;
2669 
2670     case 2:
2671       aarch64_set_reg_s64 (cpu, rd, NO_SP,
2672 			   aarch64_get_vec_s32 (cpu, vs, index));
2673       break;
2674 
2675     default:
2676       HALT_UNALLOC;
2677     }
2678 }
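
/* Editorial note on the imm5 decode above (also used by the UMOV
   and INS handlers below): the position of the lowest set bit
   selects the element size and the bits above it form the index.
   For example imm5 == 0b00110 has bit 1 as its lowest set bit, so
   the element is 16 bits wide and index == (imm5 >> 2) == 1.  */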
2679 
2680 static void
2681 do_vec_UMOV_into_scalar (sim_cpu *cpu)
2682 {
2683   /* instr[31]    = 0
2684      instr[30]    = word(0)/long(1)
2685      instr[29,21] = 00 1110 000
2686      instr[20,16] = element size and index
2687      instr[15,10] = 00 0011 11
2688      instr[9,5]   = V source
2689      instr[4,0]   = R dest  */
2690 
2691   unsigned vs = INSTR (9, 5);
2692   unsigned rd = INSTR (4, 0);
2693   unsigned imm5 = INSTR (20, 16);
2694   unsigned full = INSTR (30, 30);
2695   int size, index;
2696 
2697   NYI_assert (29, 21, 0x070);
2698   NYI_assert (15, 10, 0x0F);
2699 
2700   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2701 
2702   if (!full)
2703     {
2704       if (imm5 & 0x1)
2705 	{
2706 	  size = 0;
2707 	  index = (imm5 >> 1) & 0xF;
2708 	}
2709       else if (imm5 & 0x2)
2710 	{
2711 	  size = 1;
2712 	  index = (imm5 >> 2) & 0x7;
2713 	}
2714       else if (imm5 & 0x4)
2715 	{
2716 	  size = 2;
2717 	  index = (imm5 >> 3) & 0x3;
2718 	}
2719       else
2720 	HALT_UNALLOC;
2721     }
2722   else if (imm5 & 0x8)
2723     {
2724       size = 3;
2725       index = (imm5 >> 4) & 0x1;
2726     }
2727   else
2728     HALT_UNALLOC;
2729 
2730   switch (size)
2731     {
2732     case 0:
2733       aarch64_set_reg_u32 (cpu, rd, NO_SP,
2734 			   aarch64_get_vec_u8 (cpu, vs, index));
2735       break;
2736 
2737     case 1:
2738       aarch64_set_reg_u32 (cpu, rd, NO_SP,
2739 			   aarch64_get_vec_u16 (cpu, vs, index));
2740       break;
2741 
2742     case 2:
2743       aarch64_set_reg_u32 (cpu, rd, NO_SP,
2744 			   aarch64_get_vec_u32 (cpu, vs, index));
2745       break;
2746 
2747     case 3:
2748       aarch64_set_reg_u64 (cpu, rd, NO_SP,
2749 			   aarch64_get_vec_u64 (cpu, vs, index));
2750       break;
2751 
2752     default:
2753       HALT_UNALLOC;
2754     }
2755 }
2756 
2757 static void
2758 do_vec_INS (sim_cpu *cpu)
2759 {
2760   /* instr[31,21] = 01001110000
2761      instr[20,16] = element size and index
2762      instr[15,10] = 000111
2763      instr[9,5]   = W source
2764      instr[4,0]   = V dest  */
2765 
2766   int index;
2767   unsigned rs = INSTR (9, 5);
2768   unsigned vd = INSTR (4, 0);
2769 
2770   NYI_assert (31, 21, 0x270);
2771   NYI_assert (15, 10, 0x07);
2772 
2773   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2774   if (INSTR (16, 16))
2775     {
2776       index = INSTR (20, 17);
2777       aarch64_set_vec_u8 (cpu, vd, index,
2778 			  aarch64_get_reg_u8 (cpu, rs, NO_SP));
2779     }
2780   else if (INSTR (17, 17))
2781     {
2782       index = INSTR (20, 18);
2783       aarch64_set_vec_u16 (cpu, vd, index,
2784 			   aarch64_get_reg_u16 (cpu, rs, NO_SP));
2785     }
2786   else if (INSTR (18, 18))
2787     {
2788       index = INSTR (20, 19);
2789       aarch64_set_vec_u32 (cpu, vd, index,
2790 			   aarch64_get_reg_u32 (cpu, rs, NO_SP));
2791     }
2792   else if (INSTR (19, 19))
2793     {
2794       index = INSTR (20, 20);
2795       aarch64_set_vec_u64 (cpu, vd, index,
2796 			   aarch64_get_reg_u64 (cpu, rs, NO_SP));
2797     }
2798   else
2799     HALT_NYI;
2800 }
2801 
2802 static void
2803 do_vec_DUP_vector_into_vector (sim_cpu *cpu)
2804 {
2805   /* instr[31]    = 0
2806      instr[30]    = half(0)/full(1)
2807      instr[29,21] = 00 1110 000
2808      instr[20,16] = element size and index
2809      instr[15,10] = 0000 01
2810      instr[9,5]   = V source
2811      instr[4,0]   = V dest.  */
2812 
2813   unsigned full = INSTR (30, 30);
2814   unsigned vs = INSTR (9, 5);
2815   unsigned vd = INSTR (4, 0);
2816   int i, index;
2817 
2818   NYI_assert (29, 21, 0x070);
2819   NYI_assert (15, 10, 0x01);
2820 
2821   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2822   if (INSTR (16, 16))
2823     {
2824       index = INSTR (20, 17);
2825 
2826       for (i = 0; i < (full ? 16 : 8); i++)
2827 	aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vs, index));
2828     }
2829   else if (INSTR (17, 17))
2830     {
2831       index = INSTR (20, 18);
2832 
2833       for (i = 0; i < (full ? 8 : 4); i++)
2834 	aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vs, index));
2835     }
2836   else if (INSTR (18, 18))
2837     {
2838       index = INSTR (20, 19);
2839 
2840       for (i = 0; i < (full ? 4 : 2); i++)
2841 	aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vs, index));
2842     }
2843   else
2844     {
2845       if (INSTR (19, 19) == 0)
2846 	HALT_UNALLOC;
2847 
2848       if (! full)
2849 	HALT_UNALLOC;
2850 
2851       index = INSTR (20, 20);
2852 
2853       for (i = 0; i < 2; i++)
2854 	aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vs, index));
2855     }
2856 }
2857 
2858 static void
2859 do_vec_TBL (sim_cpu *cpu)
2860 {
2861   /* instr[31]    = 0
2862      instr[30]    = half(0)/full(1)
2863      instr[29,21] = 00 1110 000
2864      instr[20,16] = Vm
2865      instr[15]    = 0
2866      instr[14,13] = vec length
2867      instr[12,10] = 000
2868      instr[9,5]   = V start
2869      instr[4,0]   = V dest  */
2870 
2871   int full    = INSTR (30, 30);
2872   int len     = INSTR (14, 13) + 1;
2873   unsigned vm = INSTR (20, 16);
2874   unsigned vn = INSTR (9, 5);
2875   unsigned vd = INSTR (4, 0);
2876   unsigned i;
2877 
2878   NYI_assert (29, 21, 0x070);
2879   NYI_assert (12, 10, 0);
2880 
2881   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2882   for (i = 0; i < (full ? 16 : 8); i++)
2883     {
2884       unsigned int selector = aarch64_get_vec_u8 (cpu, vm, i);
2885       uint8_t val;
2886 
2887       if (selector < 16)
2888 	val = aarch64_get_vec_u8 (cpu, vn, selector);
2889       else if (selector < 32)
2890 	val = len < 2 ? 0 : aarch64_get_vec_u8 (cpu, vn + 1, selector - 16);
2891       else if (selector < 48)
2892 	val = len < 3 ? 0 : aarch64_get_vec_u8 (cpu, vn + 2, selector - 32);
2893       else if (selector < 64)
2894 	val = len < 4 ? 0 : aarch64_get_vec_u8 (cpu, vn + 3, selector - 48);
2895       else
2896 	val = 0;
2897 
2898       aarch64_set_vec_u8 (cpu, vd, i, val);
2899     }
2900 }
2901 
2902 static void
2903 do_vec_TRN (sim_cpu *cpu)
2904 {
2905   /* instr[31]    = 0
2906      instr[30]    = half(0)/full(1)
2907      instr[29,24] = 00 1110
2908      instr[23,22] = size
2909      instr[21]    = 0
2910      instr[20,16] = Vm
2911      instr[15]    = 0
2912      instr[14]    = TRN1 (0) / TRN2 (1)
2913      instr[13,10] = 1010
2914      instr[9,5]   = V source
2915      instr[4,0]   = V dest.  */
2916 
2917   int full    = INSTR (30, 30);
2918   int second  = INSTR (14, 14);
2919   unsigned vm = INSTR (20, 16);
2920   unsigned vn = INSTR (9, 5);
2921   unsigned vd = INSTR (4, 0);
2922   unsigned i;
2923 
2924   NYI_assert (29, 24, 0x0E);
2925   NYI_assert (13, 10, 0xA);
2926 
2927   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2928   switch (INSTR (23, 22))
2929     {
2930     case 0:
2931       for (i = 0; i < (full ? 8 : 4); i++)
2932 	{
2933 	  aarch64_set_vec_u8
2934 	    (cpu, vd, i * 2,
2935 	     aarch64_get_vec_u8 (cpu, vn, i * 2 + second));
2936 	  aarch64_set_vec_u8
2937 	    (cpu, vd, i * 2 + 1,
2938 	     aarch64_get_vec_u8 (cpu, vm, i * 2 + second));
2939 	}
2940       break;
2941 
2942     case 1:
2943       for (i = 0; i < (full ? 4 : 2); i++)
2944 	{
2945 	  aarch64_set_vec_u16
2946 	    (cpu, vd, i * 2,
2947 	     aarch64_get_vec_u16 (cpu, vn, i * 2 + second));
2948 	  aarch64_set_vec_u16
2949 	    (cpu, vd, i * 2 + 1,
2950 	     aarch64_get_vec_u16 (cpu, vm, i * 2 + second));
2951 	}
2952       break;
2953 
2954     case 2:
2955       for (i = 0; i < (full ? 2 : 1); i++)
2956 	{
2957 	  aarch64_set_vec_u32
2958 	    (cpu, vd, i * 2,
2959 	     aarch64_get_vec_u32 (cpu, vn, i * 2 + second));
2960 	  aarch64_set_vec_u32
2961 	    (cpu, vd, i * 2 + 1,
2962 	     aarch64_get_vec_u32 (cpu, vm, i * 2 + second));
2963 	}
2964       break;
2965 
2966     case 3:
2967       if (! full)
2968 	HALT_UNALLOC;
2969       aarch64_set_vec_u64 (cpu, vd, 0,
2970 			   aarch64_get_vec_u64 (cpu, vn, second));
2971       aarch64_set_vec_u64 (cpu, vd, 1,
2972 			   aarch64_get_vec_u64 (cpu, vm, second));
2973       break;
2974     }
2975 }
2976 
2977 static void
2978 do_vec_DUP_scalar_into_vector (sim_cpu *cpu)
2979 {
2980   /* instr[31]    = 0
2981      instr[30]    = 0=> zero top 64-bits, 1=> duplicate into top 64-bits
2982                     [must be 1 for 64-bit xfer]
2983      instr[29,20] = 00 1110 0000
2984      instr[19,16] = element size: 0001=> 8-bits, 0010=> 16-bits,
2985                                   0100=> 32-bits, 1000=> 64-bits
2986      instr[15,10] = 0000 11
2987      instr[9,5]   = W source
2988      instr[4,0]   = V dest.  */
2989 
2990   unsigned i;
2991   unsigned Vd = INSTR (4, 0);
2992   unsigned Rs = INSTR (9, 5);
2993   int both    = INSTR (30, 30);
2994 
2995   NYI_assert (29, 20, 0x0E0);
2996   NYI_assert (15, 10, 0x03);
2997 
2998   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2999   switch (INSTR (19, 16))
3000     {
3001     case 1:
3002       for (i = 0; i < (both ? 16 : 8); i++)
3003 	aarch64_set_vec_u8 (cpu, Vd, i, aarch64_get_reg_u8 (cpu, Rs, NO_SP));
3004       break;
3005 
3006     case 2:
3007       for (i = 0; i < (both ? 8 : 4); i++)
3008 	aarch64_set_vec_u16 (cpu, Vd, i, aarch64_get_reg_u16 (cpu, Rs, NO_SP));
3009       break;
3010 
3011     case 4:
3012       for (i = 0; i < (both ? 4 : 2); i++)
3013 	aarch64_set_vec_u32 (cpu, Vd, i, aarch64_get_reg_u32 (cpu, Rs, NO_SP));
3014       break;
3015 
3016     case 8:
3017       if (!both)
3018 	HALT_NYI;
3019       aarch64_set_vec_u64 (cpu, Vd, 0, aarch64_get_reg_u64 (cpu, Rs, NO_SP));
3020       aarch64_set_vec_u64 (cpu, Vd, 1, aarch64_get_reg_u64 (cpu, Rs, NO_SP));
3021       break;
3022 
3023     default:
3024       HALT_NYI;
3025     }
3026 }
3027 
3028 static void
3029 do_vec_UZP (sim_cpu *cpu)
3030 {
3031   /* instr[31]    = 0
3032      instr[30]    = half(0)/full(1)
3033      instr[29,24] = 00 1110
3034      instr[23,22] = size: byte(00), half(01), word (10), long (11)
3035      instr[21]    = 0
3036      instr[20,16] = Vm
3037      instr[15]    = 0
3038      instr[14]    = lower (0) / upper (1)
3039      instr[13,10] = 0110
3040      instr[9,5]   = Vn
3041      instr[4,0]   = Vd.  */
3042 
3043   int full = INSTR (30, 30);
3044   int upper = INSTR (14, 14);
3045 
3046   unsigned vm = INSTR (20, 16);
3047   unsigned vn = INSTR (9, 5);
3048   unsigned vd = INSTR (4, 0);
3049 
3050   uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
3051   uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
3052   uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
3053   uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
3054 
3055   uint64_t val1;
3056   uint64_t val2;
3057 
3058   uint64_t input2 = full ? val_n2 : val_m1;
3059 
3060   NYI_assert (29, 24, 0x0E);
3061   NYI_assert (21, 21, 0);
3062   NYI_assert (15, 15, 0);
3063   NYI_assert (13, 10, 6);
3064 
3065   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3066   switch (INSTR (23, 22))
3067     {
3068     case 0:
3069       val1 = (val_n1 >> (upper * 8)) & 0xFFULL;
3070       val1 |= (val_n1 >> ((upper * 8) + 8)) & 0xFF00ULL;
3071       val1 |= (val_n1 >> ((upper * 8) + 16)) & 0xFF0000ULL;
3072       val1 |= (val_n1 >> ((upper * 8) + 24)) & 0xFF000000ULL;
3073 
3074       val1 |= (input2 << (32 - (upper * 8))) & 0xFF00000000ULL;
3075       val1 |= (input2 << (24 - (upper * 8))) & 0xFF0000000000ULL;
3076       val1 |= (input2 << (16 - (upper * 8))) & 0xFF000000000000ULL;
3077       val1 |= (input2 << (8 - (upper * 8))) & 0xFF00000000000000ULL;
3078 
3079       if (full)
3080 	{
3081 	  val2 = (val_m1 >> (upper * 8)) & 0xFFULL;
3082 	  val2 |= (val_m1 >> ((upper * 8) + 8)) & 0xFF00ULL;
3083 	  val2 |= (val_m1 >> ((upper * 8) + 16)) & 0xFF0000ULL;
3084 	  val2 |= (val_m1 >> ((upper * 8) + 24)) & 0xFF000000ULL;
3085 
3086 	  val2 |= (val_m2 << (32 - (upper * 8))) & 0xFF00000000ULL;
3087 	  val2 |= (val_m2 << (24 - (upper * 8))) & 0xFF0000000000ULL;
3088 	  val2 |= (val_m2 << (16 - (upper * 8))) & 0xFF000000000000ULL;
3089 	  val2 |= (val_m2 << (8 - (upper * 8))) & 0xFF00000000000000ULL;
3090 	}
3091       break;
3092 
3093     case 1:
3094       val1 = (val_n1 >> (upper * 16)) & 0xFFFFULL;
3095       val1 |= (val_n1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL;
3096 
3097       val1 |= (input2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;
3098       val1 |= (input2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL;
3099 
3100       if (full)
3101 	{
3102 	  val2 = (val_m1 >> (upper * 16)) & 0xFFFFULL;
3103 	  val2 |= (val_m1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL;
3104 
3105 	  val2 |= (val_m2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;
3106 	  val2 |= (val_m2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL;
3107 	}
3108       break;
3109 
3110     case 2:
3111       val1 = (val_n1 >> (upper * 32)) & 0xFFFFFFFF;
3112       val1 |= (input2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL;
3113 
3114       if (full)
3115 	{
3116 	  val2 = (val_m1 >> (upper * 32)) & 0xFFFFFFFF;
3117 	  val2 |= (val_m2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL;
3118 	}
3119       break;
3120 
3121     case 3:
3122       if (! full)
3123 	HALT_UNALLOC;
3124 
3125       val1 = upper ? val_n2 : val_n1;
3126       val2 = upper ? val_m2 : val_m1;
3127       break;
3128     }
3129 
3130   aarch64_set_vec_u64 (cpu, vd, 0, val1);
3131   if (full)
3132     aarch64_set_vec_u64 (cpu, vd, 1, val2);
3133 }
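
/* Editorial worked example: for UZP1 Vd.8B, Vn.8B, Vm.8B (half
   width, upper == 0) the code above keeps the even-numbered bytes
   of the concatenation Vm:Vn, so vd = { n0, n2, n4, n6, m0, m2,
   m4, m6 }; UZP2 (upper == 1) keeps the odd-numbered bytes.  */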
3134 
3135 static void
3136 do_vec_ZIP (sim_cpu *cpu)
3137 {
3138   /* instr[31]    = 0
3139      instr[30]    = half(0)/full(1)
3140      instr[29,24] = 00 1110
3141      instr[23,22] = size: byte(00), half(01), word (10), long (11)
3142      instr[21]    = 0
3143      instr[20,16] = Vm
3144      instr[15]    = 0
3145      instr[14]    = lower (0) / upper (1)
3146      instr[13,10] = 1110
3147      instr[9,5]   = Vn
3148      instr[4,0]   = Vd.  */
3149 
3150   int full = INSTR (30, 30);
3151   int upper = INSTR (14, 14);
3152 
3153   unsigned vm = INSTR (20, 16);
3154   unsigned vn = INSTR (9, 5);
3155   unsigned vd = INSTR (4, 0);
3156 
3157   uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
3158   uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
3159   uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
3160   uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
3161 
3162   uint64_t val1 = 0;
3163   uint64_t val2 = 0;
3164 
3165   uint64_t input1 = upper ? val_n2 : val_n1;
3166   uint64_t input2 = upper ? val_m2 : val_m1;
3167 
3168   NYI_assert (29, 24, 0x0E);
3169   NYI_assert (21, 21, 0);
3170   NYI_assert (15, 15, 0);
3171   NYI_assert (13, 10, 0xE);
3172 
3173   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3174   switch (INSTR (23, 22))
3175     {
3176     case 0:
3177       val1 =
3178 	  ((input1 <<  0) & (0xFF    <<  0))
3179 	| ((input2 <<  8) & (0xFF    <<  8))
3180 	| ((input1 <<  8) & (0xFF    << 16))
3181 	| ((input2 << 16) & (0xFFU   << 24))
3182 	| ((input1 << 16) & (0xFFULL << 32))
3183 	| ((input2 << 24) & (0xFFULL << 40))
3184 	| ((input1 << 24) & (0xFFULL << 48))
3185 	| ((input2 << 32) & (0xFFULL << 56));
3186 
3187       val2 =
3188 	  ((input1 >> 32) & (0xFF    <<  0))
3189 	| ((input2 >> 24) & (0xFF    <<  8))
3190 	| ((input1 >> 24) & (0xFF    << 16))
3191 	| ((input2 >> 16) & (0xFFU   << 24))
3192 	| ((input1 >> 16) & (0xFFULL << 32))
3193 	| ((input2 >>  8) & (0xFFULL << 40))
3194 	| ((input1 >>  8) & (0xFFULL << 48))
3195 	| ((input2 >>  0) & (0xFFULL << 56));
3196       break;
3197 
3198     case 1:
3199       val1 =
3200 	  ((input1 <<  0) & (0xFFFF    <<  0))
3201 	| ((input2 << 16) & (0xFFFFU   << 16))
3202 	| ((input1 << 16) & (0xFFFFULL << 32))
3203 	| ((input2 << 32) & (0xFFFFULL << 48));
3204 
3205       val2 =
3206 	  ((input1 >> 32) & (0xFFFF    <<  0))
3207 	| ((input2 >> 16) & (0xFFFFU   << 16))
3208 	| ((input1 >> 16) & (0xFFFFULL << 32))
3209 	| ((input2 >>  0) & (0xFFFFULL << 48));
3210       break;
3211 
3212     case 2:
3213       val1 = (input1 & 0xFFFFFFFFULL) | (input2 << 32);
3214       val2 = (input1 >> 32) | (input2 & 0xFFFFFFFF00000000ULL);
3215       break;
3216 
3217     case 3:
3218       val1 = input1;
3219       val2 = input2;
3220       break;
3221     }
3222 
3223   aarch64_set_vec_u64 (cpu, vd, 0, val1);
3224   if (full)
3225     aarch64_set_vec_u64 (cpu, vd, 1, val2);
3226 }
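
/* Editorial worked example: ZIP1 Vd.8B, Vn.8B, Vm.8B interleaves
   the lower halves of the two sources, giving
   vd = { n0, m0, n1, m1, n2, m2, n3, m3 }; ZIP2 does the same with
   the upper halves of Vn and Vm.  */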
3227 
3228 /* Floating point immediates are encoded in 8 bits.
3229    fpimm[7] = sign bit.
3230    fpimm[6:4] = signed exponent.
3231    fpimm[3:0] = fraction (assuming leading 1).
3232    i.e. F = s * 1.f * 2^(e - b).  */
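
/* Editorial worked example, following the expansion coded below:
   imm8 == 0x70 has s == 0, e == 7, f == 0, so u starts at
   16/16 == 1.0 and, e being >= 4, is divided by 2 (7 - 7) == 0
   times: the encoding of FMOV #1.0.  imm8 == 0x30 (e == 3) doubles
   1.0 four times instead, giving #16.0.  */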
3233 
3234 static float
3235 fp_immediate_for_encoding_32 (uint32_t imm8)
3236 {
3237   float u;
3238   uint32_t s, e, f, i;
3239 
3240   s = (imm8 >> 7) & 0x1;
3241   e = (imm8 >> 4) & 0x7;
3242   f = imm8 & 0xf;
3243 
3244   /* The fp value is s * n/16 * 2^r where n is 16+f.  */
3245   u = (16.0 + f) / 16.0;
3246 
3247   /* N.B. exponent is signed.  */
3248   if (e < 4)
3249     {
3250       int epos = e;
3251 
3252       for (i = 0; i <= epos; i++)
3253 	u *= 2.0;
3254     }
3255   else
3256     {
3257       int eneg = 7 - e;
3258 
3259       for (i = 0; i < eneg; i++)
3260 	u /= 2.0;
3261     }
3262 
3263   if (s)
3264     u = - u;
3265 
3266   return u;
3267 }
3268 
3269 static double
3270 fp_immediate_for_encoding_64 (uint32_t imm8)
3271 {
3272   double u;
3273   uint32_t s, e, f, i;
3274 
3275   s = (imm8 >> 7) & 0x1;
3276   e = (imm8 >> 4) & 0x7;
3277   f = imm8 & 0xf;
3278 
3279   /* The fp value is s * n/16 * 2^r where n is 16+f.  */
3280   u = (16.0 + f) / 16.0;
3281 
3282   /* N.B. exponent is signed.  */
3283   if (e < 4)
3284     {
3285       int epos = e;
3286 
3287       for (i = 0; i <= epos; i++)
3288 	u *= 2.0;
3289     }
3290   else
3291     {
3292       int eneg = 7 - e;
3293 
3294       for (i = 0; i < eneg; i++)
3295 	u /= 2.0;
3296     }
3297 
3298   if (s)
3299     u = - u;
3300 
3301   return u;
3302 }
3303 
3304 static void
3305 do_vec_MOV_immediate (sim_cpu *cpu)
3306 {
3307   /* instr[31]    = 0
3308      instr[30]    = full/half selector
3309      instr[29,19] = 00111100000
3310      instr[18,16] = high 3 bits of uimm8
3311      instr[15,12] = size & shift:
3312                                   0000 => 32-bit
3313                                   0010 => 32-bit + LSL#8
3314                                   0100 => 32-bit + LSL#16
3315                                   0110 => 32-bit + LSL#24
3316                                   1010 => 16-bit + LSL#8
3317                                   1000 => 16-bit
3318                                   1101 => 32-bit + MSL#16
3319                                   1100 => 32-bit + MSL#8
3320                                   1110 => 8-bit
3321                                   1111 => double
3322      instr[11,10] = 01
3323      instr[9,5]   = low 5-bits of uimm8
3324      instr[4,0]   = Vd.  */
3325 
3326   int full     = INSTR (30, 30);
3327   unsigned vd  = INSTR (4, 0);
3328   unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
3329   unsigned i;
3330 
3331   NYI_assert (29, 19, 0x1E0);
3332   NYI_assert (11, 10, 1);
3333 
3334   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3335   switch (INSTR (15, 12))
3336     {
3337     case 0x0: /* 32-bit, no shift.  */
3338     case 0x2: /* 32-bit, shift by 8.  */
3339     case 0x4: /* 32-bit, shift by 16.  */
3340     case 0x6: /* 32-bit, shift by 24.  */
3341       val <<= (8 * INSTR (14, 13));
3342       for (i = 0; i < (full ? 4 : 2); i++)
3343 	aarch64_set_vec_u32 (cpu, vd, i, val);
3344       break;
3345 
3346     case 0xa: /* 16-bit, shift by 8.  */
3347       val <<= 8;
3348       /* Fall through.  */
3349     case 0x8: /* 16-bit, no shift.  */
3350       for (i = 0; i < (full ? 8 : 4); i++)
3351 	aarch64_set_vec_u16 (cpu, vd, i, val);
3352       break;
3353 
3354     case 0xd: /* 32-bit, mask shift by 16.  */
3355       val <<= 8;
3356       val |= 0xFF;
3357       /* Fall through.  */
3358     case 0xc: /* 32-bit, mask shift by 8. */
3359       val <<= 8;
3360       val |= 0xFF;
3361       for (i = 0; i < (full ? 4 : 2); i++)
3362 	aarch64_set_vec_u32 (cpu, vd, i, val);
3363       break;
3364 
3365     case 0xe: /* 8-bit, no shift.  */
3366       for (i = 0; i < (full ? 16 : 8); i++)
3367 	aarch64_set_vec_u8 (cpu, vd, i, val);
3368       break;
3369 
3370     case 0xf: /* FMOV Vs.{2|4}S, #fpimm.  */
3371       {
3372 	float u = fp_immediate_for_encoding_32 (val);
3373 	for (i = 0; i < (full ? 4 : 2); i++)
3374 	  aarch64_set_vec_float (cpu, vd, i, u);
3375 	break;
3376       }
3377 
3378     default:
3379       HALT_NYI;
3380     }
3381 }
3382 
3383 static void
3384 do_vec_MVNI (sim_cpu *cpu)
3385 {
3386   /* instr[31]    = 0
3387      instr[30]    = full/half selector
3388      instr[29,19] = 10111100000
3389      instr[18,16] = high 3 bits of uimm8
3390      instr[15,12] = selector
3391      instr[11,10] = 01
3392      instr[9,5]   = low 5-bits of uimm8
3393      instr[4,0]   = Vd.  */
3394 
3395   int full     = INSTR (30, 30);
3396   unsigned vd  = INSTR (4, 0);
3397   unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
3398   unsigned i;
3399 
3400   NYI_assert (29, 19, 0x5E0);
3401   NYI_assert (11, 10, 1);
3402 
3403   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3404   switch (INSTR (15, 12))
3405     {
3406     case 0x0: /* 32-bit, no shift.  */
3407     case 0x2: /* 32-bit, shift by 8.  */
3408     case 0x4: /* 32-bit, shift by 16.  */
3409     case 0x6: /* 32-bit, shift by 24.  */
3410       val <<= (8 * INSTR (14, 13));
3411       val = ~ val;
3412       for (i = 0; i < (full ? 4 : 2); i++)
3413 	aarch64_set_vec_u32 (cpu, vd, i, val);
3414       return;
3415 
3416     case 0xa: /* 16-bit, 8 bit shift. */
3417       val <<= 8;	/* Fall through.  */
3418     case 0x8: /* 16-bit, no shift. */
3419       val = ~ val;
3420       for (i = 0; i < (full ? 8 : 4); i++)
3421 	aarch64_set_vec_u16 (cpu, vd, i, val);
3422       return;
3423 
3424     case 0xd: /* 32-bit, mask shift by 16.  */
3425       val <<= 8;
3426       val |= 0xFF;	/* Fall through.  */
3427     case 0xc: /* 32-bit, mask shift by 8. */
3428       val <<= 8;
3429       val |= 0xFF;
3430       val = ~ val;
3431       for (i = 0; i < (full ? 4 : 2); i++)
3432 	aarch64_set_vec_u32 (cpu, vd, i, val);
3433       return;
3434 
3435     case 0xE: /* MOVI Dn, #mask64 */
3436       {
3437 	uint64_t mask = 0;
3438 
3439 	for (i = 0; i < 8; i++)
3440 	  if (val & (1 << i))
3441 	    mask |= (0xFFULL << (i * 8));
3442 	aarch64_set_vec_u64 (cpu, vd, 0, mask);
3443 	aarch64_set_vec_u64 (cpu, vd, 1, mask);
3444 	return;
3445       }
3446 
3447     case 0xf: /* FMOV Vd.2D, #fpimm.  */
3448       {
3449 	double u = fp_immediate_for_encoding_64 (val);
3450 
3451 	if (! full)
3452 	  HALT_UNALLOC;
3453 
3454 	aarch64_set_vec_double (cpu, vd, 0, u);
3455 	aarch64_set_vec_double (cpu, vd, 1, u);
3456 	return;
3457       }
3458 
3459     default:
3460       HALT_NYI;
3461     }
3462 }
3463 
3464 #define ABS(A) ((A) < 0 ? - (A) : (A))
3465 
3466 static void
3467 do_vec_ABS (sim_cpu *cpu)
3468 {
3469   /* instr[31]    = 0
3470      instr[30]    = half(0)/full(1)
3471      instr[29,24] = 00 1110
3472      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3473      instr[21,10] = 10 0000 1011 10
3474      instr[9,5]   = Vn
3475      instr[4,0]   = Vd.  */
3476 
3477   unsigned vn = INSTR (9, 5);
3478   unsigned vd = INSTR (4, 0);
3479   unsigned full = INSTR (30, 30);
3480   unsigned i;
3481 
3482   NYI_assert (29, 24, 0x0E);
3483   NYI_assert (21, 10, 0x82E);
3484 
3485   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3486   switch (INSTR (23, 22))
3487     {
3488     case 0:
3489       for (i = 0; i < (full ? 16 : 8); i++)
3490 	aarch64_set_vec_s8 (cpu, vd, i,
3491 			    ABS (aarch64_get_vec_s8 (cpu, vn, i)));
3492       break;
3493 
3494     case 1:
3495       for (i = 0; i < (full ? 8 : 4); i++)
3496 	aarch64_set_vec_s16 (cpu, vd, i,
3497 			     ABS (aarch64_get_vec_s16 (cpu, vn, i)));
3498       break;
3499 
3500     case 2:
3501       for (i = 0; i < (full ? 4 : 2); i++)
3502 	aarch64_set_vec_s32 (cpu, vd, i,
3503 			     ABS (aarch64_get_vec_s32 (cpu, vn, i)));
3504       break;
3505 
3506     case 3:
3507       if (! full)
3508 	HALT_NYI;
3509       for (i = 0; i < 2; i++)
3510 	aarch64_set_vec_s64 (cpu, vd, i,
3511 			     ABS (aarch64_get_vec_s64 (cpu, vn, i)));
3512       break;
3513     }
3514 }
3515 
3516 static void
3517 do_vec_ADDV (sim_cpu *cpu)
3518 {
3519   /* instr[31]    = 0
3520      instr[30]    = full/half selector
3521      instr[29,24] = 00 1110
3522      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3523      instr[21,10] = 11 0001 1011 10
3524      instr[9,5]   = Vm
3525      instr[4,0]   = Rd.  */
3526 
3527   unsigned vm = INSTR (9, 5);
3528   unsigned rd = INSTR (4, 0);
3529   unsigned i;
3530   int      full = INSTR (30, 30);
3531 
3532   NYI_assert (29, 24, 0x0E);
3533   NYI_assert (21, 10, 0xC6E);
3534 
3535   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3536   switch (INSTR (23, 22))
3537     {
3538     case 0:
3539       {
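	/* The 8-bit accumulator is intentional: ADDV on byte elements
	   produces a byte result, so the sum wraps modulo 256.  */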
3540 	uint8_t val = 0;
3541 	for (i = 0; i < (full ? 16 : 8); i++)
3542 	  val += aarch64_get_vec_u8 (cpu, vm, i);
3543 	aarch64_set_vec_u64 (cpu, rd, 0, val);
3544 	return;
3545       }
3546 
3547     case 1:
3548       {
3549 	uint16_t val = 0;
3550 	for (i = 0; i < (full ? 8 : 4); i++)
3551 	  val += aarch64_get_vec_u16 (cpu, vm, i);
3552 	aarch64_set_vec_u64 (cpu, rd, 0, val);
3553 	return;
3554       }
3555 
3556     case 2:
3557       {
3558 	uint32_t val = 0;
3559 	if (! full)
3560 	  HALT_UNALLOC;
3561 	for (i = 0; i < 4; i++)
3562 	  val += aarch64_get_vec_u32 (cpu, vm, i);
3563 	aarch64_set_vec_u64 (cpu, rd, 0, val);
3564 	return;
3565       }
3566 
3567     case 3:
3568       HALT_UNALLOC;
3569     }
3570 }
3571 
3572 static void
3573 do_vec_ins_2 (sim_cpu *cpu)
3574 {
3575   /* instr[31,21] = 01001110000
3576      instr[20,18] = size & element selector
3577      instr[17,14] = 0000
3578      instr[13]    = direction: to vec(0), from vec (1)
3579      instr[12,10] = 111
3580      instr[9,5]   = Vm
3581      instr[4,0]   = Vd.  */
3582 
3583   unsigned elem;
3584   unsigned vm = INSTR (9, 5);
3585   unsigned vd = INSTR (4, 0);
3586 
3587   NYI_assert (31, 21, 0x270);
3588   NYI_assert (17, 14, 0);
3589   NYI_assert (12, 10, 7);
3590 
3591   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3592   if (INSTR (13, 13) == 1)
3593     {
3594       if (INSTR (18, 18) == 1)
3595 	{
3596 	  /* 32-bit moves.  */
3597 	  elem = INSTR (20, 19);
3598 	  aarch64_set_reg_u64 (cpu, vd, NO_SP,
3599 			       aarch64_get_vec_u32 (cpu, vm, elem));
3600 	}
3601       else
3602 	{
3603 	  /* 64-bit moves.  */
3604 	  if (INSTR (19, 19) != 1)
3605 	    HALT_NYI;
3606 
3607 	  elem = INSTR (20, 20);
3608 	  aarch64_set_reg_u64 (cpu, vd, NO_SP,
3609 			       aarch64_get_vec_u64 (cpu, vm, elem));
3610 	}
3611     }
3612   else
3613     {
3614       if (INSTR (18, 18) == 1)
3615 	{
3616 	  /* 32-bit moves.  */
3617 	  elem = INSTR (20, 19);
3618 	  aarch64_set_vec_u32 (cpu, vd, elem,
3619 			       aarch64_get_reg_u32 (cpu, vm, NO_SP));
3620 	}
3621       else
3622 	{
3623 	  /* 64-bit moves.  */
3624 	  if (INSTR (19, 19) != 1)
3625 	    HALT_NYI;
3626 
3627 	  elem = INSTR (20, 20);
3628 	  aarch64_set_vec_u64 (cpu, vd, elem,
3629 			       aarch64_get_reg_u64 (cpu, vm, NO_SP));
3630 	}
3631     }
3632 }
3633 
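/* Helper for the vector multiply instructions below.  All source
   elements (starting at element BIAS) are read into temporaries before
   any result is written, so the expansion is safe even when Vd
   overlaps Vn or Vm.  */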
3634 #define DO_VEC_WIDENING_MUL(N, DST_TYPE, READ_TYPE, WRITE_TYPE)	  \
3635   do								  \
3636     {								  \
3637       DST_TYPE a[N], b[N];					  \
3638 								  \
3639       for (i = 0; i < (N); i++)					  \
3640 	{							  \
3641 	  a[i] = aarch64_get_vec_##READ_TYPE (cpu, vn, i + bias); \
3642 	  b[i] = aarch64_get_vec_##READ_TYPE (cpu, vm, i + bias); \
3643 	}							  \
3644       for (i = 0; i < (N); i++)					  \
3645 	aarch64_set_vec_##WRITE_TYPE (cpu, vd, i, a[i] * b[i]);	  \
3646     }								  \
3647   while (0)
3648 
3649 static void
3650 do_vec_mull (sim_cpu *cpu)
3651 {
3652   /* instr[31]    = 0
3653      instr[30]    = lower(0)/upper(1) selector
3654      instr[29]    = signed(0)/unsigned(1)
3655      instr[28,24] = 0 1110
3656      instr[23,22] = size: 8-bit (00), 16-bit (01), 32-bit (10)
3657      instr[21]    = 1
3658      instr[20,16] = Vm
3659      instr[15,10] = 11 0000
3660      instr[9,5]   = Vn
3661      instr[4,0]   = Vd.  */
3662 
3663   int    unsign = INSTR (29, 29);
3664   int    bias = INSTR (30, 30);
3665   unsigned vm = INSTR (20, 16);
3666   unsigned vn = INSTR ( 9,  5);
3667   unsigned vd = INSTR ( 4,  0);
3668   unsigned i;
3669 
3670   NYI_assert (28, 24, 0x0E);
3671   NYI_assert (15, 10, 0x30);
3672 
3673   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3674   /* NB: Read source values before writing results, in case
3675      the source and destination vectors are the same.  */
3676   switch (INSTR (23, 22))
3677     {
3678     case 0:
3679       if (bias)
3680 	bias = 8;
3681       if (unsign)
3682 	DO_VEC_WIDENING_MUL (8, uint16_t, u8, u16);
3683       else
3684 	DO_VEC_WIDENING_MUL (8, int16_t, s8, s16);
3685       return;
3686 
3687     case 1:
3688       if (bias)
3689 	bias = 4;
3690       if (unsign)
3691 	DO_VEC_WIDENING_MUL (4, uint32_t, u16, u32);
3692       else
3693 	DO_VEC_WIDENING_MUL (4, int32_t, s16, s32);
3694       return;
3695 
3696     case 2:
3697       if (bias)
3698 	bias = 2;
3699       if (unsign)
3700 	DO_VEC_WIDENING_MUL (2, uint64_t, u32, u64);
3701       else
3702 	DO_VEC_WIDENING_MUL (2, int64_t, s32, s64);
3703       return;
3704 
3705     case 3:
3706       HALT_NYI;
3707     }
3708 }
3709 
3710 static void
3711 do_vec_fadd (sim_cpu *cpu)
3712 {
3713   /* instr[31]    = 0
3714      instr[30]    = half(0)/full(1)
3715      instr[29,24] = 001110
3716      instr[23]    = FADD(0)/FSUB(1)
3717      instr[22]    = float (0)/double(1)
3718      instr[21]    = 1
3719      instr[20,16] = Vm
3720      instr[15,10] = 110101
3721      instr[9,5]   = Vn
3722      instr[4,0]   = Vd.  */
3723 
3724   unsigned vm = INSTR (20, 16);
3725   unsigned vn = INSTR (9, 5);
3726   unsigned vd = INSTR (4, 0);
3727   unsigned i;
3728   int      full = INSTR (30, 30);
3729 
3730   NYI_assert (29, 24, 0x0E);
3731   NYI_assert (21, 21, 1);
3732   NYI_assert (15, 10, 0x35);
3733 
3734   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3735   if (INSTR (23, 23))
3736     {
3737       if (INSTR (22, 22))
3738 	{
3739 	  if (! full)
3740 	    HALT_NYI;
3741 
3742 	  for (i = 0; i < 2; i++)
3743 	    aarch64_set_vec_double (cpu, vd, i,
3744 				    aarch64_get_vec_double (cpu, vn, i)
3745 				    - aarch64_get_vec_double (cpu, vm, i));
3746 	}
3747       else
3748 	{
3749 	  for (i = 0; i < (full ? 4 : 2); i++)
3750 	    aarch64_set_vec_float (cpu, vd, i,
3751 				   aarch64_get_vec_float (cpu, vn, i)
3752 				   - aarch64_get_vec_float (cpu, vm, i));
3753 	}
3754     }
3755   else
3756     {
3757       if (INSTR (22, 22))
3758 	{
3759 	  if (! full)
3760 	    HALT_NYI;
3761 
3762 	  for (i = 0; i < 2; i++)
3763 	    aarch64_set_vec_double (cpu, vd, i,
3764 				    aarch64_get_vec_double (cpu, vm, i)
3765 				    + aarch64_get_vec_double (cpu, vn, i));
3766 	}
3767       else
3768 	{
3769 	  for (i = 0; i < (full ? 4 : 2); i++)
3770 	    aarch64_set_vec_float (cpu, vd, i,
3771 				   aarch64_get_vec_float (cpu, vm, i)
3772 				   + aarch64_get_vec_float (cpu, vn, i));
3773 	}
3774     }
3775 }
3776 
3777 static void
3778 do_vec_add (sim_cpu *cpu)
3779 {
3780   /* instr[31]    = 0
3781      instr[30]    = full/half selector
3782      instr[29,24] = 001110
3783      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3784      instr[21]    = 1
3785      instr[20,16] = Vm
3786      instr[15,10] = 100001
3787      instr[9,5]   = Vn
3788      instr[4,0]   = Vd.  */
3789 
3790   unsigned vm = INSTR (20, 16);
3791   unsigned vn = INSTR (9, 5);
3792   unsigned vd = INSTR (4, 0);
3793   unsigned i;
3794   int      full = INSTR (30, 30);
3795 
3796   NYI_assert (29, 24, 0x0E);
3797   NYI_assert (21, 21, 1);
3798   NYI_assert (15, 10, 0x21);
3799 
3800   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3801   switch (INSTR (23, 22))
3802     {
3803     case 0:
3804       for (i = 0; i < (full ? 16 : 8); i++)
3805 	aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
3806 			    + aarch64_get_vec_u8 (cpu, vm, i));
3807       return;
3808 
3809     case 1:
3810       for (i = 0; i < (full ? 8 : 4); i++)
3811 	aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
3812 			     + aarch64_get_vec_u16 (cpu, vm, i));
3813       return;
3814 
3815     case 2:
3816       for (i = 0; i < (full ? 4 : 2); i++)
3817 	aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
3818 			     + aarch64_get_vec_u32 (cpu, vm, i));
3819       return;
3820 
3821     case 3:
3822       if (! full)
3823 	HALT_UNALLOC;
3824       aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vn, 0)
3825 			   + aarch64_get_vec_u64 (cpu, vm, 0));
3826       aarch64_set_vec_u64 (cpu, vd, 1,
3827 			   aarch64_get_vec_u64 (cpu, vn, 1)
3828 			   + aarch64_get_vec_u64 (cpu, vm, 1));
3829       return;
3830     }
3831 }
3832 
3833 static void
3834 do_vec_mul (sim_cpu *cpu)
3835 {
3836   /* instr[31]    = 0
3837      instr[30]    = full/half selector
3838      instr[29,24] = 00 1110
3839      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
3840      instr[21]    = 1
3841      instr[20,16] = Vm
3842      instr[15,10] = 10 0111
3843      instr[9,5]   = Vn
3844      instr[4,0]   = Vd.  */
3845 
3846   unsigned vm = INSTR (20, 16);
3847   unsigned vn = INSTR (9, 5);
3848   unsigned vd = INSTR (4, 0);
3849   unsigned i;
3850   int      full = INSTR (30, 30);
3851   int      bias = 0;
3852 
3853   NYI_assert (29, 24, 0x0E);
3854   NYI_assert (21, 21, 1);
3855   NYI_assert (15, 10, 0x27);
3856 
3857   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
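  /* Plain MUL reuses the widening-multiply helper with identical
     source and destination types; products simply wrap to the
     element width.  */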
3858   switch (INSTR (23, 22))
3859     {
3860     case 0:
3861       DO_VEC_WIDENING_MUL (full ? 16 : 8, uint8_t, u8, u8);
3862       return;
3863 
3864     case 1:
3865       DO_VEC_WIDENING_MUL (full ? 8 : 4, uint16_t, u16, u16);
3866       return;
3867 
3868     case 2:
3869       DO_VEC_WIDENING_MUL (full ? 4 : 2, uint32_t, u32, u32);
3870       return;
3871 
3872     case 3:
3873       HALT_UNALLOC;
3874     }
3875 }
3876 
3877 static void
3878 do_vec_MLA (sim_cpu *cpu)
3879 {
3880   /* instr[31]    = 0
3881      instr[30]    = full/half selector
3882      instr[29,24] = 00 1110
3883      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
3884      instr[21]    = 1
3885      instr[20,16] = Vm
3886      instr[15,10] = 1001 01
3887      instr[9,5]   = Vn
3888      instr[4,0]   = Vd.  */
3889 
3890   unsigned vm = INSTR (20, 16);
3891   unsigned vn = INSTR (9, 5);
3892   unsigned vd = INSTR (4, 0);
3893   unsigned i;
3894   int      full = INSTR (30, 30);
3895 
3896   NYI_assert (29, 24, 0x0E);
3897   NYI_assert (21, 21, 1);
3898   NYI_assert (15, 10, 0x25);
3899 
3900   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3901   switch (INSTR (23, 22))
3902     {
3903     case 0:
3904       for (i = 0; i < (full ? 16 : 8); i++)
3905 	aarch64_set_vec_u8 (cpu, vd, i,
3906 			    aarch64_get_vec_u8 (cpu, vd, i)
3907 			    + (aarch64_get_vec_u8 (cpu, vn, i)
3908 			       * aarch64_get_vec_u8 (cpu, vm, i)));
3909       return;
3910 
3911     case 1:
3912       for (i = 0; i < (full ? 8 : 4); i++)
3913 	aarch64_set_vec_u16 (cpu, vd, i,
3914 			     aarch64_get_vec_u16 (cpu, vd, i)
3915 			     + (aarch64_get_vec_u16 (cpu, vn, i)
3916 				* aarch64_get_vec_u16 (cpu, vm, i)));
3917       return;
3918 
3919     case 2:
3920       for (i = 0; i < (full ? 4 : 2); i++)
3921 	aarch64_set_vec_u32 (cpu, vd, i,
3922 			     aarch64_get_vec_u32 (cpu, vd, i)
3923 			     + (aarch64_get_vec_u32 (cpu, vn, i)
3924 				* aarch64_get_vec_u32 (cpu, vm, i)));
3925       return;
3926 
3927     default:
3928       HALT_UNALLOC;
3929     }
3930 }
3931 
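/* Helpers implementing FMAXNM/FMINNM semantics: if exactly one operand
   is a NaN, the other operand is returned; a NaN is produced only when
   both operands are NaNs.  */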
3932 static float
3933 fmaxnm (float a, float b)
3934 {
3935   if (! isnan (a))
3936     {
3937       if (! isnan (b))
3938 	return a > b ? a : b;
3939       return a;
3940     }
3941   else if (! isnan (b))
3942     return b;
3943   return a;
3944 }
3945 
3946 static float
3947 fminnm (float a, float b)
3948 {
3949   if (! isnan (a))
3950     {
3951       if (! isnan (b))
3952 	return a < b ? a : b;
3953       return a;
3954     }
3955   else if (! isnan (b))
3956     return b;
3957   return a;
3958 }
3959 
3960 static double
3961 dmaxnm (double a, double b)
3962 {
3963   if (! isnan (a))
3964     {
3965       if (! isnan (b))
3966 	return a > b ? a : b;
3967       return a;
3968     }
3969   else if (! isnan (b))
3970     return b;
3971   return a;
3972 }
3973 
3974 static double
3975 dminnm (double a, double b)
3976 {
3977   if (! isnan (a))
3978     {
3979       if (! isnan (b))
3980 	return a < b ? a : b;
3981       return a;
3982     }
3983   else if (! isnan (b))
3984     return b;
3985   return a;
3986 }
3987 
3988 static void
3989 do_vec_FminmaxNMP (sim_cpu *cpu)
3990 {
3991   /* instr [31]    = 0
3992      instr [30]    = half (0)/full (1)
3993      instr [29,24] = 10 1110
3994      instr [23]    = max(0)/min(1)
3995      instr [22]    = float (0)/double (1)
3996      instr [21]    = 1
3997      instr [20,16] = Vm
3998      instr [15,10] = 1100 01
3999      instr [9,5]   = Vn
4000      instr [4,0]   = Vd.  */
4001 
4002   unsigned vm = INSTR (20, 16);
4003   unsigned vn = INSTR (9, 5);
4004   unsigned vd = INSTR (4, 0);
4005   int      full = INSTR (30, 30);
4006 
4007   NYI_assert (29, 24, 0x2E);
4008   NYI_assert (21, 21, 1);
4009   NYI_assert (15, 10, 0x31);
4010 
4011   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4012   if (INSTR (22, 22))
4013     {
4014       double (* fn)(double, double) = INSTR (23, 23)
4015 	? dminnm : dmaxnm;
4016 
4017       if (! full)
4018 	HALT_NYI;
4019       aarch64_set_vec_double (cpu, vd, 0,
4020 			      fn (aarch64_get_vec_double (cpu, vn, 0),
4021 				  aarch64_get_vec_double (cpu, vn, 1)));
4022       aarch64_set_vec_double (cpu, vd, 1,
4023 			      fn (aarch64_get_vec_double (cpu, vm, 0),
4024 				  aarch64_get_vec_double (cpu, vm, 1)));
4025     }
4026   else
4027     {
4028       float (* fn)(float, float) = INSTR (23, 23)
4029 	? fminnm : fmaxnm;
4030 
4031       aarch64_set_vec_float (cpu, vd, 0,
4032 			     fn (aarch64_get_vec_float (cpu, vn, 0),
4033 				 aarch64_get_vec_float (cpu, vn, 1)));
4034       if (full)
4035 	aarch64_set_vec_float (cpu, vd, 1,
4036 			       fn (aarch64_get_vec_float (cpu, vn, 2),
4037 				   aarch64_get_vec_float (cpu, vn, 3)));
4038 
4039       aarch64_set_vec_float (cpu, vd, (full ? 2 : 1),
4040 			     fn (aarch64_get_vec_float (cpu, vm, 0),
4041 				 aarch64_get_vec_float (cpu, vm, 1)));
4042       if (full)
4043 	aarch64_set_vec_float (cpu, vd, 3,
4044 			       fn (aarch64_get_vec_float (cpu, vm, 2),
4045 				   aarch64_get_vec_float (cpu, vm, 3)));
4046     }
4047 }
4048 
4049 static void
4050 do_vec_AND (sim_cpu *cpu)
4051 {
4052   /* instr[31]    = 0
4053      instr[30]    = half (0)/full (1)
4054      instr[29,21] = 001110001
4055      instr[20,16] = Vm
4056      instr[15,10] = 000111
4057      instr[9,5]   = Vn
4058      instr[4,0]   = Vd.  */
4059 
4060   unsigned vm = INSTR (20, 16);
4061   unsigned vn = INSTR (9, 5);
4062   unsigned vd = INSTR (4, 0);
4063   unsigned i;
4064   int      full = INSTR (30, 30);
4065 
4066   NYI_assert (29, 21, 0x071);
4067   NYI_assert (15, 10, 0x07);
4068 
4069   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4070   for (i = 0; i < (full ? 4 : 2); i++)
4071     aarch64_set_vec_u32 (cpu, vd, i,
4072 			 aarch64_get_vec_u32 (cpu, vn, i)
4073 			 & aarch64_get_vec_u32 (cpu, vm, i));
4074 }
4075 
4076 static void
4077 do_vec_BSL (sim_cpu *cpu)
4078 {
4079   /* instr[31]    = 0
4080      instr[30]    = half (0)/full (1)
4081      instr[29,21] = 101110011
4082      instr[20,16] = Vm
4083      instr[15,10] = 000111
4084      instr[9,5]   = Vn
4085      instr[4,0]   = Vd.  */
4086 
4087   unsigned vm = INSTR (20, 16);
4088   unsigned vn = INSTR (9, 5);
4089   unsigned vd = INSTR (4, 0);
4090   unsigned i;
4091   int      full = INSTR (30, 30);
4092 
4093   NYI_assert (29, 21, 0x173);
4094   NYI_assert (15, 10, 0x07);
4095 
4096   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
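  /* Bitwise select: each result bit is taken from Vn where the
     corresponding bit of Vd (the mask) is set, and from Vm where it
     is clear.  */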
4097   for (i = 0; i < (full ? 16 : 8); i++)
4098     aarch64_set_vec_u8 (cpu, vd, i,
4099 			(    aarch64_get_vec_u8 (cpu, vd, i)
4100 			   & aarch64_get_vec_u8 (cpu, vn, i))
4101 			| ((~ aarch64_get_vec_u8 (cpu, vd, i))
4102 			   & aarch64_get_vec_u8 (cpu, vm, i)));
4103 }
4104 
4105 static void
4106 do_vec_EOR (sim_cpu *cpu)
4107 {
4108   /* instr[31]    = 0
4109      instr[30]    = half (0)/full (1)
4110      instr[29,21] = 10 1110 001
4111      instr[20,16] = Vm
4112      instr[15,10] = 000111
4113      instr[9,5]   = Vn
4114      instr[4,0]   = Vd.  */
4115 
4116   unsigned vm = INSTR (20, 16);
4117   unsigned vn = INSTR (9, 5);
4118   unsigned vd = INSTR (4, 0);
4119   unsigned i;
4120   int      full = INSTR (30, 30);
4121 
4122   NYI_assert (29, 21, 0x171);
4123   NYI_assert (15, 10, 0x07);
4124 
4125   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4126   for (i = 0; i < (full ? 4 : 2); i++)
4127     aarch64_set_vec_u32 (cpu, vd, i,
4128 			 aarch64_get_vec_u32 (cpu, vn, i)
4129 			 ^ aarch64_get_vec_u32 (cpu, vm, i));
4130 }
4131 
4132 static void
4133 do_vec_bit (sim_cpu *cpu)
4134 {
4135   /* instr[31]    = 0
4136      instr[30]    = half (0)/full (1)
4137      instr[29,23] = 10 1110 1
4138      instr[22]    = BIT (0) / BIF (1)
4139      instr[21]    = 1
4140      instr[20,16] = Vm
4141      instr[15,10] = 0001 11
4142      instr[9,5]   = Vn
4143      instr[4,0]   = Vd.  */
4144 
4145   unsigned vm = INSTR (20, 16);
4146   unsigned vn = INSTR (9, 5);
4147   unsigned vd = INSTR (4, 0);
4148   unsigned full = INSTR (30, 30);
4149   unsigned test_false = INSTR (22, 22);
4150   unsigned i;
4151 
4152   NYI_assert (29, 23, 0x5D);
4153   NYI_assert (21, 21, 1);
4154   NYI_assert (15, 10, 0x07);
4155 
4156   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
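  /* BIT copies bits from Vn where the corresponding Vm bit is set;
     BIF copies them where it is clear.  All other Vd bits are
     preserved.  */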
4157   for (i = 0; i < (full ? 4 : 2); i++)
4158     {
4159       uint32_t vd_val = aarch64_get_vec_u32 (cpu, vd, i);
4160       uint32_t vn_val = aarch64_get_vec_u32 (cpu, vn, i);
4161       uint32_t vm_val = aarch64_get_vec_u32 (cpu, vm, i);
4162       if (test_false)
4163 	aarch64_set_vec_u32 (cpu, vd, i,
4164 			     (vd_val & vm_val) | (vn_val & ~vm_val));
4165       else
4166 	aarch64_set_vec_u32 (cpu, vd, i,
4167 			     (vd_val & ~vm_val) | (vn_val & vm_val));
4168     }
4169 }
4170 
4171 static void
4172 do_vec_ORN (sim_cpu *cpu)
4173 {
4174   /* instr[31]    = 0
4175      instr[30]    = half (0)/full (1)
4176      instr[29,21] = 00 1110 111
4177      instr[20,16] = Vm
4178      instr[15,10] = 00 0111
4179      instr[9,5]   = Vn
4180      instr[4,0]   = Vd.  */
4181 
4182   unsigned vm = INSTR (20, 16);
4183   unsigned vn = INSTR (9, 5);
4184   unsigned vd = INSTR (4, 0);
4185   unsigned i;
4186   int      full = INSTR (30, 30);
4187 
4188   NYI_assert (29, 21, 0x077);
4189   NYI_assert (15, 10, 0x07);
4190 
4191   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4192   for (i = 0; i < (full ? 16 : 8); i++)
4193     aarch64_set_vec_u8 (cpu, vd, i,
4194 			aarch64_get_vec_u8 (cpu, vn, i)
4195 			| ~ aarch64_get_vec_u8 (cpu, vm, i));
4196 }
4197 
4198 static void
4199 do_vec_ORR (sim_cpu *cpu)
4200 {
4201   /* instr[31]    = 0
4202      instr[30]    = half (0)/full (1)
4203      instr[29,21] = 00 1110 101
4204      instr[20,16] = Vm
4205      instr[15,10] = 0001 11
4206      instr[9,5]   = Vn
4207      instr[4,0]   = Vd.  */
4208 
4209   unsigned vm = INSTR (20, 16);
4210   unsigned vn = INSTR (9, 5);
4211   unsigned vd = INSTR (4, 0);
4212   unsigned i;
4213   int      full = INSTR (30, 30);
4214 
4215   NYI_assert (29, 21, 0x075);
4216   NYI_assert (15, 10, 0x07);
4217 
4218   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4219   for (i = 0; i < (full ? 16 : 8); i++)
4220     aarch64_set_vec_u8 (cpu, vd, i,
4221 			aarch64_get_vec_u8 (cpu, vn, i)
4222 			| aarch64_get_vec_u8 (cpu, vm, i));
4223 }
4224 
4225 static void
4226 do_vec_BIC (sim_cpu *cpu)
4227 {
4228   /* instr[31]    = 0
4229      instr[30]    = half (0)/full (1)
4230      instr[29,21] = 00 1110 011
4231      instr[20,16] = Vm
4232      instr[15,10] = 00 0111
4233      instr[9,5]   = Vn
4234      instr[4,0]   = Vd.  */
4235 
4236   unsigned vm = INSTR (20, 16);
4237   unsigned vn = INSTR (9, 5);
4238   unsigned vd = INSTR (4, 0);
4239   unsigned i;
4240   int      full = INSTR (30, 30);
4241 
4242   NYI_assert (29, 21, 0x073);
4243   NYI_assert (15, 10, 0x07);
4244 
4245   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4246   for (i = 0; i < (full ? 16 : 8); i++)
4247     aarch64_set_vec_u8 (cpu, vd, i,
4248 			aarch64_get_vec_u8 (cpu, vn, i)
4249 			& ~ aarch64_get_vec_u8 (cpu, vm, i));
4250 }
4251 
4252 static void
4253 do_vec_XTN (sim_cpu *cpu)
4254 {
4255   /* instr[31]    = 0
4256      instr[30]    = first part (0)/ second part (1)
4257      instr[29,24] = 00 1110
4258      instr[23,22] = size: byte(00), half(01), word (10)
4259      instr[21,10] = 1000 0100 1010
4260      instr[9,5]   = Vs
4261      instr[4,0]   = Vd.  */
4262 
4263   unsigned vs = INSTR (9, 5);
4264   unsigned vd = INSTR (4, 0);
4265   unsigned bias = INSTR (30, 30);
4266   unsigned i;
4267 
4268   NYI_assert (29, 24, 0x0E);
4269   NYI_assert (21, 10, 0x84A);
4270 
4271   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
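  /* For the second-part form (bias == 1) the narrowed values are
     written to the upper half of Vd and the lower half is left
     untouched.  */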
4272   switch (INSTR (23, 22))
4273     {
4274     case 0:
4275       for (i = 0; i < 8; i++)
4276 	aarch64_set_vec_u8 (cpu, vd, i + (bias * 8),
4277 			    aarch64_get_vec_u16 (cpu, vs, i));
4278       return;
4279 
4280     case 1:
4281       for (i = 0; i < 4; i++)
4282 	aarch64_set_vec_u16 (cpu, vd, i + (bias * 4),
4283 			     aarch64_get_vec_u32 (cpu, vs, i));
4284       return;
4285 
4286     case 2:
4287       for (i = 0; i < 2; i++)
4288 	aarch64_set_vec_u32 (cpu, vd, i + (bias * 2),
4289 			     aarch64_get_vec_u64 (cpu, vs, i));
4290       return;
4291     }
4292 }
4293 
4294 /* Return the number of bits set in the input value.  */
4295 #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
4296 # define popcount __builtin_popcount
4297 #else
4298 static int
4299 popcount (unsigned char x)
4300 {
4301   static const unsigned char popcnt[16] =
4302     {
4303       0, 1, 1, 2,
4304       1, 2, 2, 3,
4305       1, 2, 2, 3,
4306       2, 3, 3, 4
4307     };
4308 
4309   /* Only counts the low 8 bits of the input as that is all we need.  */
4310   return popcnt[x % 16] + popcnt[x / 16];
4311 }
4312 #endif
4313 
4314 static void
4315 do_vec_CNT (sim_cpu *cpu)
4316 {
4317   /* instr[31]    = 0
4318      instr[30]    = half (0)/ full (1)
4319      instr[29,24] = 00 1110
4320      instr[23,22] = size: byte(00)
4321      instr[21,10] = 1000 0001 0110
4322      instr[9,5]   = Vs
4323      instr[4,0]   = Vd.  */
4324 
4325   unsigned vs = INSTR (9, 5);
4326   unsigned vd = INSTR (4, 0);
4327   int full = INSTR (30, 30);
4328   int size = INSTR (23, 22);
4329   int i;
4330 
4331   NYI_assert (29, 24, 0x0E);
4332   NYI_assert (21, 10, 0x816);
4333 
4334   if (size != 0)
4335     HALT_UNALLOC;
4336 
4337   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4338 
4339   for (i = 0; i < (full ? 16 : 8); i++)
4340     aarch64_set_vec_u8 (cpu, vd, i,
4341 			popcount (aarch64_get_vec_u8 (cpu, vs, i)));
4342 }
4343 
4344 static void
4345 do_vec_maxv (sim_cpu *cpu)
4346 {
4347   /* instr[31]    = 0
4348      instr[30]    = half(0)/full(1)
4349      instr[29]    = signed (0)/unsigned(1)
4350      instr[28,24] = 0 1110
4351      instr[23,22] = size: byte(00), half(01), word (10)
4352      instr[21]    = 1
4353      instr[20,17] = 1 000
4354      instr[16]    = max(0)/min(1)
4355      instr[15,10] = 1010 10
4356      instr[9,5]   = V source
4357      instr[4,0]   = R dest.  */
4358 
4359   unsigned vs = INSTR (9, 5);
4360   unsigned rd = INSTR (4, 0);
4361   unsigned full = INSTR (30, 30);
4362   unsigned i;
4363 
4364   NYI_assert (28, 24, 0x0E);
4365   NYI_assert (21, 21, 1);
4366   NYI_assert (20, 17, 8);
4367   NYI_assert (15, 10, 0x2A);
4368 
4369   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4370   switch ((INSTR (29, 29) << 1) | INSTR (16, 16))
4371     {
4372     case 0: /* SMAXV.  */
4373       {
4374 	int64_t smax;
4375 	switch (INSTR (23, 22))
4376 	  {
4377 	  case 0:
4378 	    smax = aarch64_get_vec_s8 (cpu, vs, 0);
4379 	    for (i = 1; i < (full ? 16 : 8); i++)
4380 	      smax = max (smax, aarch64_get_vec_s8 (cpu, vs, i));
4381 	    break;
4382 	  case 1:
4383 	    smax = aarch64_get_vec_s16 (cpu, vs, 0);
4384 	    for (i = 1; i < (full ? 8 : 4); i++)
4385 	      smax = max (smax, aarch64_get_vec_s16 (cpu, vs, i));
4386 	    break;
4387 	  case 2:
4388 	    smax = aarch64_get_vec_s32 (cpu, vs, 0);
4389 	    for (i = 1; i < (full ? 4 : 2); i++)
4390 	      smax = max (smax, aarch64_get_vec_s32 (cpu, vs, i));
4391 	    break;
4392 	  case 3:
4393 	    HALT_UNALLOC;
4394 	  }
4395 	aarch64_set_reg_s64 (cpu, rd, NO_SP, smax);
4396 	return;
4397       }
4398 
4399     case 1: /* SMINV.  */
4400       {
4401 	int64_t smin;
4402 	switch (INSTR (23, 22))
4403 	  {
4404 	  case 0:
4405 	    smin = aarch64_get_vec_s8 (cpu, vs, 0);
4406 	    for (i = 1; i < (full ? 16 : 8); i++)
4407 	      smin = min (smin, aarch64_get_vec_s8 (cpu, vs, i));
4408 	    break;
4409 	  case 1:
4410 	    smin = aarch64_get_vec_s16 (cpu, vs, 0);
4411 	    for (i = 1; i < (full ? 8 : 4); i++)
4412 	      smin = min (smin, aarch64_get_vec_s16 (cpu, vs, i));
4413 	    break;
4414 	  case 2:
4415 	    smin = aarch64_get_vec_s32 (cpu, vs, 0);
4416 	    for (i = 1; i < (full ? 4 : 2); i++)
4417 	      smin = min (smin, aarch64_get_vec_s32 (cpu, vs, i));
4418 	    break;
4419 
4420 	  case 3:
4421 	    HALT_UNALLOC;
4422 	  }
4423 	aarch64_set_reg_s64 (cpu, rd, NO_SP, smin);
4424 	return;
4425       }
4426 
4427     case 2: /* UMAXV.  */
4428       {
4429 	uint64_t umax;
4430 	switch (INSTR (23, 22))
4431 	  {
4432 	  case 0:
4433 	    umax = aarch64_get_vec_u8 (cpu, vs, 0);
4434 	    for (i = 1; i < (full ? 16 : 8); i++)
4435 	      umax = max (umax, aarch64_get_vec_u8 (cpu, vs, i));
4436 	    break;
4437 	  case 1:
4438 	    umax = aarch64_get_vec_u16 (cpu, vs, 0);
4439 	    for (i = 1; i < (full ? 8 : 4); i++)
4440 	      umax = max (umax, aarch64_get_vec_u16 (cpu, vs, i));
4441 	    break;
4442 	  case 2:
4443 	    umax = aarch64_get_vec_u32 (cpu, vs, 0);
4444 	    for (i = 1; i < (full ? 4 : 2); i++)
4445 	      umax = max (umax, aarch64_get_vec_u32 (cpu, vs, i));
4446 	    break;
4447 
4448 	  case 3:
4449 	    HALT_UNALLOC;
4450 	  }
4451 	aarch64_set_reg_u64 (cpu, rd, NO_SP, umax);
4452 	return;
4453       }
4454 
4455     case 3: /* UMINV.  */
4456       {
4457 	uint64_t umin;
4458 	switch (INSTR (23, 22))
4459 	  {
4460 	  case 0:
4461 	    umin = aarch64_get_vec_u8 (cpu, vs, 0);
4462 	    for (i = 1; i < (full ? 16 : 8); i++)
4463 	      umin = min (umin, aarch64_get_vec_u8 (cpu, vs, i));
4464 	    break;
4465 	  case 1:
4466 	    umin = aarch64_get_vec_u16 (cpu, vs, 0);
4467 	    for (i = 1; i < (full ? 8 : 4); i++)
4468 	      umin = min (umin, aarch64_get_vec_u16 (cpu, vs, i));
4469 	    break;
4470 	  case 2:
4471 	    umin = aarch64_get_vec_u32 (cpu, vs, 0);
4472 	    for (i = 1; i < (full ? 4 : 2); i++)
4473 	      umin = min (umin, aarch64_get_vec_u32 (cpu, vs, i));
4474 	    break;
4475 
4476 	  case 3:
4477 	    HALT_UNALLOC;
4478 	  }
4479 	aarch64_set_reg_u64 (cpu, rd, NO_SP, umin);
4480 	return;
4481       }
4482     }
4483 }
4484 
4485 static void
4486 do_vec_fminmaxV (sim_cpu *cpu)
4487 {
4488   /* instr[31,24] = 0110 1110
4489      instr[23]    = max(0)/min(1)
4490      instr[22,14] = 011 0000 11
4491      instr[13,12] = nm(00)/normal(11)
4492      instr[11,10] = 10
4493      instr[9,5]   = V source
4494      instr[4,0]   = R dest.  */
4495 
4496   unsigned vs = INSTR (9, 5);
4497   unsigned rd = INSTR (4, 0);
4498   unsigned i;
4499   float res   = aarch64_get_vec_float (cpu, vs, 0);
4500 
4501   NYI_assert (31, 24, 0x6E);
4502   NYI_assert (22, 14, 0x0C3);
4503   NYI_assert (11, 10, 2);
4504 
4505   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4506   if (INSTR (23, 23))
4507     {
4508       switch (INSTR (13, 12))
4509 	{
4510 	case 0: /* FMINNMV.  */
4511 	  for (i = 1; i < 4; i++)
4512 	    res = fminnm (res, aarch64_get_vec_float (cpu, vs, i));
4513 	  break;
4514 
4515 	case 3: /* FMINV.  */
4516 	  for (i = 1; i < 4; i++)
4517 	    res = min (res, aarch64_get_vec_float (cpu, vs, i));
4518 	  break;
4519 
4520 	default:
4521 	  HALT_NYI;
4522 	}
4523     }
4524   else
4525     {
4526       switch (INSTR (13, 12))
4527 	{
4528 	case 0: /* FMAXNMV.  */
4529 	  for (i = 1; i < 4; i++)
4530 	    res = fmaxnm (res, aarch64_get_vec_float (cpu, vs, i));
4531 	  break;
4532 
4533 	case 3: /* FMAXV.  */
4534 	  for (i = 1; i < 4; i++)
4535 	    res = max (res, aarch64_get_vec_float (cpu, vs, i));
4536 	  break;
4537 
4538 	default:
4539 	  HALT_NYI;
4540 	}
4541     }
4542 
4543   aarch64_set_FP_float (cpu, rd, res);
4544 }
4545 
4546 static void
4547 do_vec_Fminmax (sim_cpu *cpu)
4548 {
4549   /* instr[31]    = 0
4550      instr[30]    = half(0)/full(1)
4551      instr[29,24] = 00 1110
4552      instr[23]    = max(0)/min(1)
4553      instr[22]    = float(0)/double(1)
4554      instr[21]    = 1
4555      instr[20,16] = Vm
4556      instr[15,14] = 11
4557      instr[13,12] = nm(00)/normal(11)
4558      instr[11,10] = 01
4559      instr[9,5]   = Vn
4560      instr[4,0]   = Vd.  */
4561 
4562   unsigned vm = INSTR (20, 16);
4563   unsigned vn = INSTR (9, 5);
4564   unsigned vd = INSTR (4, 0);
4565   unsigned full = INSTR (30, 30);
4566   unsigned min = INSTR (23, 23);
4567   unsigned i;
4568 
4569   NYI_assert (29, 24, 0x0E);
4570   NYI_assert (21, 21, 1);
4571   NYI_assert (15, 14, 3);
4572   NYI_assert (11, 10, 1);
4573 
4574   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4575   if (INSTR (22, 22))
4576     {
4577       double (* func)(double, double);
4578 
4579       if (! full)
4580 	HALT_NYI;
4581 
4582       if (INSTR (13, 12) == 0)
4583 	func = min ? dminnm : dmaxnm;
4584       else if (INSTR (13, 12) == 3)
4585 	func = min ? fmin : fmax;
4586       else
4587 	HALT_NYI;
4588 
4589       for (i = 0; i < 2; i++)
4590 	aarch64_set_vec_double (cpu, vd, i,
4591 				func (aarch64_get_vec_double (cpu, vn, i),
4592 				      aarch64_get_vec_double (cpu, vm, i)));
4593     }
4594   else
4595     {
4596       float (* func)(float, float);
4597 
4598       if (INSTR (13, 12) == 0)
4599 	func = min ? fminnm : fmaxnm;
4600       else if (INSTR (13, 12) == 3)
4601 	func = min ? fminf : fmaxf;
4602       else
4603 	HALT_NYI;
4604 
4605       for (i = 0; i < (full ? 4 : 2); i++)
4606 	aarch64_set_vec_float (cpu, vd, i,
4607 			       func (aarch64_get_vec_float (cpu, vn, i),
4608 				     aarch64_get_vec_float (cpu, vm, i)));
4609     }
4610 }
4611 
4612 static void
4613 do_vec_SCVTF (sim_cpu *cpu)
4614 {
4615   /* instr[31]    = 0
4616      instr[30]    = Q
4617      instr[29,23] = 00 1110 0
4618      instr[22]    = float(0)/double(1)
4619      instr[21,10] = 10 0001 1101 10
4620      instr[9,5]   = Vn
4621      instr[4,0]   = Vd.  */
4622 
4623   unsigned vn = INSTR (9, 5);
4624   unsigned vd = INSTR (4, 0);
4625   unsigned full = INSTR (30, 30);
4626   unsigned size = INSTR (22, 22);
4627   unsigned i;
4628 
4629   NYI_assert (29, 23, 0x1C);
4630   NYI_assert (21, 10, 0x876);
4631 
4632   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4633   if (size)
4634     {
4635       if (! full)
4636 	HALT_UNALLOC;
4637 
4638       for (i = 0; i < 2; i++)
4639 	{
4640 	  double val = (double) aarch64_get_vec_s64 (cpu, vn, i);
4641 	  aarch64_set_vec_double (cpu, vd, i, val);
4642 	}
4643     }
4644   else
4645     {
4646       for (i = 0; i < (full ? 4 : 2); i++)
4647 	{
4648 	  float val = (float) aarch64_get_vec_s32 (cpu, vn, i);
4649 	  aarch64_set_vec_float (cpu, vd, i, val);
4650 	}
4651     }
4652 }
4653 
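/* Element-wise comparison helpers: each destination lane is set to all
   ones when the comparison holds and to zero otherwise.  */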
4654 #define VEC_CMP(SOURCE, CMP)						\
4655   do									\
4656     {									\
4657       switch (size)							\
4658 	{								\
4659 	case 0:								\
4660 	  for (i = 0; i < (full ? 16 : 8); i++)				\
4661 	    aarch64_set_vec_u8 (cpu, vd, i,				\
4662 				aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
4663 				CMP					\
4664 				aarch64_get_vec_##SOURCE##8 (cpu, vm, i) \
4665 				? -1 : 0);				\
4666 	  return;							\
4667 	case 1:								\
4668 	  for (i = 0; i < (full ? 8 : 4); i++)				\
4669 	    aarch64_set_vec_u16 (cpu, vd, i,				\
4670 				 aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
4671 				 CMP					\
4672 				 aarch64_get_vec_##SOURCE##16 (cpu, vm, i) \
4673 				 ? -1 : 0);				\
4674 	  return;							\
4675 	case 2:								\
4676 	  for (i = 0; i < (full ? 4 : 2); i++)				\
4677 	    aarch64_set_vec_u32 (cpu, vd, i, \
4678 				 aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
4679 				 CMP					\
4680 				 aarch64_get_vec_##SOURCE##32 (cpu, vm, i) \
4681 				 ? -1 : 0);				\
4682 	  return;							\
4683 	case 3:								\
4684 	  if (! full)							\
4685 	    HALT_UNALLOC;						\
4686 	  for (i = 0; i < 2; i++)					\
4687 	    aarch64_set_vec_u64 (cpu, vd, i, \
4688 				 aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
4689 				 CMP					\
4690 				 aarch64_get_vec_##SOURCE##64 (cpu, vm, i) \
4691 				 ? -1ULL : 0);				\
4692 	  return;							\
4693 	}								\
4694     }									\
4695   while (0)
4696 
4697 #define VEC_CMP0(SOURCE, CMP)						\
4698   do									\
4699     {									\
4700       switch (size)							\
4701 	{								\
4702 	case 0:								\
4703 	  for (i = 0; i < (full ? 16 : 8); i++)				\
4704 	    aarch64_set_vec_u8 (cpu, vd, i,				\
4705 				aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
4706 				CMP 0 ? -1 : 0);			\
4707 	  return;							\
4708 	case 1:								\
4709 	  for (i = 0; i < (full ? 8 : 4); i++)				\
4710 	    aarch64_set_vec_u16 (cpu, vd, i,				\
4711 				 aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
4712 				 CMP 0 ? -1 : 0);			\
4713 	  return;							\
4714 	case 2:								\
4715 	  for (i = 0; i < (full ? 4 : 2); i++)				\
4716 	    aarch64_set_vec_u32 (cpu, vd, i,				\
4717 				 aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
4718 				 CMP 0 ? -1 : 0);			\
4719 	  return;							\
4720 	case 3:								\
4721 	  if (! full)							\
4722 	    HALT_UNALLOC;						\
4723 	  for (i = 0; i < 2; i++)					\
4724 	    aarch64_set_vec_u64 (cpu, vd, i,				\
4725 				 aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
4726 				 CMP 0 ? -1ULL : 0);			\
4727 	  return;							\
4728 	}								\
4729     }									\
4730   while (0)
4731 
4732 #define VEC_FCMP0(CMP)							\
4733   do									\
4734     {									\
4735       if (vm != 0)							\
4736 	HALT_NYI;							\
4737       if (INSTR (22, 22))						\
4738 	{								\
4739 	  if (! full)							\
4740 	    HALT_NYI;							\
4741 	  for (i = 0; i < 2; i++)					\
4742 	    aarch64_set_vec_u64 (cpu, vd, i,				\
4743 				 aarch64_get_vec_double (cpu, vn, i)	\
4744 				 CMP 0.0 ? -1 : 0);			\
4745 	}								\
4746       else								\
4747 	{								\
4748 	  for (i = 0; i < (full ? 4 : 2); i++)				\
4749 	    aarch64_set_vec_u32 (cpu, vd, i,				\
4750 				 aarch64_get_vec_float (cpu, vn, i)	\
4751 				 CMP 0.0 ? -1 : 0);			\
4752 	}								\
4753       return;								\
4754     }									\
4755   while (0)
4756 
4757 #define VEC_FCMP(CMP)							\
4758   do									\
4759     {									\
4760       if (INSTR (22, 22))						\
4761 	{								\
4762 	  if (! full)							\
4763 	    HALT_NYI;							\
4764 	  for (i = 0; i < 2; i++)					\
4765 	    aarch64_set_vec_u64 (cpu, vd, i,				\
4766 				 aarch64_get_vec_double (cpu, vn, i)	\
4767 				 CMP					\
4768 				 aarch64_get_vec_double (cpu, vm, i)	\
4769 				 ? -1 : 0);				\
4770 	}								\
4771       else								\
4772 	{								\
4773 	  for (i = 0; i < (full ? 4 : 2); i++)				\
4774 	    aarch64_set_vec_u32 (cpu, vd, i,				\
4775 				 aarch64_get_vec_float (cpu, vn, i)	\
4776 				 CMP					\
4777 				 aarch64_get_vec_float (cpu, vm, i)	\
4778 				 ? -1 : 0);				\
4779 	}								\
4780       return;								\
4781     }									\
4782   while (0)
4783 
4784 static void
4785 do_vec_compare (sim_cpu *cpu)
4786 {
4787   /* instr[31]    = 0
4788      instr[30]    = half(0)/full(1)
4789      instr[29]    = part-of-comparison-type
4790      instr[28,24] = 0 1110
4791      instr[23,22] = size of integer compares: byte(00), half(01), word (10), long (11)
4792      instr[22]    = type of float compares: single (0) / double (1)
4793      instr[21]    = 1
4794      instr[20,16] = Vm or 00000 (compare vs 0)
4795      instr[15,10] = part-of-comparison-type
4796      instr[9,5]   = Vn
4797      instr[4,0]   = Vd.  */
4798 
4799   int full = INSTR (30, 30);
4800   int size = INSTR (23, 22);
4801   unsigned vm = INSTR (20, 16);
4802   unsigned vn = INSTR (9, 5);
4803   unsigned vd = INSTR (4, 0);
4804   unsigned i;
4805 
4806   NYI_assert (28, 24, 0x0E);
4807   NYI_assert (21, 21, 1);
4808 
4809   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4810   if ((INSTR (11, 11)
4811        && INSTR (14, 14))
4812       || ((INSTR (11, 11) == 0
4813 	   && INSTR (10, 10) == 0)))
4814     {
4815       /* A compare vs 0.  */
4816       if (vm != 0)
4817 	{
4818 	  if (INSTR (15, 10) == 0x2A)
4819 	    do_vec_maxv (cpu);
4820 	  else if (INSTR (15, 10) == 0x32
4821 		   || INSTR (15, 10) == 0x3E)
4822 	    do_vec_fminmaxV (cpu);
4823 	  else if (INSTR (29, 23) == 0x1C
4824 		   && INSTR (21, 10) == 0x876)
4825 	    do_vec_SCVTF (cpu);
4826 	  else
4827 	    HALT_NYI;
4828 	  return;
4829 	}
4830     }
4831 
4832   if (INSTR (14, 14))
4833     {
4834       /* A floating point compare.  */
4835       unsigned decode = (INSTR (29, 29) << 5) | (INSTR (23, 23) << 4)
4836 	| INSTR (13, 10);
4837 
4838       NYI_assert (15, 15, 1);
4839 
4840       switch (decode)
4841 	{
4842 	case /* 0b010010: GT#0 */ 0x12: VEC_FCMP0 (>);
4843 	case /* 0b110010: GE#0 */ 0x32: VEC_FCMP0 (>=);
4844 	case /* 0b010110: EQ#0 */ 0x16: VEC_FCMP0 (==);
4845 	case /* 0b110110: LE#0 */ 0x36: VEC_FCMP0 (<=);
4846 	case /* 0b011010: LT#0 */ 0x1A: VEC_FCMP0 (<);
4847 	case /* 0b111001: GT */   0x39: VEC_FCMP  (>);
4848 	case /* 0b101001: GE */   0x29: VEC_FCMP  (>=);
4849 	case /* 0b001001: EQ */   0x09: VEC_FCMP  (==);
4850 
4851 	default:
4852 	  HALT_NYI;
4853 	}
4854     }
4855   else
4856     {
4857       unsigned decode = (INSTR (29, 29) << 6) | INSTR (15, 10);
4858 
4859       switch (decode)
4860 	{
4861 	case 0x0D: /* 0001101 GT */     VEC_CMP  (s, > );
4862 	case 0x0F: /* 0001111 GE */     VEC_CMP  (s, >= );
4863 	case 0x22: /* 0100010 GT #0 */  VEC_CMP0 (s, > );
4864 	case 0x23: /* 0100011 TST */	VEC_CMP  (u, & );
4865 	case 0x26: /* 0100110 EQ #0 */  VEC_CMP0 (s, == );
4866 	case 0x2A: /* 0101010 LT #0 */  VEC_CMP0 (s, < );
4867 	case 0x4D: /* 1001101 HI */     VEC_CMP  (u, > );
4868 	case 0x4F: /* 1001111 HS */     VEC_CMP  (u, >= );
4869 	case 0x62: /* 1100010 GE #0 */  VEC_CMP0 (s, >= );
4870 	case 0x63: /* 1100011 EQ */     VEC_CMP  (u, == );
4871 	case 0x66: /* 1100110 LE #0 */  VEC_CMP0 (s, <= );
4872 	default:
4873 	  if (vm == 0)
4874 	    HALT_NYI;
4875 	  do_vec_maxv (cpu);
4876 	}
4877     }
4878 }
4879 
4880 static void
4881 do_vec_SSHL (sim_cpu *cpu)
4882 {
4883   /* instr[31]    = 0
4884      instr[30]    = half (0)/full (1)
4885      instr[29,24] = 00 1110
4886      instr[23,22] = size: byte(00), half(01), word (10), long (11)
4887      instr[21]    = 1
4888      instr[20,16] = Vm
4889      instr[15,10] = 0100 01
4890      instr[9,5]   = Vn
4891      instr[4,0]   = Vd.  */
4892 
4893   unsigned full = INSTR (30, 30);
4894   unsigned vm = INSTR (20, 16);
4895   unsigned vn = INSTR (9, 5);
4896   unsigned vd = INSTR (4, 0);
4897   unsigned i;
4898   signed int shift;
4899 
4900   NYI_assert (29, 24, 0x0E);
4901   NYI_assert (21, 21, 1);
4902   NYI_assert (15, 10, 0x11);
4903 
4904   /* A negative per-element shift amount in Vm shifts the Vn element right.  */
4905 
4906   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4907   switch (INSTR (23, 22))
4908     {
4909     case 0:
4910       for (i = 0; i < (full ? 16 : 8); i++)
4911 	{
4912 	  shift = aarch64_get_vec_s8 (cpu, vm, i);
4913 	  if (shift >= 0)
4914 	    aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i)
4915 				<< shift);
4916 	  else
4917 	    aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i)
4918 				>> - shift);
4919 	}
4920       return;
4921 
4922     case 1:
4923       for (i = 0; i < (full ? 8 : 4); i++)
4924 	{
4925 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
4926 	  if (shift >= 0)
4927 	    aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i)
4928 				 << shift);
4929 	  else
4930 	    aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i)
4931 				 >> - shift);
4932 	}
4933       return;
4934 
4935     case 2:
4936       for (i = 0; i < (full ? 4 : 2); i++)
4937 	{
4938 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
4939 	  if (shift >= 0)
4940 	    aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i)
4941 				 << shift);
4942 	  else
4943 	    aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i)
4944 				 >> - shift);
4945 	}
4946       return;
4947 
4948     case 3:
4949       if (! full)
4950 	HALT_UNALLOC;
4951       for (i = 0; i < 2; i++)
4952 	{
4953 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
4954 	  if (shift >= 0)
4955 	    aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i)
4956 				 << shift);
4957 	  else
4958 	    aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i)
4959 				 >> - shift);
4960 	}
4961       return;
4962     }
4963 }
4964 
4965 static void
4966 do_vec_USHL (sim_cpu *cpu)
4967 {
4968   /* instr[31]    = 0
4969      instr[30]    = half (0)/full (1)
4970      instr[29,24] = 10 1110
4971      instr[23,22] = size: byte(00), half(01), word (10), long (11)
4972      instr[21]    = 1
4973      instr[20,16] = Vm
4974      instr[15,10] = 0100 01
4975      instr[9,5]   = Vn
4976      instr[4,0]   = Vd.  */
4977 
4978   unsigned full = INSTR (30, 30);
4979   unsigned vm = INSTR (20, 16);
4980   unsigned vn = INSTR (9, 5);
4981   unsigned vd = INSTR (4, 0);
4982   unsigned i;
4983   signed int shift;
4984 
4985   NYI_assert (29, 24, 0x2E);
4986   NYI_assert (15, 10, 0x11);
4987 
4988   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4989   switch (INSTR (23, 22))
4990     {
4991     case 0:
4992       for (i = 0; i < (full ? 16 : 8); i++)
4993 	{
4994 	  shift = aarch64_get_vec_s8 (cpu, vm, i);
4995 	  if (shift >= 0)
4996 	    aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
4997 				<< shift);
4998 	  else
4999 	    aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
5000 				>> - shift);
5001 	}
5002       return;
5003 
5004     case 1:
5005       for (i = 0; i < (full ? 8 : 4); i++)
5006 	{
5007 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
5008 	  if (shift >= 0)
5009 	    aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
5010 				 << shift);
5011 	  else
5012 	    aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
5013 				 >> - shift);
5014 	}
5015       return;
5016 
5017     case 2:
5018       for (i = 0; i < (full ? 4 : 2); i++)
5019 	{
5020 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
5021 	  if (shift >= 0)
5022 	    aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
5023 				 << shift);
5024 	  else
5025 	    aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
5026 				 >> - shift);
5027 	}
5028       return;
5029 
5030     case 3:
5031       if (! full)
5032 	HALT_UNALLOC;
5033       for (i = 0; i < 2; i++)
5034 	{
5035 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
5036 	  if (shift >= 0)
5037 	    aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i)
5038 				 << shift);
5039 	  else
5040 	    aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i)
5041 				 >> - shift);
5042 	}
5043       return;
5044     }
5045 }
5046 
5047 static void
5048 do_vec_FMLA (sim_cpu *cpu)
5049 {
5050   /* instr[31]    = 0
5051      instr[30]    = full/half selector
5052      instr[29,23] = 0011100
5053      instr[22]    = size: 0=>float, 1=>double
5054      instr[21]    = 1
5055      instr[20,16] = Vm
5056      instr[15,10] = 1100 11
5057      instr[9,5]   = Vn
5058      instr[4,0]   = Vd.  */
5059 
5060   unsigned vm = INSTR (20, 16);
5061   unsigned vn = INSTR (9, 5);
5062   unsigned vd = INSTR (4, 0);
5063   unsigned i;
5064   int      full = INSTR (30, 30);
5065 
5066   NYI_assert (29, 23, 0x1C);
5067   NYI_assert (21, 21, 1);
5068   NYI_assert (15, 10, 0x33);
5069 
5070   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
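  /* Note: the multiply and the add are performed as separate steps
     here, so the result may differ from a true fused multiply-add in
     the last bit.  */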
5071   if (INSTR (22, 22))
5072     {
5073       if (! full)
5074 	HALT_UNALLOC;
5075       for (i = 0; i < 2; i++)
5076 	aarch64_set_vec_double (cpu, vd, i,
5077 				aarch64_get_vec_double (cpu, vn, i) *
5078 				aarch64_get_vec_double (cpu, vm, i) +
5079 				aarch64_get_vec_double (cpu, vd, i));
5080     }
5081   else
5082     {
5083       for (i = 0; i < (full ? 4 : 2); i++)
5084 	aarch64_set_vec_float (cpu, vd, i,
5085 			       aarch64_get_vec_float (cpu, vn, i) *
5086 			       aarch64_get_vec_float (cpu, vm, i) +
5087 			       aarch64_get_vec_float (cpu, vd, i));
5088     }
5089 }
5090 
5091 static void
5092 do_vec_max (sim_cpu *cpu)
5093 {
5094   /* instr[31]    = 0
5095      instr[30]    = full/half selector
5096      instr[29]    = SMAX (0) / UMAX (1)
5097      instr[28,24] = 0 1110
5098      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
5099      instr[21]    = 1
5100      instr[20,16] = Vm
5101      instr[15,10] = 0110 01
5102      instr[9,5]   = Vn
5103      instr[4,0]   = Vd.  */
5104 
5105   unsigned vm = INSTR (20, 16);
5106   unsigned vn = INSTR (9, 5);
5107   unsigned vd = INSTR (4, 0);
5108   unsigned i;
5109   int      full = INSTR (30, 30);
5110 
5111   NYI_assert (28, 24, 0x0E);
5112   NYI_assert (21, 21, 1);
5113   NYI_assert (15, 10, 0x19);
5114 
5115   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5116   if (INSTR (29, 29))
5117     {
5118       switch (INSTR (23, 22))
5119 	{
5120 	case 0:
5121 	  for (i = 0; i < (full ? 16 : 8); i++)
5122 	    aarch64_set_vec_u8 (cpu, vd, i,
5123 				aarch64_get_vec_u8 (cpu, vn, i)
5124 				> aarch64_get_vec_u8 (cpu, vm, i)
5125 				? aarch64_get_vec_u8 (cpu, vn, i)
5126 				: aarch64_get_vec_u8 (cpu, vm, i));
5127 	  return;
5128 
5129 	case 1:
5130 	  for (i = 0; i < (full ? 8 : 4); i++)
5131 	    aarch64_set_vec_u16 (cpu, vd, i,
5132 				 aarch64_get_vec_u16 (cpu, vn, i)
5133 				 > aarch64_get_vec_u16 (cpu, vm, i)
5134 				 ? aarch64_get_vec_u16 (cpu, vn, i)
5135 				 : aarch64_get_vec_u16 (cpu, vm, i));
5136 	  return;
5137 
5138 	case 2:
5139 	  for (i = 0; i < (full ? 4 : 2); i++)
5140 	    aarch64_set_vec_u32 (cpu, vd, i,
5141 				 aarch64_get_vec_u32 (cpu, vn, i)
5142 				 > aarch64_get_vec_u32 (cpu, vm, i)
5143 				 ? aarch64_get_vec_u32 (cpu, vn, i)
5144 				 : aarch64_get_vec_u32 (cpu, vm, i));
5145 	  return;
5146 
5147 	case 3:
5148 	  HALT_UNALLOC;
5149 	}
5150     }
5151   else
5152     {
5153       switch (INSTR (23, 22))
5154 	{
5155 	case 0:
5156 	  for (i = 0; i < (full ? 16 : 8); i++)
5157 	    aarch64_set_vec_s8 (cpu, vd, i,
5158 				aarch64_get_vec_s8 (cpu, vn, i)
5159 				> aarch64_get_vec_s8 (cpu, vm, i)
5160 				? aarch64_get_vec_s8 (cpu, vn, i)
5161 				: aarch64_get_vec_s8 (cpu, vm, i));
5162 	  return;
5163 
5164 	case 1:
5165 	  for (i = 0; i < (full ? 8 : 4); i++)
5166 	    aarch64_set_vec_s16 (cpu, vd, i,
5167 				 aarch64_get_vec_s16 (cpu, vn, i)
5168 				 > aarch64_get_vec_s16 (cpu, vm, i)
5169 				 ? aarch64_get_vec_s16 (cpu, vn, i)
5170 				 : aarch64_get_vec_s16 (cpu, vm, i));
5171 	  return;
5172 
5173 	case 2:
5174 	  for (i = 0; i < (full ? 4 : 2); i++)
5175 	    aarch64_set_vec_s32 (cpu, vd, i,
5176 				 aarch64_get_vec_s32 (cpu, vn, i)
5177 				 > aarch64_get_vec_s32 (cpu, vm, i)
5178 				 ? aarch64_get_vec_s32 (cpu, vn, i)
5179 				 : aarch64_get_vec_s32 (cpu, vm, i));
5180 	  return;
5181 
5182 	case 3:
5183 	  HALT_UNALLOC;
5184 	}
5185     }
5186 }
5187 
5188 static void
5189 do_vec_min (sim_cpu *cpu)
5190 {
5191   /* instr[31]    = 0
5192      instr[30]    = full/half selector
5193      instr[29]    = SMIN (0) / UMIN (1)
5194      instr[28,24] = 0 1110
5195      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
5196      instr[21]    = 1
5197      instr[20,16] = Vm
5198      instr[15,10] = 0110 11
5199      instr[9,5]   = Vn
5200      instr[4,0]   = Vd.  */
5201 
5202   unsigned vm = INSTR (20, 16);
5203   unsigned vn = INSTR (9, 5);
5204   unsigned vd = INSTR (4, 0);
5205   unsigned i;
5206   int      full = INSTR (30, 30);
5207 
5208   NYI_assert (28, 24, 0x0E);
5209   NYI_assert (21, 21, 1);
5210   NYI_assert (15, 10, 0x1B);
5211 
5212   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5213   if (INSTR (29, 29))
5214     {
5215       switch (INSTR (23, 22))
5216 	{
5217 	case 0:
5218 	  for (i = 0; i < (full ? 16 : 8); i++)
5219 	    aarch64_set_vec_u8 (cpu, vd, i,
5220 				aarch64_get_vec_u8 (cpu, vn, i)
5221 				< aarch64_get_vec_u8 (cpu, vm, i)
5222 				? aarch64_get_vec_u8 (cpu, vn, i)
5223 				: aarch64_get_vec_u8 (cpu, vm, i));
5224 	  return;
5225 
5226 	case 1:
5227 	  for (i = 0; i < (full ? 8 : 4); i++)
5228 	    aarch64_set_vec_u16 (cpu, vd, i,
5229 				 aarch64_get_vec_u16 (cpu, vn, i)
5230 				 < aarch64_get_vec_u16 (cpu, vm, i)
5231 				 ? aarch64_get_vec_u16 (cpu, vn, i)
5232 				 : aarch64_get_vec_u16 (cpu, vm, i));
5233 	  return;
5234 
5235 	case 2:
5236 	  for (i = 0; i < (full ? 4 : 2); i++)
5237 	    aarch64_set_vec_u32 (cpu, vd, i,
5238 				 aarch64_get_vec_u32 (cpu, vn, i)
5239 				 < aarch64_get_vec_u32 (cpu, vm, i)
5240 				 ? aarch64_get_vec_u32 (cpu, vn, i)
5241 				 : aarch64_get_vec_u32 (cpu, vm, i));
5242 	  return;
5243 
5244 	case 3:
5245 	  HALT_UNALLOC;
5246 	}
5247     }
5248   else
5249     {
5250       switch (INSTR (23, 22))
5251 	{
5252 	case 0:
5253 	  for (i = 0; i < (full ? 16 : 8); i++)
5254 	    aarch64_set_vec_s8 (cpu, vd, i,
5255 				aarch64_get_vec_s8 (cpu, vn, i)
5256 				< aarch64_get_vec_s8 (cpu, vm, i)
5257 				? aarch64_get_vec_s8 (cpu, vn, i)
5258 				: aarch64_get_vec_s8 (cpu, vm, i));
5259 	  return;
5260 
5261 	case 1:
5262 	  for (i = 0; i < (full ? 8 : 4); i++)
5263 	    aarch64_set_vec_s16 (cpu, vd, i,
5264 				 aarch64_get_vec_s16 (cpu, vn, i)
5265 				 < aarch64_get_vec_s16 (cpu, vm, i)
5266 				 ? aarch64_get_vec_s16 (cpu, vn, i)
5267 				 : aarch64_get_vec_s16 (cpu, vm, i));
5268 	  return;
5269 
5270 	case 2:
5271 	  for (i = 0; i < (full ? 4 : 2); i++)
5272 	    aarch64_set_vec_s32 (cpu, vd, i,
5273 				 aarch64_get_vec_s32 (cpu, vn, i)
5274 				 < aarch64_get_vec_s32 (cpu, vm, i)
5275 				 ? aarch64_get_vec_s32 (cpu, vn, i)
5276 				 : aarch64_get_vec_s32 (cpu, vm, i));
5277 	  return;
5278 
5279 	case 3:
5280 	  HALT_UNALLOC;
5281 	}
5282     }
5283 }
5284 
5285 static void
5286 do_vec_sub_long (sim_cpu *cpu)
5287 {
5288   /* instr[31]    = 0
5289      instr[30]    = lower (0) / upper (1)
5290      instr[29]    = signed (0) / unsigned (1)
5291      instr[28,24] = 0 1110
5292      instr[23,22] = size: bytes (00), half (01), word (10)
5293      instr[21]    = 1
5294      instr[20,16] = Vm
5295      instr[15,10] = 0010 00
5296      instr[9,5]   = Vn
5297      instr[4,0]   = V dest.  */
5298 
5299   unsigned size = INSTR (23, 22);
5300   unsigned vm = INSTR (20, 16);
5301   unsigned vn = INSTR (9, 5);
5302   unsigned vd = INSTR (4, 0);
5303   unsigned bias = 0;
5304   unsigned i;
5305 
5306   NYI_assert (28, 24, 0x0E);
5307   NYI_assert (21, 21, 1);
5308   NYI_assert (15, 10, 0x08);
5309 
5310   if (size == 3)
5311     HALT_UNALLOC;
5312 
5313   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5314   switch (INSTR (30, 29))
5315     {
5316     case 2: /* SSUBL2.  */
5317       bias = 2;
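      /* Fall through.  */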
5318     case 0: /* SSUBL.  */
5319       switch (size)
5320 	{
5321 	case 0:
5322 	  bias *= 4;	/* Upper half starts at element 8.  */
5323 	  for (i = 0; i < 8; i++)
5324 	    aarch64_set_vec_s16 (cpu, vd, i,
5325 				 aarch64_get_vec_s8 (cpu, vn, i + bias)
5326 				 - aarch64_get_vec_s8 (cpu, vm, i + bias));
5327 	  break;
5328 
5329 	case 1:
5330 	  bias *= 2;
5331 	  for (i = 0; i < 4; i++)
5332 	    aarch64_set_vec_s32 (cpu, vd, i,
5333 				 aarch64_get_vec_s16 (cpu, vn, i + bias)
5334 				 - aarch64_get_vec_s16 (cpu, vm, i + bias));
5335 	  break;
5336 
5337 	case 2:
5338 	  for (i = 0; i < 2; i++)
5339 	    aarch64_set_vec_s64 (cpu, vd, i,
5340 				 aarch64_get_vec_s32 (cpu, vn, i + bias)
5341 				 - aarch64_get_vec_s32 (cpu, vm, i + bias));
5342 	  break;
5343 
5344 	default:
5345 	  HALT_UNALLOC;
5346 	}
5347       break;
5348 
5349     case 3: /* USUBL2.  */
5350       bias = 2;
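      /* Fall through.  */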
5351     case 1: /* USUBL.  */
5352       switch (size)
5353 	{
5354 	case 0:
5355 	  bias *= 4;	/* Upper half starts at element 8.  */
5356 	  for (i = 0; i < 8; i++)
5357 	    aarch64_set_vec_u16 (cpu, vd, i,
5358 				 aarch64_get_vec_u8 (cpu, vn, i + bias)
5359 				 - aarch64_get_vec_u8 (cpu, vm, i + bias));
5360 	  break;
5361 
5362 	case 1:
5363 	  bias *= 2;
5364 	  for (i = 0; i < 4; i++)
5365 	    aarch64_set_vec_u32 (cpu, vd, i,
5366 				 aarch64_get_vec_u16 (cpu, vn, i + bias)
5367 				 - aarch64_get_vec_u16 (cpu, vm, i + bias));
5368 	  break;
5369 
5370 	case 2:
5371 	  for (i = 0; i < 2; i++)
5372 	    aarch64_set_vec_u64 (cpu, vd, i,
5373 				 aarch64_get_vec_u32 (cpu, vn, i + bias)
5374 				 - aarch64_get_vec_u32 (cpu, vm, i + bias));
5375 	  break;
5376 
5377 	default:
5378 	  HALT_UNALLOC;
5379 	}
5380       break;
5381     }
5382 }
5383 
5384 static void
5385 do_vec_ADDP (sim_cpu *cpu)
5386 {
5387   /* instr[31]    = 0
5388      instr[30]    = half(0)/full(1)
5389      instr[29,24] = 00 1110
5390      instr[23,22] = size: bytes (00), half (01), word (10), long (11)
5391      instr[21]    = 1
5392      instr[20,16] = Vm
5393      instr[15,10] = 1011 11
5394      instr[9,5]   = Vn
5395      instr[4,0]   = V dest.  */
5396 
5397   FRegister copy_vn;
5398   FRegister copy_vm;
5399   unsigned full = INSTR (30, 30);
5400   unsigned size = INSTR (23, 22);
5401   unsigned vm = INSTR (20, 16);
5402   unsigned vn = INSTR (9, 5);
5403   unsigned vd = INSTR (4, 0);
5404   unsigned i, range;
5405 
5406   NYI_assert (29, 24, 0x0E);
5407   NYI_assert (21, 21, 1);
5408   NYI_assert (15, 10, 0x2F);
5409 
5410   /* Make copies of the source registers in case vd == vn/vm.  */
5411   copy_vn = cpu->fr[vn];
5412   copy_vm = cpu->fr[vm];
5413 
5414   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5415   switch (size)
5416     {
5417     case 0:
5418       range = full ? 8 : 4;
5419       for (i = 0; i < range; i++)
5420 	{
5421 	  aarch64_set_vec_u8 (cpu, vd, i,
5422 			      copy_vn.b[i * 2] + copy_vn.b[i * 2 + 1]);
5423 	  aarch64_set_vec_u8 (cpu, vd, i + range,
5424 			      copy_vm.b[i * 2] + copy_vm.b[i * 2 + 1]);
5425 	}
5426       return;
5427 
5428     case 1:
5429       range = full ? 4 : 2;
5430       for (i = 0; i < range; i++)
5431 	{
5432 	  aarch64_set_vec_u16 (cpu, vd, i,
5433 			       copy_vn.h[i * 2] + copy_vn.h[i * 2 + 1]);
5434 	  aarch64_set_vec_u16 (cpu, vd, i + range,
5435 			       copy_vm.h[i * 2] + copy_vm.h[i * 2 + 1]);
5436 	}
5437       return;
5438 
5439     case 2:
5440       range = full ? 2 : 1;
5441       for (i = 0; i < range; i++)
5442 	{
5443 	  aarch64_set_vec_u32 (cpu, vd, i,
5444 			       copy_vn.w[i * 2] + copy_vn.w[i * 2 + 1]);
5445 	  aarch64_set_vec_u32 (cpu, vd, i + range,
5446 			       copy_vm.w[i * 2] + copy_vm.w[i * 2 + 1]);
5447 	}
5448       return;
5449 
5450     case 3:
5451       if (! full)
5452 	HALT_UNALLOC;
5453       aarch64_set_vec_u64 (cpu, vd, 0, copy_vn.v[0] + copy_vn.v[1]);
5454       aarch64_set_vec_u64 (cpu, vd, 1, copy_vm.v[0] + copy_vm.v[1]);
5455       return;
5456     }
5457 }
5458 
5459 /* Floating-point vector convert to higher precision.  */
5460 static void
5461 do_vec_FCVTL (sim_cpu *cpu)
5462 {
5463   /* instr[31]    = 0
5464      instr[30]    = half (0) / all (1)
5465      instr[29,23] = 00 1110 0
5466      instr[22]    = single (0) / double (1)
5467      instr[21,10] = 10 0001 0111 10
5468      instr[9,5]   = Rn
5469      instr[4,0]   = Rd.  */
5470 
5471   unsigned rn = INSTR (9, 5);
5472   unsigned rd = INSTR (4, 0);
5473   unsigned full = INSTR (30, 30);
5474   unsigned i;
5475 
5476   NYI_assert (31, 31, 0);
5477   NYI_assert (29, 23, 0x1C);
5478   NYI_assert (21, 10, 0x85E);
5479 
5480   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (22, 22))
    {
      /* Read the source elements before writing any results, in case
	 rd == rn.  */
      float tmp[2];

      for (i = 0; i < 2; i++)
	tmp[i] = aarch64_get_vec_float (cpu, rn, i + 2 * full);
      for (i = 0; i < 2; i++)
	aarch64_set_vec_double (cpu, rd, i, tmp[i]);
    }
5487   else
5488     {
5489       HALT_NYI;
5490 
5491 #if 0
5492       /* TODO: Implement missing half-float support.  */
5493       for (i = 0; i < 4; i++)
5494 	aarch64_set_vec_float (cpu, rd, i,
5495 			     aarch64_get_vec_halffloat (cpu, rn, i + 4*full));
5496 #endif
5497     }
5498 }
5499 
5500 static void
5501 do_vec_FABS (sim_cpu *cpu)
5502 {
5503   /* instr[31]    = 0
5504      instr[30]    = half(0)/full(1)
5505      instr[29,23] = 00 1110 1
5506      instr[22]    = float(0)/double(1)
5507      instr[21,16] = 10 0000
5508      instr[15,10] = 1111 10
5509      instr[9,5]   = Vn
5510      instr[4,0]   = Vd.  */
5511 
5512   unsigned vn = INSTR (9, 5);
5513   unsigned vd = INSTR (4, 0);
5514   unsigned full = INSTR (30, 30);
5515   unsigned i;
5516 
5517   NYI_assert (29, 23, 0x1D);
5518   NYI_assert (21, 10, 0x83E);
5519 
5520   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5521   if (INSTR (22, 22))
5522     {
5523       if (! full)
5524 	HALT_NYI;
5525 
5526       for (i = 0; i < 2; i++)
5527 	aarch64_set_vec_double (cpu, vd, i,
5528 				fabs (aarch64_get_vec_double (cpu, vn, i)));
5529     }
5530   else
5531     {
5532       for (i = 0; i < (full ? 4 : 2); i++)
5533 	aarch64_set_vec_float (cpu, vd, i,
5534 			       fabsf (aarch64_get_vec_float (cpu, vn, i)));
5535     }
5536 }
5537 
5538 static void
5539 do_vec_FCVTZS (sim_cpu *cpu)
5540 {
5541   /* instr[31]    = 0
5542      instr[30]    = half (0) / all (1)
5543      instr[29,23] = 00 1110 1
5544      instr[22]    = single (0) / double (1)
5545      instr[21,10] = 10 0001 1011 10
5546      instr[9,5]   = Rn
5547      instr[4,0]   = Rd.  */
5548 
5549   unsigned rn = INSTR (9, 5);
5550   unsigned rd = INSTR (4, 0);
5551   unsigned full = INSTR (30, 30);
5552   unsigned i;
5553 
5554   NYI_assert (31, 31, 0);
5555   NYI_assert (29, 23, 0x1D);
5556   NYI_assert (21, 10, 0x86E);
5557 
5558   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
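       /* The C float-to-integer cast truncates toward zero, matching the
	  FCVTZS rounding mode.  */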
5559   if (INSTR (22, 22))
5560     {
5561       if (! full)
5562 	HALT_UNALLOC;
5563 
5564       for (i = 0; i < 2; i++)
5565 	aarch64_set_vec_s64 (cpu, rd, i,
5566 			     (int64_t) aarch64_get_vec_double (cpu, rn, i));
5567     }
5568   else
5569     for (i = 0; i < (full ? 4 : 2); i++)
5570       aarch64_set_vec_s32 (cpu, rd, i,
5571 			   (int32_t) aarch64_get_vec_float (cpu, rn, i));
5572 }
5573 
5574 static void
5575 do_vec_REV64 (sim_cpu *cpu)
5576 {
5577   /* instr[31]    = 0
5578      instr[30]    = full/half
5579      instr[29,24] = 00 1110
5580      instr[23,22] = size
5581      instr[21,10] = 10 0000 0000 10
5582      instr[9,5]   = Rn
5583      instr[4,0]   = Rd.  */
5584 
5585   unsigned rn = INSTR (9, 5);
5586   unsigned rd = INSTR (4, 0);
5587   unsigned size = INSTR (23, 22);
5588   unsigned full = INSTR (30, 30);
5589   unsigned i;
5590   FRegister val;
5591 
5592   NYI_assert (29, 24, 0x0E);
5593   NYI_assert (21, 10, 0x802);
5594 
5595   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
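       /* XOR-ing the element index with (elements-per-64-bit-group - 1)
	  reverses the element order within each 64-bit group.  */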
5596   switch (size)
5597     {
5598     case 0:
5599       for (i = 0; i < (full ? 16 : 8); i++)
5600 	val.b[i ^ 0x7] = aarch64_get_vec_u8 (cpu, rn, i);
5601       break;
5602 
5603     case 1:
5604       for (i = 0; i < (full ? 8 : 4); i++)
5605 	val.h[i ^ 0x3] = aarch64_get_vec_u16 (cpu, rn, i);
5606       break;
5607 
5608     case 2:
5609       for (i = 0; i < (full ? 4 : 2); i++)
5610 	val.w[i ^ 0x1] = aarch64_get_vec_u32 (cpu, rn, i);
5611       break;
5612 
5613     case 3:
5614       HALT_UNALLOC;
5615     }
5616 
5617   aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
5618   if (full)
5619     aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
5620 }
5621 
5622 static void
5623 do_vec_REV16 (sim_cpu *cpu)
5624 {
5625   /* instr[31]    = 0
5626      instr[30]    = full/half
5627      instr[29,24] = 00 1110
5628      instr[23,22] = size
5629      instr[21,10] = 10 0000 0001 10
5630      instr[9,5]   = Rn
5631      instr[4,0]   = Rd.  */
5632 
5633   unsigned rn = INSTR (9, 5);
5634   unsigned rd = INSTR (4, 0);
5635   unsigned size = INSTR (23, 22);
5636   unsigned full = INSTR (30, 30);
5637   unsigned i;
5638   FRegister val;
5639 
5640   NYI_assert (29, 24, 0x0E);
5641   NYI_assert (21, 10, 0x806);
5642 
5643   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
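       /* XOR-ing the byte index with 1 swaps adjacent bytes, reversing
	  the byte order within each 16-bit element.  */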
5644   switch (size)
5645     {
5646     case 0:
5647       for (i = 0; i < (full ? 16 : 8); i++)
5648 	val.b[i ^ 0x1] = aarch64_get_vec_u8 (cpu, rn, i);
5649       break;
5650 
5651     default:
5652       HALT_UNALLOC;
5653     }
5654 
5655   aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
5656   if (full)
5657     aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
5658 }
5659 
5660 static void
5661 do_vec_op1 (sim_cpu *cpu)
5662 {
5663   /* instr[31]    = 0
5664      instr[30]    = half/full
5665      instr[29,24] = 00 1110
5666      instr[23,21] = ???
5667      instr[20,16] = Vm
5668      instr[15,10] = sub-opcode
5669      instr[9,5]   = Vn
5670      instr[4,0]   = Vd  */
5671   NYI_assert (29, 24, 0x0E);
5672 
5673   if (INSTR (21, 21) == 0)
5674     {
5675       if (INSTR (23, 22) == 0)
5676 	{
5677 	  if (INSTR (30, 30) == 1
5678 	      && INSTR (17, 14) == 0
5679 	      && INSTR (12, 10) == 7)
5680 	    return do_vec_ins_2 (cpu);
5681 
5682 	  switch (INSTR (15, 10))
5683 	    {
5684 	    case 0x01: do_vec_DUP_vector_into_vector (cpu); return;
5685 	    case 0x03: do_vec_DUP_scalar_into_vector (cpu); return;
5686 	    case 0x07: do_vec_INS (cpu); return;
5687 	    case 0x0B: do_vec_SMOV_into_scalar (cpu); return;
5688 	    case 0x0F: do_vec_UMOV_into_scalar (cpu); return;
5689 
5690 	    case 0x00:
5691 	    case 0x08:
5692 	    case 0x10:
5693 	    case 0x18:
5694 	      do_vec_TBL (cpu); return;
5695 
5696 	    case 0x06:
5697 	    case 0x16:
5698 	      do_vec_UZP (cpu); return;
5699 
5700 	    case 0x0A: do_vec_TRN (cpu); return;
5701 
5702 	    case 0x0E:
5703 	    case 0x1E:
5704 	      do_vec_ZIP (cpu); return;
5705 
5706 	    default:
5707 	      HALT_NYI;
5708 	    }
5709 	}
5710 
5711       switch (INSTR (13, 10))
5712 	{
5713 	case 0x6: do_vec_UZP (cpu); return;
5714 	case 0xE: do_vec_ZIP (cpu); return;
5715 	case 0xA: do_vec_TRN (cpu); return;
5716 	default:  HALT_NYI;
5717 	}
5718     }
5719 
5720   switch (INSTR (15, 10))
5721     {
5722     case 0x02: do_vec_REV64 (cpu); return;
5723     case 0x06: do_vec_REV16 (cpu); return;
5724 
5725     case 0x07:
5726       switch (INSTR (23, 21))
5727 	{
5728 	case 1: do_vec_AND (cpu); return;
5729 	case 3: do_vec_BIC (cpu); return;
5730 	case 5: do_vec_ORR (cpu); return;
5731 	case 7: do_vec_ORN (cpu); return;
5732 	default: HALT_NYI;
5733 	}
5734 
5735     case 0x08: do_vec_sub_long (cpu); return;
5736     case 0x0a: do_vec_XTN (cpu); return;
5737     case 0x11: do_vec_SSHL (cpu); return;
5738     case 0x16: do_vec_CNT (cpu); return;
5739     case 0x19: do_vec_max (cpu); return;
5740     case 0x1B: do_vec_min (cpu); return;
5741     case 0x21: do_vec_add (cpu); return;
5742     case 0x25: do_vec_MLA (cpu); return;
5743     case 0x27: do_vec_mul (cpu); return;
5744     case 0x2F: do_vec_ADDP (cpu); return;
5745     case 0x30: do_vec_mull (cpu); return;
5746     case 0x33: do_vec_FMLA (cpu); return;
5747     case 0x35: do_vec_fadd (cpu); return;
5748 
5749     case 0x1E:
5750       switch (INSTR (20, 16))
5751 	{
5752 	case 0x01: do_vec_FCVTL (cpu); return;
5753 	default: HALT_NYI;
5754 	}
5755 
5756     case 0x2E:
5757       switch (INSTR (20, 16))
5758 	{
5759 	case 0x00: do_vec_ABS (cpu); return;
5760 	case 0x01: do_vec_FCVTZS (cpu); return;
5761 	case 0x11: do_vec_ADDV (cpu); return;
5762 	default: HALT_NYI;
5763 	}
5764 
5765     case 0x31:
5766     case 0x3B:
5767       do_vec_Fminmax (cpu); return;
5768 
5769     case 0x0D:
5770     case 0x0F:
5771     case 0x22:
5772     case 0x23:
5773     case 0x26:
5774     case 0x2A:
5775     case 0x32:
5776     case 0x36:
5777     case 0x39:
5778     case 0x3A:
5779       do_vec_compare (cpu); return;
5780 
5781     case 0x3E:
5782       do_vec_FABS (cpu); return;
5783 
5784     default:
5785       HALT_NYI;
5786     }
5787 }
5788 
5789 static void
5790 do_vec_xtl (sim_cpu *cpu)
5791 {
5792   /* instr[31]    = 0
5793      instr[30,29] = SXTL (00), UXTL (01), SXTL2 (10), UXTL2 (11)
5794      instr[28,22] = 0 1111 00
5795      instr[21,16] = size & shift (USHLL, SSHLL, USHLL2, SSHLL2)
5796      instr[15,10] = 1010 01
5797      instr[9,5]   = V source
5798      instr[4,0]   = V dest.  */
5799 
5800   unsigned vs = INSTR (9, 5);
5801   unsigned vd = INSTR (4, 0);
5802   unsigned i, shift, bias = 0;
5803 
5804   NYI_assert (28, 22, 0x3C);
5805   NYI_assert (15, 10, 0x29);
5806 
5807   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5808   switch (INSTR (30, 29))
5809     {
5810     case 2: /* SXTL2, SSHLL2.  */
5811       bias = 2;
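           /* Fall through.  */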
5812     case 0: /* SXTL, SSHLL.  */
5813       if (INSTR (21, 21))
5814 	{
5815 	  int64_t val1, val2;
5816 
5817 	  shift = INSTR (20, 16);
5818 	  /* Get the source values before setting the destination values
5819 	     in case the source and destination are the same.  */
5820 	  val1 = (int64_t) aarch64_get_vec_s32 (cpu, vs, bias) << shift;
5821 	  val2 = (int64_t) aarch64_get_vec_s32 (cpu, vs, bias + 1) << shift;
5822 	  aarch64_set_vec_s64 (cpu, vd, 0, val1);
5823 	  aarch64_set_vec_s64 (cpu, vd, 1, val2);
5824 	}
5825       else if (INSTR (20, 20))
5826 	{
5827 	  int32_t v[4];
5829 
5830 	  shift = INSTR (19, 16);
5831 	  bias *= 2;
5832 	  for (i = 0; i < 4; i++)
5833 	    v[i] = aarch64_get_vec_s16 (cpu, vs, bias + i) << shift;
5834 	  for (i = 0; i < 4; i++)
5835 	    aarch64_set_vec_s32 (cpu, vd, i, v[i]);
5836 	}
5837       else
5838 	{
5839 	  int16_t v[8];
5840 	  NYI_assert (19, 19, 1);
5841 
5842 	  shift = INSTR (18, 16);
5843 	  bias *= 4;
5844 	  for (i = 0; i < 8; i++)
5845 	    v[i] = aarch64_get_vec_s8 (cpu, vs, i + bias) << shift;
5846 	  for (i = 0; i < 8; i++)
5847 	    aarch64_set_vec_s16 (cpu, vd, i, v[i]);
5848 	}
5849       return;
5850 
5851     case 3: /* UXTL2, USHLL2.  */
5852       bias = 2;
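           /* Fall through.  */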
5853     case 1: /* UXTL, USHLL.  */
5854       if (INSTR (21, 21))
5855 	{
5856 	  uint64_t v1, v2;
5857 	  shift = INSTR (20, 16);
5858 	  v1 = (uint64_t) aarch64_get_vec_u32 (cpu, vs, bias) << shift;
5859 	  v2 = (uint64_t) aarch64_get_vec_u32 (cpu, vs, bias + 1) << shift;
5860 	  aarch64_set_vec_u64 (cpu, vd, 0, v1);
5861 	  aarch64_set_vec_u64 (cpu, vd, 1, v2);
5862 	}
5863       else if (INSTR (20, 20))
5864 	{
5865 	  uint32_t v[4];
5866 	  shift = INSTR (19, 16);
5867 	  bias *= 2;
5868 	  for (i = 0; i < 4; i++)
5869 	    v[i] = aarch64_get_vec_u16 (cpu, vs, i + bias) << shift;
5870 	  for (i = 0; i < 4; i++)
5871 	    aarch64_set_vec_u32 (cpu, vd, i, v[i]);
5872 	}
5873       else
5874 	{
5875 	  uint16_t v[8];
5876 	  NYI_assert (19, 19, 1);
5877 
5878 	  shift = INSTR (18, 16);
5879 	  bias *= 4;
5880 	  for (i = 0; i < 8; i++)
5881 	    v[i] = aarch64_get_vec_u8 (cpu, vs, i + bias) << shift;
5882 	  for (i = 0; i < 8; i++)
5883 	    aarch64_set_vec_u16 (cpu, vd, i, v[i]);
5884 	}
5885       return;
5886     }
5887 }
5888 
5889 static void
5890 do_vec_SHL (sim_cpu *cpu)
5891 {
5892   /* instr [31]    = 0
5893      instr [30]    = half(0)/full(1)
5894      instr [29,23] = 001 1110
5895      instr [22,16] = size and shift amount
5896      instr [15,10] = 01 0101
5897      instr [9, 5]  = Vs
5898      instr [4, 0]  = Vd.  */
5899 
5900   int shift;
5901   int full    = INSTR (30, 30);
5902   unsigned vs = INSTR (9, 5);
5903   unsigned vd = INSTR (4, 0);
5904   unsigned i;
5905 
5906   NYI_assert (29, 23, 0x1E);
5907   NYI_assert (15, 10, 0x15);
5908 
5909   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
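       /* The element size is selected by the position of the most
	  significant set bit in instr[22,19]; the bits below it, down to
	  instr[16], give the left-shift amount.  */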
5910   if (INSTR (22, 22))
5911     {
5912       shift = INSTR (21, 16);
5913 
5914       if (full == 0)
5915 	HALT_UNALLOC;
5916 
5917       for (i = 0; i < 2; i++)
5918 	{
5919 	  uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
5920 	  aarch64_set_vec_u64 (cpu, vd, i, val << shift);
5921 	}
5922 
5923       return;
5924     }
5925 
5926   if (INSTR (21, 21))
5927     {
5928       shift = INSTR (20, 16);
5929 
5930       for (i = 0; i < (full ? 4 : 2); i++)
5931 	{
5932 	  uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
5933 	  aarch64_set_vec_u32 (cpu, vd, i, val << shift);
5934 	}
5935 
5936       return;
5937     }
5938 
5939   if (INSTR (20, 20))
5940     {
5941       shift = INSTR (19, 16);
5942 
5943       for (i = 0; i < (full ? 8 : 4); i++)
5944 	{
5945 	  uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
5946 	  aarch64_set_vec_u16 (cpu, vd, i, val << shift);
5947 	}
5948 
5949       return;
5950     }
5951 
5952   if (INSTR (19, 19) == 0)
5953     HALT_UNALLOC;
5954 
5955   shift = INSTR (18, 16);
5956 
5957   for (i = 0; i < (full ? 16 : 8); i++)
5958     {
5959       uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
5960       aarch64_set_vec_u8 (cpu, vd, i, val << shift);
5961     }
5962 }
5963 
5964 static void
5965 do_vec_SSHR_USHR (sim_cpu *cpu)
5966 {
5967   /* instr [31]    = 0
5968      instr [30]    = half(0)/full(1)
5969      instr [29]    = signed(0)/unsigned(1)
5970      instr [28,23] = 0 1111 0
5971      instr [22,16] = size and shift amount
5972      instr [15,10] = 0000 01
5973      instr [9, 5]  = Vs
5974      instr [4, 0]  = Vd.  */
5975 
5976   int full       = INSTR (30, 30);
5977   int sign       = ! INSTR (29, 29);
5978   unsigned shift = INSTR (22, 16);
5979   unsigned vs    = INSTR (9, 5);
5980   unsigned vd    = INSTR (4, 0);
5981   unsigned i;
5982 
5983   NYI_assert (28, 23, 0x1E);
5984   NYI_assert (15, 10, 0x01);
5985 
5986   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
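       /* For right shifts instr[22,16] holds (2 * element-size) - shift;
	  the leading set bit selects the element size and the true shift
	  amount is recovered below.  */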
5987   if (INSTR (22, 22))
5988     {
5989       shift = 128 - shift;
5990 
5991       if (full == 0)
5992 	HALT_UNALLOC;
5993 
5994       if (sign)
5995 	for (i = 0; i < 2; i++)
5996 	  {
5997 	    int64_t val = aarch64_get_vec_s64 (cpu, vs, i);
5998 	    aarch64_set_vec_s64 (cpu, vd, i, val >> shift);
5999 	  }
6000       else
6001 	for (i = 0; i < 2; i++)
6002 	  {
6003 	    uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
6004 	    aarch64_set_vec_u64 (cpu, vd, i, val >> shift);
6005 	  }
6006 
6007       return;
6008     }
6009 
6010   if (INSTR (21, 21))
6011     {
6012       shift = 64 - shift;
6013 
6014       if (sign)
6015 	for (i = 0; i < (full ? 4 : 2); i++)
6016 	  {
6017 	    int32_t val = aarch64_get_vec_s32 (cpu, vs, i);
6018 	    aarch64_set_vec_s32 (cpu, vd, i, val >> shift);
6019 	  }
6020       else
6021 	for (i = 0; i < (full ? 4 : 2); i++)
6022 	  {
6023 	    uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
6024 	    aarch64_set_vec_u32 (cpu, vd, i, val >> shift);
6025 	  }
6026 
6027       return;
6028     }
6029 
6030   if (INSTR (20, 20))
6031     {
6032       shift = 32 - shift;
6033 
6034       if (sign)
6035 	for (i = 0; i < (full ? 8 : 4); i++)
6036 	  {
6037 	    int16_t val = aarch64_get_vec_s16 (cpu, vs, i);
6038 	    aarch64_set_vec_s16 (cpu, vd, i, val >> shift);
6039 	  }
6040       else
6041 	for (i = 0; i < (full ? 8 : 4); i++)
6042 	  {
6043 	    uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
6044 	    aarch64_set_vec_u16 (cpu, vd, i, val >> shift);
6045 	  }
6046 
6047       return;
6048     }
6049 
6050   if (INSTR (19, 19) == 0)
6051     HALT_UNALLOC;
6052 
6053   shift = 16 - shift;
6054 
6055   if (sign)
6056     for (i = 0; i < (full ? 16 : 8); i++)
6057       {
6058 	int8_t val = aarch64_get_vec_s8 (cpu, vs, i);
6059 	aarch64_set_vec_s8 (cpu, vd, i, val >> shift);
6060       }
6061   else
6062     for (i = 0; i < (full ? 16 : 8); i++)
6063       {
6064 	uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
6065 	aarch64_set_vec_u8 (cpu, vd, i, val >> shift);
6066       }
6067 }
6068 
6069 static void
6070 do_vec_MUL_by_element (sim_cpu *cpu)
6071 {
6072   /* instr[31]    = 0
6073      instr[30]    = half/full
6074      instr[29,24] = 00 1111
6075      instr[23,22] = size
6076      instr[21]    = L
6077      instr[20]    = M
6078      instr[19,16] = m
6079      instr[15,12] = 1000
6080      instr[11]    = H
6081      instr[10]    = 0
6082      instr[9,5]   = Vn
6083      instr[4,0]   = Vd  */
6084 
6085   unsigned full     = INSTR (30, 30);
6086   unsigned L        = INSTR (21, 21);
6087   unsigned H        = INSTR (11, 11);
6088   unsigned vn       = INSTR (9, 5);
6089   unsigned vd       = INSTR (4, 0);
6090   unsigned size     = INSTR (23, 22);
6091   unsigned index;
6092   unsigned vm;
6093   unsigned e;
6094 
6095   NYI_assert (29, 24, 0x0F);
6096   NYI_assert (15, 12, 0x8);
6097   NYI_assert (10, 10, 0);
6098 
6099   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
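       /* The index of the element of Vm to multiply by is formed from
	  the H, L and M bits; 16-bit elements can only use V0-V15 as Vm.  */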
6100   switch (size)
6101     {
6102     case 1:
6103       {
6104 	/* 16 bit products.  */
6105 	uint16_t product;
6106 	uint16_t element1;
6107 	uint16_t element2;
6108 
6109 	index = (H << 2) | (L << 1) | INSTR (20, 20);
6110 	vm = INSTR (19, 16);
6111 	element2 = aarch64_get_vec_u16 (cpu, vm, index);
6112 
6113 	for (e = 0; e < (full ? 8 : 4); e ++)
6114 	  {
6115 	    element1 = aarch64_get_vec_u16 (cpu, vn, e);
6116 	    product  = element1 * element2;
6117 	    aarch64_set_vec_u16 (cpu, vd, e, product);
6118 	  }
6119       }
6120       break;
6121 
6122     case 2:
6123       {
6124 	/* 32 bit products.  */
6125 	uint32_t product;
6126 	uint32_t element1;
6127 	uint32_t element2;
6128 
6129 	index = (H << 1) | L;
6130 	vm = INSTR (20, 16);
6131 	element2 = aarch64_get_vec_u32 (cpu, vm, index);
6132 
6133 	for (e = 0; e < (full ? 4 : 2); e ++)
6134 	  {
6135 	    element1 = aarch64_get_vec_u32 (cpu, vn, e);
6136 	    product  = element1 * element2;
6137 	    aarch64_set_vec_u32 (cpu, vd, e, product);
6138 	  }
6139       }
6140       break;
6141 
6142     default:
6143       HALT_UNALLOC;
6144     }
6145 }
6146 
6147 static void
6148 do_FMLA_by_element (sim_cpu *cpu)
6149 {
6150   /* instr[31]    = 0
6151      instr[30]    = half/full
6152      instr[29,23] = 00 1111 1
6153      instr[22]    = size
6154      instr[21]    = L
6155      instr[20,16] = m
6156      instr[15,12] = 0001
6157      instr[11]    = H
6158      instr[10]    = 0
6159      instr[9,5]   = Vn
6160      instr[4,0]   = Vd  */
6161 
6162   unsigned full     = INSTR (30, 30);
6163   unsigned size     = INSTR (22, 22);
6164   unsigned L        = INSTR (21, 21);
6165   unsigned vm       = INSTR (20, 16);
6166   unsigned H        = INSTR (11, 11);
6167   unsigned vn       = INSTR (9, 5);
6168   unsigned vd       = INSTR (4, 0);
6169   unsigned e;
6170 
6171   NYI_assert (29, 23, 0x1F);
6172   NYI_assert (15, 12, 0x1);
6173   NYI_assert (10, 10, 0);
6174 
6175   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
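       /* For doubles the element index is just H and L must be zero;
	  for floats the index is H:L.  */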
6176   if (size)
6177     {
6178       double element1, element2;
6179 
6180       if (! full || L)
6181 	HALT_UNALLOC;
6182 
6183       element2 = aarch64_get_vec_double (cpu, vm, H);
6184 
6185       for (e = 0; e < 2; e++)
6186 	{
6187 	  element1 = aarch64_get_vec_double (cpu, vn, e);
6188 	  element1 *= element2;
6189 	  element1 += aarch64_get_vec_double (cpu, vd, e);
6190 	  aarch64_set_vec_double (cpu, vd, e, element1);
6191 	}
6192     }
6193   else
6194     {
6195       float element1;
6196       float element2 = aarch64_get_vec_float (cpu, vm, (H << 1) | L);
6197 
6198       for (e = 0; e < (full ? 4 : 2); e++)
6199 	{
6200 	  element1 = aarch64_get_vec_float (cpu, vn, e);
6201 	  element1 *= element2;
6202 	  element1 += aarch64_get_vec_float (cpu, vd, e);
6203 	  aarch64_set_vec_float (cpu, vd, e, element1);
6204 	}
6205     }
6206 }
6207 
6208 static void
6209 do_vec_op2 (sim_cpu *cpu)
6210 {
6211   /* instr[31]    = 0
6212      instr[30]    = half/full
6213      instr[29,24] = 00 1111
6214      instr[23]    = ?
6215      instr[22,16] = element size & index
6216      instr[15,10] = sub-opcode
6217      instr[9,5]   = Vn
6218      instr[4,0]   = Vd  */
6219 
6220   NYI_assert (29, 24, 0x0F);
6221 
6222   if (INSTR (23, 23) != 0)
6223     {
6224       switch (INSTR (15, 10))
6225 	{
6226 	case 0x04:
6227 	case 0x06:
6228 	  do_FMLA_by_element (cpu);
6229 	  return;
6230 
6231 	case 0x20:
6232 	case 0x22:
6233 	  do_vec_MUL_by_element (cpu);
6234 	  return;
6235 
6236 	default:
6237 	  HALT_NYI;
6238 	}
6239     }
6240   else
6241     {
6242       switch (INSTR (15, 10))
6243 	{
6244 	case 0x01: do_vec_SSHR_USHR (cpu); return;
6245 	case 0x15: do_vec_SHL (cpu); return;
6246 	case 0x20:
6247 	case 0x22: do_vec_MUL_by_element (cpu); return;
6248 	case 0x29: do_vec_xtl (cpu); return;
6249 	default:   HALT_NYI;
6250 	}
6251     }
6252 }
6253 
6254 static void
6255 do_vec_neg (sim_cpu *cpu)
6256 {
6257   /* instr[31]    = 0
6258      instr[30]    = full(1)/half(0)
6259      instr[29,24] = 10 1110
6260      instr[23,22] = size: byte(00), half (01), word (10), long (11)
6261      instr[21,10] = 1000 0010 1110
6262      instr[9,5]   = Vs
6263      instr[4,0]   = Vd  */
6264 
6265   int    full = INSTR (30, 30);
6266   unsigned vs = INSTR (9, 5);
6267   unsigned vd = INSTR (4, 0);
6268   unsigned i;
6269 
6270   NYI_assert (29, 24, 0x2E);
6271   NYI_assert (21, 10, 0x82E);
6272 
6273   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6274   switch (INSTR (23, 22))
6275     {
6276     case 0:
6277       for (i = 0; i < (full ? 16 : 8); i++)
6278 	aarch64_set_vec_s8 (cpu, vd, i, - aarch64_get_vec_s8 (cpu, vs, i));
6279       return;
6280 
6281     case 1:
6282       for (i = 0; i < (full ? 8 : 4); i++)
6283 	aarch64_set_vec_s16 (cpu, vd, i, - aarch64_get_vec_s16 (cpu, vs, i));
6284       return;
6285 
6286     case 2:
6287       for (i = 0; i < (full ? 4 : 2); i++)
6288 	aarch64_set_vec_s32 (cpu, vd, i, - aarch64_get_vec_s32 (cpu, vs, i));
6289       return;
6290 
6291     case 3:
6292       if (! full)
6293 	HALT_NYI;
6294       for (i = 0; i < 2; i++)
6295 	aarch64_set_vec_s64 (cpu, vd, i, - aarch64_get_vec_s64 (cpu, vs, i));
6296       return;
6297     }
6298 }
6299 
6300 static void
6301 do_vec_sqrt (sim_cpu *cpu)
6302 {
6303   /* instr[31]    = 0
6304      instr[30]    = full(1)/half(0)
6305      instr[29,23] = 101 1101
6306      instr[22]    = single(0)/double(1)
6307      instr[21,10] = 1000 0111 1110
6308      instr[9,5]   = Vs
6309      instr[4,0]   = Vd.  */
6310 
6311   int    full = INSTR (30, 30);
6312   unsigned vs = INSTR (9, 5);
6313   unsigned vd = INSTR (4, 0);
6314   unsigned i;
6315 
6316   NYI_assert (29, 23, 0x5D);
6317   NYI_assert (21, 10, 0x87E);
6318 
6319   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6320   if (INSTR (22, 22) == 0)
6321     for (i = 0; i < (full ? 4 : 2); i++)
6322       aarch64_set_vec_float (cpu, vd, i,
6323 			     sqrtf (aarch64_get_vec_float (cpu, vs, i)));
6324   else
6325     for (i = 0; i < 2; i++)
6326       aarch64_set_vec_double (cpu, vd, i,
6327 			      sqrt (aarch64_get_vec_double (cpu, vs, i)));
6328 }
6329 
6330 static void
6331 do_vec_mls_indexed (sim_cpu *cpu)
6332 {
6333   /* instr[31]       = 0
6334      instr[30]       = half(0)/full(1)
6335      instr[29,24]    = 10 1111
6336      instr[23,22]    = 16-bit(01)/32-bit(10)
6337      instr[21,20],instr[11] = index (if 16-bit)
6338      instr[21],instr[11]    = index (if 32-bit)
6339      instr[20,16]    = Vm
6340      instr[15,12]    = 0100
6341      instr[11]       = part of index
6342      instr[10]       = 0
6343      instr[9,5]      = Vs
6344      instr[4,0]      = Vd.  */
6345 
6346   int    full = INSTR (30, 30);
6347   unsigned vs = INSTR (9, 5);
6348   unsigned vd = INSTR (4, 0);
6349   unsigned vm = INSTR (20, 16);
6350   unsigned i;
6351 
6352   NYI_assert (15, 12, 4);
6353   NYI_assert (10, 10, 0);
6354 
6355   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6356   switch (INSTR (23, 22))
6357     {
6358     case 1:
6359       {
6360 	unsigned elem;
6361 	uint32_t val;
6362 
6363 	if (vm > 15)
6364 	  HALT_NYI;
6365 
6366 	elem = (INSTR (21, 20) << 1) | INSTR (11, 11);
6367 	val = aarch64_get_vec_u16 (cpu, vm, elem);
6368 
6369 	for (i = 0; i < (full ? 8 : 4); i++)
6370 	  aarch64_set_vec_u16 (cpu, vd, i,
6371 			       aarch64_get_vec_u16 (cpu, vd, i) -
6372 			       (aarch64_get_vec_u16 (cpu, vs, i) * val));
6373 	return;
6374       }
6375 
6376     case 2:
6377       {
6378 	unsigned elem = (INSTR (21, 21) << 1) | INSTR (11, 11);
6379 	uint32_t val = aarch64_get_vec_u32 (cpu, vm, elem);
6380 
6381 	for (i = 0; i < (full ? 4 : 2); i++)
6382 	  aarch64_set_vec_u32 (cpu, vd, i,
6383 			       aarch64_get_vec_u32 (cpu, vd, i) -
6384 			       (aarch64_get_vec_u32 (cpu, vs, i) * val));
6385 	return;
6386       }
6387 
6388     case 0:
6389     case 3:
6390     default:
6391       HALT_NYI;
6392     }
6393 }
6394 
6395 static void
6396 do_vec_SUB (sim_cpu *cpu)
6397 {
6398   /* instr [31]    = 0
6399      instr [30]    = half(0)/full(1)
6400      instr [29,24] = 10 1110
6401      instr [23,22] = size: byte(00), half(01), word (10), long (11)
6402      instr [21]    = 1
6403      instr [20,16] = Vm
6404      instr [15,10] = 10 0001
6405      instr [9, 5]  = Vn
6406      instr [4, 0]  = Vd.  */
6407 
6408   unsigned full = INSTR (30, 30);
6409   unsigned vm = INSTR (20, 16);
6410   unsigned vn = INSTR (9, 5);
6411   unsigned vd = INSTR (4, 0);
6412   unsigned i;
6413 
6414   NYI_assert (29, 24, 0x2E);
6415   NYI_assert (21, 21, 1);
6416   NYI_assert (15, 10, 0x21);
6417 
6418   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6419   switch (INSTR (23, 22))
6420     {
6421     case 0:
6422       for (i = 0; i < (full ? 16 : 8); i++)
6423 	aarch64_set_vec_s8 (cpu, vd, i,
6424 			    aarch64_get_vec_s8 (cpu, vn, i)
6425 			    - aarch64_get_vec_s8 (cpu, vm, i));
6426       return;
6427 
6428     case 1:
6429       for (i = 0; i < (full ? 8 : 4); i++)
6430 	aarch64_set_vec_s16 (cpu, vd, i,
6431 			     aarch64_get_vec_s16 (cpu, vn, i)
6432 			     - aarch64_get_vec_s16 (cpu, vm, i));
6433       return;
6434 
6435     case 2:
6436       for (i = 0; i < (full ? 4 : 2); i++)
6437 	aarch64_set_vec_s32 (cpu, vd, i,
6438 			     aarch64_get_vec_s32 (cpu, vn, i)
6439 			     - aarch64_get_vec_s32 (cpu, vm, i));
6440       return;
6441 
6442     case 3:
6443       if (full == 0)
6444 	HALT_UNALLOC;
6445 
6446       for (i = 0; i < 2; i++)
6447 	aarch64_set_vec_s64 (cpu, vd, i,
6448 			     aarch64_get_vec_s64 (cpu, vn, i)
6449 			     - aarch64_get_vec_s64 (cpu, vm, i));
6450       return;
6451     }
6452 }
6453 
6454 static void
6455 do_vec_MLS (sim_cpu *cpu)
6456 {
6457   /* instr [31]    = 0
6458      instr [30]    = half(0)/full(1)
6459      instr [29,24] = 10 1110
6460      instr [23,22] = size: byte(00), half(01), word (10)
6461      instr [21]    = 1
6462      instr [20,16] = Vm
6463      instr [15,10] = 10 0101
6464      instr [9, 5]  = Vn
6465      instr [4, 0]  = Vd.  */
6466 
6467   unsigned full = INSTR (30, 30);
6468   unsigned vm = INSTR (20, 16);
6469   unsigned vn = INSTR (9, 5);
6470   unsigned vd = INSTR (4, 0);
6471   unsigned i;
6472 
6473   NYI_assert (29, 24, 0x2E);
6474   NYI_assert (21, 21, 1);
6475   NYI_assert (15, 10, 0x25);
6476 
6477   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6478   switch (INSTR (23, 22))
6479     {
6480     case 0:
6481       for (i = 0; i < (full ? 16 : 8); i++)
6482 	aarch64_set_vec_u8 (cpu, vd, i,
6483 			    aarch64_get_vec_u8 (cpu, vd, i)
6484 			    - (aarch64_get_vec_u8 (cpu, vn, i)
6485 			       * aarch64_get_vec_u8 (cpu, vm, i)));
6486       return;
6487 
6488     case 1:
6489       for (i = 0; i < (full ? 8 : 4); i++)
6490 	aarch64_set_vec_u16 (cpu, vd, i,
6491 			     aarch64_get_vec_u16 (cpu, vd, i)
6492 			     - (aarch64_get_vec_u16 (cpu, vn, i)
6493 				* aarch64_get_vec_u16 (cpu, vm, i)));
6494       return;
6495 
6496     case 2:
6497       for (i = 0; i < (full ? 4 : 2); i++)
6498 	aarch64_set_vec_u32 (cpu, vd, i,
6499 			     aarch64_get_vec_u32 (cpu, vd, i)
6500 			     - (aarch64_get_vec_u32 (cpu, vn, i)
6501 				* aarch64_get_vec_u32 (cpu, vm, i)));
6502       return;
6503 
6504     default:
6505       HALT_UNALLOC;
6506     }
6507 }
6508 
6509 static void
6510 do_vec_FDIV (sim_cpu *cpu)
6511 {
6512   /* instr [31]    = 0
6513      instr [30]    = half(0)/full(1)
6514      instr [29,23] = 10 1110 0
6515      instr [22]    = float(0)/double(1)
6516      instr [21]    = 1
6517      instr [20,16] = Vm
6518      instr [15,10] = 1111 11
6519      instr [9, 5]  = Vn
6520      instr [4, 0]  = Vd.  */
6521 
6522   unsigned full = INSTR (30, 30);
6523   unsigned vm = INSTR (20, 16);
6524   unsigned vn = INSTR (9, 5);
6525   unsigned vd = INSTR (4, 0);
6526   unsigned i;
6527 
6528   NYI_assert (29, 23, 0x5C);
6529   NYI_assert (21, 21, 1);
6530   NYI_assert (15, 10, 0x3F);
6531 
6532   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6533   if (INSTR (22, 22))
6534     {
6535       if (! full)
6536 	HALT_UNALLOC;
6537 
6538       for (i = 0; i < 2; i++)
6539 	aarch64_set_vec_double (cpu, vd, i,
6540 				aarch64_get_vec_double (cpu, vn, i)
6541 				/ aarch64_get_vec_double (cpu, vm, i));
6542     }
6543   else
6544     for (i = 0; i < (full ? 4 : 2); i++)
6545       aarch64_set_vec_float (cpu, vd, i,
6546 			     aarch64_get_vec_float (cpu, vn, i)
6547 			     / aarch64_get_vec_float (cpu, vm, i));
6548 }
6549 
6550 static void
6551 do_vec_FMUL (sim_cpu *cpu)
6552 {
6553   /* instr [31]    = 0
6554      instr [30]    = half(0)/full(1)
6555      instr [29,23] = 10 1110 0
6556      instr [22]    = float(0)/double(1)
6557      instr [21]    = 1
6558      instr [20,16] = Vm
6559      instr [15,10] = 1101 11
6560      instr [9, 5]  = Vn
6561      instr [4, 0]  = Vd.  */
6562 
6563   unsigned full = INSTR (30, 30);
6564   unsigned vm = INSTR (20, 16);
6565   unsigned vn = INSTR (9, 5);
6566   unsigned vd = INSTR (4, 0);
6567   unsigned i;
6568 
6569   NYI_assert (29, 23, 0x5C);
6570   NYI_assert (21, 21, 1);
6571   NYI_assert (15, 10, 0x37);
6572 
6573   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6574   if (INSTR (22, 22))
6575     {
6576       if (! full)
6577 	HALT_UNALLOC;
6578 
6579       for (i = 0; i < 2; i++)
6580 	aarch64_set_vec_double (cpu, vd, i,
6581 				aarch64_get_vec_double (cpu, vn, i)
6582 				* aarch64_get_vec_double (cpu, vm, i));
6583     }
6584   else
6585     for (i = 0; i < (full ? 4 : 2); i++)
6586       aarch64_set_vec_float (cpu, vd, i,
6587 			     aarch64_get_vec_float (cpu, vn, i)
6588 			     * aarch64_get_vec_float (cpu, vm, i));
6589 }
6590 
6591 static void
6592 do_vec_FADDP (sim_cpu *cpu)
6593 {
6594   /* instr [31]    = 0
6595      instr [30]    = half(0)/full(1)
6596      instr [29,23] = 10 1110 0
6597      instr [22]    = float(0)/double(1)
6598      instr [21]    = 1
6599      instr [20,16] = Vm
6600      instr [15,10] = 1101 01
6601      instr [9, 5]  = Vn
6602      instr [4, 0]  = Vd.  */
6603 
6604   unsigned full = INSTR (30, 30);
6605   unsigned vm = INSTR (20, 16);
6606   unsigned vn = INSTR (9, 5);
6607   unsigned vd = INSTR (4, 0);
6608 
6609   NYI_assert (29, 23, 0x5C);
6610   NYI_assert (21, 21, 1);
6611   NYI_assert (15, 10, 0x35);
6612 
6613   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6614   if (INSTR (22, 22))
6615     {
6616       /* Extract values before adding them in case vd == vn/vm.  */
6617       double tmp1 = aarch64_get_vec_double (cpu, vn, 0);
6618       double tmp2 = aarch64_get_vec_double (cpu, vn, 1);
6619       double tmp3 = aarch64_get_vec_double (cpu, vm, 0);
6620       double tmp4 = aarch64_get_vec_double (cpu, vm, 1);
6621 
6622       if (! full)
6623 	HALT_UNALLOC;
6624 
6625       aarch64_set_vec_double (cpu, vd, 0, tmp1 + tmp2);
6626       aarch64_set_vec_double (cpu, vd, 1, tmp3 + tmp4);
6627     }
6628   else
6629     {
6630       /* Extract values before adding them in case vd == vn/vm.  */
6631       float tmp1 = aarch64_get_vec_float (cpu, vn, 0);
6632       float tmp2 = aarch64_get_vec_float (cpu, vn, 1);
6633       float tmp5 = aarch64_get_vec_float (cpu, vm, 0);
6634       float tmp6 = aarch64_get_vec_float (cpu, vm, 1);
6635 
6636       if (full)
6637 	{
6638 	  float tmp3 = aarch64_get_vec_float (cpu, vn, 2);
6639 	  float tmp4 = aarch64_get_vec_float (cpu, vn, 3);
6640 	  float tmp7 = aarch64_get_vec_float (cpu, vm, 2);
6641 	  float tmp8 = aarch64_get_vec_float (cpu, vm, 3);
6642 
6643 	  aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2);
6644 	  aarch64_set_vec_float (cpu, vd, 1, tmp3 + tmp4);
6645 	  aarch64_set_vec_float (cpu, vd, 2, tmp5 + tmp6);
6646 	  aarch64_set_vec_float (cpu, vd, 3, tmp7 + tmp8);
6647 	}
6648       else
6649 	{
6650 	  aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2);
6651 	  aarch64_set_vec_float (cpu, vd, 1, tmp5 + tmp6);
6652 	}
6653     }
6654 }
6655 
6656 static void
6657 do_vec_FSQRT (sim_cpu *cpu)
6658 {
6659   /* instr[31]    = 0
6660      instr[30]    = half(0)/full(1)
6661      instr[29,23] = 10 1110 1
6662      instr[22]    = single(0)/double(1)
6663      instr[21,10] = 10 0001 1111 10
6664      instr[9,5]   = Vsrc
6665      instr[4,0]   = Vdest.  */
6666 
6667   unsigned vn = INSTR (9, 5);
6668   unsigned vd = INSTR (4, 0);
6669   unsigned full = INSTR (30, 30);
6670   int i;
6671 
6672   NYI_assert (29, 23, 0x5D);
6673   NYI_assert (21, 10, 0x87E);
6674 
6675   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6676   if (INSTR (22, 22))
6677     {
6678       if (! full)
6679 	HALT_UNALLOC;
6680 
6681       for (i = 0; i < 2; i++)
6682 	aarch64_set_vec_double (cpu, vd, i,
6683 				sqrt (aarch64_get_vec_double (cpu, vn, i)));
6684     }
6685   else
6686     {
6687       for (i = 0; i < (full ? 4 : 2); i++)
6688 	aarch64_set_vec_float (cpu, vd, i,
6689 			       sqrtf (aarch64_get_vec_float (cpu, vn, i)));
6690     }
6691 }
6692 
6693 static void
6694 do_vec_FNEG (sim_cpu *cpu)
6695 {
6696   /* instr[31]    = 0
6697      instr[30]    = half (0)/full (1)
6698      instr[29,23] = 10 1110 1
6699      instr[22]    = single (0)/double (1)
6700      instr[21,10] = 10 0000 1111 10
6701      instr[9,5]   = Vsrc
6702      instr[4,0]   = Vdest.  */
6703 
6704   unsigned vn = INSTR (9, 5);
6705   unsigned vd = INSTR (4, 0);
6706   unsigned full = INSTR (30, 30);
6707   int i;
6708 
6709   NYI_assert (29, 23, 0x5D);
6710   NYI_assert (21, 10, 0x83E);
6711 
6712   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6713   if (INSTR (22, 22))
6714     {
6715       if (! full)
6716 	HALT_UNALLOC;
6717 
6718       for (i = 0; i < 2; i++)
6719 	aarch64_set_vec_double (cpu, vd, i,
6720 				- aarch64_get_vec_double (cpu, vn, i));
6721     }
6722   else
6723     {
6724       for (i = 0; i < (full ? 4 : 2); i++)
6725 	aarch64_set_vec_float (cpu, vd, i,
6726 			       - aarch64_get_vec_float (cpu, vn, i));
6727     }
6728 }
6729 
6730 static void
6731 do_vec_NOT (sim_cpu *cpu)
6732 {
6733   /* instr[31]    = 0
6734      instr[30]    = half (0)/full (1)
6735      instr[29,10] = 10 1110 0010 0000 0101 10
6736      instr[9,5]   = Vn
6737      instr[4,0]   = Vd.  */
6738 
6739   unsigned vn = INSTR (9, 5);
6740   unsigned vd = INSTR (4, 0);
6741   unsigned i;
6742   int      full = INSTR (30, 30);
6743 
6744   NYI_assert (29, 10, 0xB8816);
6745 
6746   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6747   for (i = 0; i < (full ? 16 : 8); i++)
6748     aarch64_set_vec_u8 (cpu, vd, i, ~ aarch64_get_vec_u8 (cpu, vn, i));
6749 }
6750 
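     /* Return the number of leading zero bits in VAL, treated as a
	SIZE-bit value; returns SIZE when VAL is zero.  */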
6751 static unsigned int
6752 clz (uint64_t val, unsigned size)
6753 {
6754   uint64_t mask = 1;
6755   int      count;
6756 
6757   mask <<= (size - 1);
6758   count = 0;
6759   do
6760     {
6761       if (val & mask)
6762 	break;
6763       mask >>= 1;
6764       count ++;
6765     }
6766   while (mask);
6767 
6768   return count;
6769 }
6770 
6771 static void
6772 do_vec_CLZ (sim_cpu *cpu)
6773 {
6774   /* instr[31]    = 0
6775      instr[30]    = half (0)/full (1)
6776      instr[29,24] = 10 1110
6777      instr[23,22] = size
6778      instr[21,10] = 10 0000 0100 10
6779      instr[9,5]   = Vn
6780      instr[4,0]   = Vd.  */
6781 
6782   unsigned vn = INSTR (9, 5);
6783   unsigned vd = INSTR (4, 0);
6784   unsigned i;
6785   int      full = INSTR (30,30);
6786 
6787   NYI_assert (29, 24, 0x2E);
6788   NYI_assert (21, 10, 0x812);
6789 
6790   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6791   switch (INSTR (23, 22))
6792     {
6793     case 0:
6794       for (i = 0; i < (full ? 16 : 8); i++)
6795 	aarch64_set_vec_u8 (cpu, vd, i, clz (aarch64_get_vec_u8 (cpu, vn, i), 8));
6796       break;
6797     case 1:
6798       for (i = 0; i < (full ? 8 : 4); i++)
6799 	aarch64_set_vec_u16 (cpu, vd, i, clz (aarch64_get_vec_u16 (cpu, vn, i), 16));
6800       break;
6801     case 2:
6802       for (i = 0; i < (full ? 4 : 2); i++)
6803 	aarch64_set_vec_u32 (cpu, vd, i, clz (aarch64_get_vec_u32 (cpu, vn, i), 32));
6804       break;
6805     case 3:
6806       if (! full)
6807 	HALT_UNALLOC;
6808       aarch64_set_vec_u64 (cpu, vd, 0, clz (aarch64_get_vec_u64 (cpu, vn, 0), 64));
6809       aarch64_set_vec_u64 (cpu, vd, 1, clz (aarch64_get_vec_u64 (cpu, vn, 1), 64));
6810       break;
6811     }
6812 }
6813 
6814 static void
6815 do_vec_MOV_element (sim_cpu *cpu)
6816 {
6817   /* instr[31,21] = 0110 1110 000
6818      instr[20,16] = size & dest index
6819      instr[15]    = 0
6820      instr[14,11] = source index
6821      instr[10]    = 1
6822      instr[9,5]   = Vs
6823      instr[4,0]   = Vd.  */
6824 
6825   unsigned vs = INSTR (9, 5);
6826   unsigned vd = INSTR (4, 0);
6827   unsigned src_index;
6828   unsigned dst_index;
6829 
6830   NYI_assert (31, 21, 0x370);
6831   NYI_assert (15, 15, 0);
6832   NYI_assert (10, 10, 1);
6833 
6834   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
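       /* The position of the lowest set bit in instr[20,16] selects the
	  element size; the bits above it give the destination index.  */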
6835   if (INSTR (16, 16))
6836     {
6837       /* Move a byte.  */
6838       src_index = INSTR (14, 11);
6839       dst_index = INSTR (20, 17);
6840       aarch64_set_vec_u8 (cpu, vd, dst_index,
6841 			  aarch64_get_vec_u8 (cpu, vs, src_index));
6842     }
6843   else if (INSTR (17, 17))
6844     {
6845       /* Move 16-bits.  */
6846       NYI_assert (11, 11, 0);
6847       src_index = INSTR (14, 12);
6848       dst_index = INSTR (20, 18);
6849       aarch64_set_vec_u16 (cpu, vd, dst_index,
6850 			   aarch64_get_vec_u16 (cpu, vs, src_index));
6851     }
6852   else if (INSTR (18, 18))
6853     {
6854       /* Move 32-bits.  */
6855       NYI_assert (12, 11, 0);
6856       src_index = INSTR (14, 13);
6857       dst_index = INSTR (20, 19);
6858       aarch64_set_vec_u32 (cpu, vd, dst_index,
6859 			   aarch64_get_vec_u32 (cpu, vs, src_index));
6860     }
6861   else
6862     {
6863       NYI_assert (19, 19, 1);
6864       NYI_assert (13, 11, 0);
6865       src_index = INSTR (14, 14);
6866       dst_index = INSTR (20, 20);
6867       aarch64_set_vec_u64 (cpu, vd, dst_index,
6868 			   aarch64_get_vec_u64 (cpu, vs, src_index));
6869     }
6870 }
6871 
6872 static void
6873 do_vec_REV32 (sim_cpu *cpu)
6874 {
6875   /* instr[31]    = 0
6876      instr[30]    = full/half
6877      instr[29,24] = 10 1110
6878      instr[23,22] = size
6879      instr[21,10] = 10 0000 0000 10
6880      instr[9,5]   = Rn
6881      instr[4,0]   = Rd.  */
6882 
6883   unsigned rn = INSTR (9, 5);
6884   unsigned rd = INSTR (4, 0);
6885   unsigned size = INSTR (23, 22);
6886   unsigned full = INSTR (30, 30);
6887   unsigned i;
6888   FRegister val;
6889 
6890   NYI_assert (29, 24, 0x2E);
6891   NYI_assert (21, 10, 0x802);
6892 
6893   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6894   switch (size)
6895     {
6896     case 0:
6897       for (i = 0; i < (full ? 16 : 8); i++)
6898 	val.b[i ^ 0x3] = aarch64_get_vec_u8 (cpu, rn, i);
6899       break;
6900 
6901     case 1:
6902       for (i = 0; i < (full ? 8 : 4); i++)
6903 	val.h[i ^ 0x1] = aarch64_get_vec_u16 (cpu, rn, i);
6904       break;
6905 
6906     default:
6907       HALT_UNALLOC;
6908     }
6909 
6910   aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
6911   if (full)
6912     aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
6913 }
6914 
6915 static void
6916 do_vec_EXT (sim_cpu *cpu)
6917 {
6918   /* instr[31]    = 0
6919      instr[30]    = full/half
6920      instr[29,21] = 10 1110 000
6921      instr[20,16] = Vm
6922      instr[15]    = 0
6923      instr[14,11] = source index
6924      instr[10]    = 0
6925      instr[9,5]   = Vn
6926      instr[4,0]   = Vd.  */
6927 
6928   unsigned vm = INSTR (20, 16);
6929   unsigned vn = INSTR (9, 5);
6930   unsigned vd = INSTR (4, 0);
6931   unsigned src_index = INSTR (14, 11);
6932   unsigned full = INSTR (30, 30);
6933   unsigned i;
6934   unsigned j;
6935   FRegister val;
6936 
6937   NYI_assert (31, 21, 0x370);
6938   NYI_assert (15, 15, 0);
6939   NYI_assert (10, 10, 0);
6940 
6941   if (!full && (src_index & 0x8))
6942     HALT_UNALLOC;
6943 
6944   j = 0;
6945 
6946   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
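       /* The result is the concatenation Vm:Vn, extracted from byte
	  SRC_INDEX of Vn upwards.  */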
6947   for (i = src_index; i < (full ? 16 : 8); i++)
6948     val.b[j ++] = aarch64_get_vec_u8 (cpu, vn, i);
6949   for (i = 0; i < src_index; i++)
6950     val.b[j ++] = aarch64_get_vec_u8 (cpu, vm, i);
6951 
6952   aarch64_set_vec_u64 (cpu, vd, 0, val.v[0]);
6953   if (full)
6954     aarch64_set_vec_u64 (cpu, vd, 1, val.v[1]);
6955 }
6956 
6957 static void
6958 dexAdvSIMD0 (sim_cpu *cpu)
6959 {
6960   /* instr [28,25] = 0 111.  */
6961   if (INSTR (15, 10) == 0x07
6962       && INSTR (9, 5) == INSTR (20, 16))
6964     {
6965       if (INSTR (31, 21) == 0x075
6966 	  || INSTR (31, 21) == 0x275)
6967 	{
6968 	  do_vec_MOV_whole_vector (cpu);
6969 	  return;
6970 	}
6971     }
6972 
6973   if (INSTR (29, 19) == 0x1E0)
6974     {
6975       do_vec_MOV_immediate (cpu);
6976       return;
6977     }
6978 
6979   if (INSTR (29, 19) == 0x5E0)
6980     {
6981       do_vec_MVNI (cpu);
6982       return;
6983     }
6984 
6985   if (INSTR (29, 19) == 0x1C0
6986       || INSTR (29, 19) == 0x1C1)
6987     {
6988       if (INSTR (15, 10) == 0x03)
6989 	{
6990 	  do_vec_DUP_scalar_into_vector (cpu);
6991 	  return;
6992 	}
6993     }
6994 
6995   switch (INSTR (29, 24))
6996     {
6997     case 0x0E: do_vec_op1 (cpu); return;
6998     case 0x0F: do_vec_op2 (cpu); return;
6999 
7000     case 0x2E:
7001       if (INSTR (21, 21) == 1)
7002 	{
7003 	  switch (INSTR (15, 10))
7004 	    {
7005 	    case 0x02:
7006 	      do_vec_REV32 (cpu);
7007 	      return;
7008 
7009 	    case 0x07:
7010 	      switch (INSTR (23, 22))
7011 		{
7012 		case 0: do_vec_EOR (cpu); return;
7013 		case 1: do_vec_BSL (cpu); return;
7014 		case 2:
7015 		case 3: do_vec_bit (cpu); return;
7016 		}
7017 	      break;
7018 
7019 	    case 0x08: do_vec_sub_long (cpu); return;
7020 	    case 0x11: do_vec_USHL (cpu); return;
7021 	    case 0x12: do_vec_CLZ (cpu); return;
7022 	    case 0x16: do_vec_NOT (cpu); return;
7023 	    case 0x19: do_vec_max (cpu); return;
7024 	    case 0x1B: do_vec_min (cpu); return;
7025 	    case 0x21: do_vec_SUB (cpu); return;
7026 	    case 0x25: do_vec_MLS (cpu); return;
7027 	    case 0x31: do_vec_FminmaxNMP (cpu); return;
7028 	    case 0x35: do_vec_FADDP (cpu); return;
7029 	    case 0x37: do_vec_FMUL (cpu); return;
7030 	    case 0x3F: do_vec_FDIV (cpu); return;
7031 
7032 	    case 0x3E:
7033 	      switch (INSTR (20, 16))
7034 		{
7035 		case 0x00: do_vec_FNEG (cpu); return;
7036 		case 0x01: do_vec_FSQRT (cpu); return;
7037 		default:   HALT_NYI;
7038 		}
7039 
7040 	    case 0x0D:
7041 	    case 0x0F:
7042 	    case 0x22:
7043 	    case 0x23:
7044 	    case 0x26:
7045 	    case 0x2A:
7046 	    case 0x32:
7047 	    case 0x36:
7048 	    case 0x39:
7049 	    case 0x3A:
7050 	      do_vec_compare (cpu); return;
7051 
7052 	    default:
7053 	      break;
7054 	    }
7055 	}
7056 
7057       if (INSTR (31, 21) == 0x370)
7058 	{
7059 	  if (INSTR (10, 10))
7060 	    do_vec_MOV_element (cpu);
7061 	  else
7062 	    do_vec_EXT (cpu);
7063 	  return;
7064 	}
7065 
7066       switch (INSTR (21, 10))
7067 	{
7068 	case 0x82E: do_vec_neg (cpu); return;
7069 	case 0x87E: do_vec_sqrt (cpu); return;
7070 	default:
7071 	  if (INSTR (15, 10) == 0x30)
7072 	    {
7073 	      do_vec_mull (cpu);
7074 	      return;
7075 	    }
7076 	  break;
7077 	}
7078       break;
7079 
7080     case 0x2f:
7081       switch (INSTR (15, 10))
7082 	{
7083 	case 0x01: do_vec_SSHR_USHR (cpu); return;
7084 	case 0x10:
7085 	case 0x12: do_vec_mls_indexed (cpu); return;
7086 	case 0x29: do_vec_xtl (cpu); return;
7087 	default:
7088 	  HALT_NYI;
7089 	}
7090 
7091     default:
7092       break;
7093     }
7094 
7095   HALT_NYI;
7096 }
7097 
7098 /* 3 sources.  */
7099 
7100 /* Float multiply add.  */
7101 static void
7102 fmadds (sim_cpu *cpu)
7103 {
7104   unsigned sa = INSTR (14, 10);
7105   unsigned sm = INSTR (20, 16);
7106   unsigned sn = INSTR ( 9,  5);
7107   unsigned sd = INSTR ( 4,  0);
7108 
7109   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7110   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa)
7111 			+ aarch64_get_FP_float (cpu, sn)
7112 			* aarch64_get_FP_float (cpu, sm));
7113 }
7114 
7115 /* Double multiply add.  */
7116 static void
7117 fmaddd (sim_cpu *cpu)
7118 {
7119   unsigned sa = INSTR (14, 10);
7120   unsigned sm = INSTR (20, 16);
7121   unsigned sn = INSTR ( 9,  5);
7122   unsigned sd = INSTR ( 4,  0);
7123 
7124   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7125   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa)
7126 			 + aarch64_get_FP_double (cpu, sn)
7127 			 * aarch64_get_FP_double (cpu, sm));
7128 }
7129 
7130 /* Float multiply subtract.  */
7131 static void
7132 fmsubs (sim_cpu *cpu)
7133 {
7134   unsigned sa = INSTR (14, 10);
7135   unsigned sm = INSTR (20, 16);
7136   unsigned sn = INSTR ( 9,  5);
7137   unsigned sd = INSTR ( 4,  0);
7138 
7139   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7140   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa)
7141 			- aarch64_get_FP_float (cpu, sn)
7142 			* aarch64_get_FP_float (cpu, sm));
7143 }
7144 
7145 /* Double multiply subtract.  */
7146 static void
7147 fmsubd (sim_cpu *cpu)
7148 {
7149   unsigned sa = INSTR (14, 10);
7150   unsigned sm = INSTR (20, 16);
7151   unsigned sn = INSTR ( 9,  5);
7152   unsigned sd = INSTR ( 4,  0);
7153 
7154   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7155   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa)
7156 			 - aarch64_get_FP_double (cpu, sn)
7157 			 * aarch64_get_FP_double (cpu, sm));
7158 }
7159 
7160 /* Float negative multiply add.  */
7161 static void
7162 fnmadds (sim_cpu *cpu)
7163 {
7164   unsigned sa = INSTR (14, 10);
7165   unsigned sm = INSTR (20, 16);
7166   unsigned sn = INSTR ( 9,  5);
7167   unsigned sd = INSTR ( 4,  0);
7168 
7169   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7170   aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa)
7171 			+ (- aarch64_get_FP_float (cpu, sn))
7172 			* aarch64_get_FP_float (cpu, sm));
7173 }
7174 
7175 /* Double negative multiply add.  */
7176 static void
7177 fnmaddd (sim_cpu *cpu)
7178 {
7179   unsigned sa = INSTR (14, 10);
7180   unsigned sm = INSTR (20, 16);
7181   unsigned sn = INSTR ( 9,  5);
7182   unsigned sd = INSTR ( 4,  0);
7183 
7184   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7185   aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa)
7186 			 + (- aarch64_get_FP_double (cpu, sn))
7187 			 * aarch64_get_FP_double (cpu, sm));
7188 }
7189 
7190 /* Float negative multiply subtract.  */
7191 static void
7192 fnmsubs (sim_cpu *cpu)
7193 {
7194   unsigned sa = INSTR (14, 10);
7195   unsigned sm = INSTR (20, 16);
7196   unsigned sn = INSTR ( 9,  5);
7197   unsigned sd = INSTR ( 4,  0);
7198 
7199   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7200   aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa)
7201 			+ aarch64_get_FP_float (cpu, sn)
7202 			* aarch64_get_FP_float (cpu, sm));
7203 }
7204 
7205 /* Double negative multiply subtract.  */
7206 static void
7207 fnmsubd (sim_cpu *cpu)
7208 {
7209   unsigned sa = INSTR (14, 10);
7210   unsigned sm = INSTR (20, 16);
7211   unsigned sn = INSTR ( 9,  5);
7212   unsigned sd = INSTR ( 4,  0);
7213 
7214   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7215   aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa)
7216 			 + aarch64_get_FP_double (cpu, sn)
7217 			 * aarch64_get_FP_double (cpu, sm));
7218 }
7219 
7220 static void
7221 dexSimpleFPDataProc3Source (sim_cpu *cpu)
7222 {
7223   /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
7224      instr[30]    = 0
7225      instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
7226      instr[28,25] = 1111
7227      instr[24]    = 1
7228      instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
7229      instr[21]    ==> o1 : 0 ==> unnegated, 1 ==> negated
7230      instr[15]    ==> o2 : 0 ==> ADD, 1 ==> SUB  */
7231 
7232   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7233   /* dispatch on combined type:o1:o2.  */
7234   uint32_t dispatch = (INSTR (23, 21) << 1) | INSTR (15, 15);
7235 
7236   if (M_S != 0)
7237     HALT_UNALLOC;
7238 
7239   switch (dispatch)
7240     {
7241     case 0: fmadds (cpu); return;
7242     case 1: fmsubs (cpu); return;
7243     case 2: fnmadds (cpu); return;
7244     case 3: fnmsubs (cpu); return;
7245     case 4: fmaddd (cpu); return;
7246     case 5: fmsubd (cpu); return;
7247     case 6: fnmaddd (cpu); return;
7248     case 7: fnmsubd (cpu); return;
7249     default:
7250       /* type > 1 is currently unallocated.  */
7251       HALT_UNALLOC;
7252     }
7253 }
7254 
7255 static void
7256 dexSimpleFPFixedConvert (sim_cpu *cpu)
7257 {
7258   HALT_NYI;
7259 }
7260 
7261 static void
7262 dexSimpleFPCondCompare (sim_cpu *cpu)
7263 {
7264   /* instr [31,23] = 0001 1110 0
7265      instr [22]    = type
7266      instr [21]    = 1
7267      instr [20,16] = Rm
7268      instr [15,12] = condition
7269      instr [11,10] = 01
7270      instr [9,5]   = Rn
7271      instr [4]     = 0
7272      instr [3,0]   = nzcv  */
7273 
7274   unsigned rm = INSTR (20, 16);
7275   unsigned rn = INSTR (9, 5);
7276 
7277   NYI_assert (31, 23, 0x3C);
7278   NYI_assert (11, 10, 0x1);
7279   NYI_assert (4,  4,  0);
7280 
7281   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7282   if (! testConditionCode (cpu, INSTR (15, 12)))
7283     {
7284       aarch64_set_CPSR (cpu, INSTR (3, 0));
7285       return;
7286     }
7287 
7288   if (INSTR (22, 22))
7289     {
7290       /* Double precision.  */
7291       double val1 = aarch64_get_vec_double (cpu, rn, 0);
7292       double val2 = aarch64_get_vec_double (cpu, rm, 0);
7293 
7294       /* FIXME: Check for NaNs.  */
7295       if (val1 == val2)
7296 	aarch64_set_CPSR (cpu, (Z | C));
7297       else if (val1 < val2)
7298 	aarch64_set_CPSR (cpu, N);
7299       else /* val1 > val2 */
7300 	aarch64_set_CPSR (cpu, C);
7301     }
7302   else
7303     {
7304       /* Single precision.  */
7305       float val1 = aarch64_get_vec_float (cpu, rn, 0);
7306       float val2 = aarch64_get_vec_float (cpu, rm, 0);
7307 
7308       /* FIXME: Check for NaNs.  */
7309       if (val1 == val2)
7310 	aarch64_set_CPSR (cpu, (Z | C));
7311       else if (val1 < val2)
7312 	aarch64_set_CPSR (cpu, N);
7313       else /* val1 > val2 */
7314 	aarch64_set_CPSR (cpu, C);
7315     }
7316 }
7317 
7318 /* 2 sources.  */
7319 
7320 /* Float add.  */
7321 static void
7322 fadds (sim_cpu *cpu)
7323 {
7324   unsigned sm = INSTR (20, 16);
7325   unsigned sn = INSTR ( 9,  5);
7326   unsigned sd = INSTR ( 4,  0);
7327 
7328   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7329   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7330 			+ aarch64_get_FP_float (cpu, sm));
7331 }
7332 
7333 /* Double add.  */
7334 static void
7335 faddd (sim_cpu *cpu)
7336 {
7337   unsigned sm = INSTR (20, 16);
7338   unsigned sn = INSTR ( 9,  5);
7339   unsigned sd = INSTR ( 4,  0);
7340 
7341   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7342   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7343 			 + aarch64_get_FP_double (cpu, sm));
7344 }
7345 
7346 /* Float divide.  */
7347 static void
7348 fdivs (sim_cpu *cpu)
7349 {
7350   unsigned sm = INSTR (20, 16);
7351   unsigned sn = INSTR ( 9,  5);
7352   unsigned sd = INSTR ( 4,  0);
7353 
7354   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7355   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7356 			/ aarch64_get_FP_float (cpu, sm));
7357 }
7358 
7359 /* Double divide.  */
7360 static void
7361 fdivd (sim_cpu *cpu)
7362 {
7363   unsigned sm = INSTR (20, 16);
7364   unsigned sn = INSTR ( 9,  5);
7365   unsigned sd = INSTR ( 4,  0);
7366 
7367   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7368   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7369 			 / aarch64_get_FP_double (cpu, sm));
7370 }
7371 
7372 /* Float multiply.  */
7373 static void
7374 fmuls (sim_cpu *cpu)
7375 {
7376   unsigned sm = INSTR (20, 16);
7377   unsigned sn = INSTR ( 9,  5);
7378   unsigned sd = INSTR ( 4,  0);
7379 
7380   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7381   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7382 			* aarch64_get_FP_float (cpu, sm));
7383 }
7384 
7385 /* Double multiply.  */
7386 static void
7387 fmuld (sim_cpu *cpu)
7388 {
7389   unsigned sm = INSTR (20, 16);
7390   unsigned sn = INSTR ( 9,  5);
7391   unsigned sd = INSTR ( 4,  0);
7392 
7393   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7394   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7395 			 * aarch64_get_FP_double (cpu, sm));
7396 }
7397 
7398 /* Float negate and multiply.  */
7399 static void
7400 fnmuls (sim_cpu *cpu)
7401 {
7402   unsigned sm = INSTR (20, 16);
7403   unsigned sn = INSTR ( 9,  5);
7404   unsigned sd = INSTR ( 4,  0);
7405 
7406   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7407   aarch64_set_FP_float (cpu, sd, - (aarch64_get_FP_float (cpu, sn)
7408 				    * aarch64_get_FP_float (cpu, sm)));
7409 }
7410 
7411 /* Double negate and multiply.  */
7412 static void
7413 fnmuld (sim_cpu *cpu)
7414 {
7415   unsigned sm = INSTR (20, 16);
7416   unsigned sn = INSTR ( 9,  5);
7417   unsigned sd = INSTR ( 4,  0);
7418 
7419   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7420   aarch64_set_FP_double (cpu, sd, - (aarch64_get_FP_double (cpu, sn)
7421 				     * aarch64_get_FP_double (cpu, sm)));
7422 }
7423 
7424 /* Float subtract.  */
7425 static void
7426 fsubs (sim_cpu *cpu)
7427 {
7428   unsigned sm = INSTR (20, 16);
7429   unsigned sn = INSTR ( 9,  5);
7430   unsigned sd = INSTR ( 4,  0);
7431 
7432   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7433   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7434 			- aarch64_get_FP_float (cpu, sm));
7435 }
7436 
7437 /* Double subtract.  */
7438 static void
7439 fsubd (sim_cpu *cpu)
7440 {
7441   unsigned sm = INSTR (20, 16);
7442   unsigned sn = INSTR ( 9,  5);
7443   unsigned sd = INSTR ( 4,  0);
7444 
7445   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7446   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7447 			 - aarch64_get_FP_double (cpu, sm));
7448 }
7449 
7450 static void
7451 do_FMINNM (sim_cpu *cpu)
7452 {
7453   /* instr[31,23] = 0 0011 1100
7454      instr[22]    = float(0)/double(1)
7455      instr[21]    = 1
7456      instr[20,16] = Sm
7457      instr[15,10] = 01 1110
7458      instr[9,5]   = Sn
7459      instr[4,0]   = Sd  */
7460 
7461   unsigned sm = INSTR (20, 16);
7462   unsigned sn = INSTR ( 9,  5);
7463   unsigned sd = INSTR ( 4,  0);
7464 
7465   NYI_assert (31, 23, 0x03C);
7466   NYI_assert (15, 10, 0x1E);
7467 
7468   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7469   if (INSTR (22, 22))
7470     aarch64_set_FP_double (cpu, sd,
7471 			   dminnm (aarch64_get_FP_double (cpu, sn),
7472 				   aarch64_get_FP_double (cpu, sm)));
7473   else
7474     aarch64_set_FP_float (cpu, sd,
7475 			  fminnm (aarch64_get_FP_float (cpu, sn),
7476 				  aarch64_get_FP_float (cpu, sm)));
7477 }
7478 
7479 static void
7480 do_FMAXNM (sim_cpu *cpu)
7481 {
7482   /* instr[31,23] = 0 0011 1100
7483      instr[22]    = float(0)/double(1)
7484      instr[21]    = 1
7485      instr[20,16] = Sm
7486      instr[15,10] = 01 1010
7487      instr[9,5]   = Sn
7488      instr[4,0]   = Sd  */
7489 
7490   unsigned sm = INSTR (20, 16);
7491   unsigned sn = INSTR ( 9,  5);
7492   unsigned sd = INSTR ( 4,  0);
7493 
7494   NYI_assert (31, 23, 0x03C);
7495   NYI_assert (15, 10, 0x1A);
7496 
7497   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7498   if (INSTR (22, 22))
7499     aarch64_set_FP_double (cpu, sd,
7500 			   dmaxnm (aarch64_get_FP_double (cpu, sn),
7501 				   aarch64_get_FP_double (cpu, sm)));
7502   else
7503     aarch64_set_FP_float (cpu, sd,
7504 			  fmaxnm (aarch64_get_FP_float (cpu, sn),
7505 				  aarch64_get_FP_float (cpu, sm)));
7506 }
7507 
7508 static void
7509 dexSimpleFPDataProc2Source (sim_cpu *cpu)
7510 {
7511   /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
7512      instr[30]    = 0
7513      instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
7514      instr[28,25] = 1111
7515      instr[24]    = 0
7516      instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
7517      instr[21]    = 1
7518      instr[20,16] = Vm
7519      instr[15,12] ==> opcode : 0000 ==> FMUL, 0001 ==> FDIV
7520                                0010 ==> FADD, 0011 ==> FSUB,
7521                                0100 ==> FMAX, 0101 ==> FMIN
7522                                0110 ==> FMAXNM, 0111 ==> FMINNM
7523                                1000 ==> FNMUL, ow ==> UNALLOC
7524      instr[11,10] = 10
7525      instr[9,5]   = Vn
7526      instr[4,0]   = Vd  */
7527 
7528   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7529   uint32_t type = INSTR (23, 22);
7530   /* Dispatch on opcode.  */
7531   uint32_t dispatch = INSTR (15, 12);
7532 
7533   if (type > 1)
7534     HALT_UNALLOC;
7535 
7536   if (M_S != 0)
7537     HALT_UNALLOC;
7538 
7539   if (type)
7540     switch (dispatch)
7541       {
7542       case 0: fmuld (cpu); return;
7543       case 1: fdivd (cpu); return;
7544       case 2: faddd (cpu); return;
7545       case 3: fsubd (cpu); return;
7546       case 6: do_FMAXNM (cpu); return;
7547       case 7: do_FMINNM (cpu); return;
7548       case 8: fnmuld (cpu); return;
7549 
7550 	/* Have not yet implemented fmax and fmin.  */
7551       case 4:
7552       case 5:
7553 	HALT_NYI;
7554 
7555       default:
7556 	HALT_UNALLOC;
7557       }
7558   else /* type == 0 => floats.  */
7559     switch (dispatch)
7560       {
7561       case 0: fmuls (cpu); return;
7562       case 1: fdivs (cpu); return;
7563       case 2: fadds (cpu); return;
7564       case 3: fsubs (cpu); return;
7565       case 6: do_FMAXNM (cpu); return;
7566       case 7: do_FMINNM (cpu); return;
7567       case 8: fnmuls (cpu); return;
7568 
7569       case 4:
7570       case 5:
7571 	HALT_NYI;
7572 
7573       default:
7574 	HALT_UNALLOC;
7575       }
7576 }
7577 
7578 static void
7579 dexSimpleFPCondSelect (sim_cpu *cpu)
7580 {
7581   /* FCSEL
7582      instr[31,23] = 0 0011 1100
7583      instr[22]    = 0=>single 1=>double
7584      instr[21]    = 1
7585      instr[20,16] = Sm
7586      instr[15,12] = cond
7587      instr[11,10] = 11
7588      instr[9,5]   = Sn
     instr[4,0]   = Sd  */
7590   unsigned sm = INSTR (20, 16);
7591   unsigned sn = INSTR ( 9, 5);
7592   unsigned sd = INSTR ( 4, 0);
7593   uint32_t set = testConditionCode (cpu, INSTR (15, 12));
7594 
7595   NYI_assert (31, 23, 0x03C);
7596   NYI_assert (11, 10, 0x3);
7597 
7598   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7599   if (INSTR (22, 22))
7600     aarch64_set_FP_double (cpu, sd, (set ? aarch64_get_FP_double (cpu, sn)
7601 				     : aarch64_get_FP_double (cpu, sm)));
7602   else
7603     aarch64_set_FP_float (cpu, sd, (set ? aarch64_get_FP_float (cpu, sn)
7604 				    : aarch64_get_FP_float (cpu, sm)));
7605 }
7606 
7607 /* Store 32 bit unscaled signed 9 bit.  */
7608 static void
7609 fsturs (sim_cpu *cpu, int32_t offset)
7610 {
7611   unsigned int rn = INSTR (9, 5);
7612   unsigned int st = INSTR (4, 0);
7613 
7614   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
7616 		       aarch64_get_vec_u32 (cpu, st, 0));
7617 }
7618 
7619 /* Store 64 bit unscaled signed 9 bit.  */
7620 static void
7621 fsturd (sim_cpu *cpu, int32_t offset)
7622 {
7623   unsigned int rn = INSTR (9, 5);
7624   unsigned int st = INSTR (4, 0);
7625 
7626   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
7628 		       aarch64_get_vec_u64 (cpu, st, 0));
7629 }
7630 
7631 /* Store 128 bit unscaled signed 9 bit.  */
7632 static void
7633 fsturq (sim_cpu *cpu, int32_t offset)
7634 {
7635   unsigned int rn = INSTR (9, 5);
7636   unsigned int st = INSTR (4, 0);
7637   FRegister a;
7638 
7639   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7640   aarch64_get_FP_long_double (cpu, st, & a);
7641   aarch64_set_mem_long_double (cpu,
			       aarch64_get_reg_u64 (cpu, rn, SP_OK)
7643 			       + offset, a);
7644 }
7645 
7646 /* TODO FP move register.  */
7647 
7648 /* 32 bit fp to fp move register.  */
7649 static void
7650 ffmovs (sim_cpu *cpu)
7651 {
7652   unsigned int rn = INSTR (9, 5);
7653   unsigned int st = INSTR (4, 0);
7654 
7655   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7656   aarch64_set_FP_float (cpu, st, aarch64_get_FP_float (cpu, rn));
7657 }
7658 
7659 /* 64 bit fp to fp move register.  */
7660 static void
7661 ffmovd (sim_cpu *cpu)
7662 {
7663   unsigned int rn = INSTR (9, 5);
7664   unsigned int st = INSTR (4, 0);
7665 
7666   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7667   aarch64_set_FP_double (cpu, st, aarch64_get_FP_double (cpu, rn));
7668 }
7669 
7670 /* 32 bit GReg to Vec move register.  */
7671 static void
7672 fgmovs (sim_cpu *cpu)
7673 {
7674   unsigned int rn = INSTR (9, 5);
7675   unsigned int st = INSTR (4, 0);
7676 
7677   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7678   aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_reg_u32 (cpu, rn, NO_SP));
7679 }
7680 
7681 /* 64 bit g to fp move register.  */
7682 static void
7683 fgmovd (sim_cpu *cpu)
7684 {
7685   unsigned int rn = INSTR (9, 5);
7686   unsigned int st = INSTR (4, 0);
7687 
7688   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7689   aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_reg_u64 (cpu, rn, NO_SP));
7690 }
7691 
7692 /* 32 bit fp to g move register.  */
7693 static void
7694 gfmovs (sim_cpu *cpu)
7695 {
7696   unsigned int rn = INSTR (9, 5);
7697   unsigned int st = INSTR (4, 0);
7698 
7699   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7700   aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u32 (cpu, rn, 0));
7701 }
7702 
7703 /* 64 bit fp to g move register.  */
7704 static void
7705 gfmovd (sim_cpu *cpu)
7706 {
7707   unsigned int rn = INSTR (9, 5);
7708   unsigned int st = INSTR (4, 0);
7709 
7710   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7711   aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u64 (cpu, rn, 0));
7712 }
7713 
7714 /* FP move immediate
7715 
7716    These install an immediate 8 bit value in the target register
7717    where the 8 bits comprise 1 sign bit, 4 bits of fraction and a 3
7718    bit exponent.  */
7719 
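/* Illustrative sketch only -- the simulator's actual decoding is done
   by fp_immediate_for_encoding_32/64.  For single precision the imm8
   field abcdefgh expands to sign a, exponent NOT(b):bbbbb:cd and
   fraction efgh000..., so the encodable values are +/- n/16 * 2**r
   with 16 <= n <= 31 and -3 <= r <= 4.  A minimal expansion, assuming
   an IEEE-754 single format host:

     static float
     expand_fp8_single (uint32_t imm8)
     {
       uint32_t sign = (imm8 >> 7) & 1;
       uint32_t b    = (imm8 >> 6) & 1;
       uint32_t cd   = (imm8 >> 4) & 3;
       uint32_t exp  = ((b ^ 1) << 7) | (b ? 0x7c : 0) | cd;
       uint32_t bits = (sign << 31) | (exp << 23) | ((imm8 & 0xf) << 19);
       float result;

       memcpy (&result, &bits, sizeof result);
       return result;
     }

   For example imm8 == 0x70 yields 1.0f and imm8 == 0x00 yields 2.0f;
   zero itself is not encodable.  */
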
7720 static void
7721 fmovs (sim_cpu *cpu)
7722 {
7723   unsigned int sd = INSTR (4, 0);
7724   uint32_t imm = INSTR (20, 13);
7725   float f = fp_immediate_for_encoding_32 (imm);
7726 
7727   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7728   aarch64_set_FP_float (cpu, sd, f);
7729 }
7730 
7731 static void
7732 fmovd (sim_cpu *cpu)
7733 {
7734   unsigned int sd = INSTR (4, 0);
7735   uint32_t imm = INSTR (20, 13);
7736   double d = fp_immediate_for_encoding_64 (imm);
7737 
7738   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7739   aarch64_set_FP_double (cpu, sd, d);
7740 }
7741 
7742 static void
7743 dexSimpleFPImmediate (sim_cpu *cpu)
7744 {
7745   /* instr[31,23] == 00111100
7746      instr[22]    == type : single(0)/double(1)
7747      instr[21]    == 1
7748      instr[20,13] == imm8
7749      instr[12,10] == 100
7750      instr[9,5]   == imm5 : 00000 ==> PK, ow ==> UNALLOC
7751      instr[4,0]   == Rd  */
7752   uint32_t imm5 = INSTR (9, 5);
7753 
7754   NYI_assert (31, 23, 0x3C);
7755 
7756   if (imm5 != 0)
7757     HALT_UNALLOC;
7758 
7759   if (INSTR (22, 22))
7760     fmovd (cpu);
7761   else
7762     fmovs (cpu);
7763 }
7764 
7765 /* TODO specific decode and execute for group Load Store.  */
7766 
7767 /* TODO FP load/store single register (unscaled offset).  */
7768 
7769 /* TODO load 8 bit unscaled signed 9 bit.  */
7770 /* TODO load 16 bit unscaled signed 9 bit.  */
7771 
7772 /* Load 32 bit unscaled signed 9 bit.  */
7773 static void
7774 fldurs (sim_cpu *cpu, int32_t offset)
7775 {
7776   unsigned int rn = INSTR (9, 5);
7777   unsigned int st = INSTR (4, 0);
7778 
7779   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7780   aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
7781 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
7782 }
7783 
7784 /* Load 64 bit unscaled signed 9 bit.  */
7785 static void
7786 fldurd (sim_cpu *cpu, int32_t offset)
7787 {
7788   unsigned int rn = INSTR (9, 5);
7789   unsigned int st = INSTR (4, 0);
7790 
7791   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7792   aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64
7793 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
7794 }
7795 
7796 /* Load 128 bit unscaled signed 9 bit.  */
7797 static void
7798 fldurq (sim_cpu *cpu, int32_t offset)
7799 {
7800   unsigned int rn = INSTR (9, 5);
7801   unsigned int st = INSTR (4, 0);
7802   FRegister a;
7803   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
7804 
7805   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7806   aarch64_get_mem_long_double (cpu, addr, & a);
7807   aarch64_set_FP_long_double (cpu, st, a);
7808 }
7809 
7810 /* TODO store 8 bit unscaled signed 9 bit.  */
7811 /* TODO store 16 bit unscaled signed 9 bit.  */
7812 
7813 
7814 /* 1 source.  */
7815 
7816 /* Float absolute value.  */
7817 static void
7818 fabss (sim_cpu *cpu)
7819 {
7820   unsigned sn = INSTR (9, 5);
7821   unsigned sd = INSTR (4, 0);
7822   float value = aarch64_get_FP_float (cpu, sn);
7823 
7824   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7825   aarch64_set_FP_float (cpu, sd, fabsf (value));
7826 }
7827 
7828 /* Double absolute value.  */
7829 static void
fabsd (sim_cpu *cpu)
7831 {
7832   unsigned sn = INSTR (9, 5);
7833   unsigned sd = INSTR (4, 0);
7834   double value = aarch64_get_FP_double (cpu, sn);
7835 
7836   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7837   aarch64_set_FP_double (cpu, sd, fabs (value));
7838 }
7839 
7840 /* Float negative value.  */
7841 static void
7842 fnegs (sim_cpu *cpu)
7843 {
7844   unsigned sn = INSTR (9, 5);
7845   unsigned sd = INSTR (4, 0);
7846 
7847   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7848   aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sn));
7849 }
7850 
7851 /* Double negative value.  */
7852 static void
7853 fnegd (sim_cpu *cpu)
7854 {
7855   unsigned sn = INSTR (9, 5);
7856   unsigned sd = INSTR (4, 0);
7857 
7858   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7859   aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sn));
7860 }
7861 
7862 /* Float square root.  */
7863 static void
7864 fsqrts (sim_cpu *cpu)
7865 {
7866   unsigned sn = INSTR (9, 5);
7867   unsigned sd = INSTR (4, 0);
7868 
7869   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7870   aarch64_set_FP_float (cpu, sd, sqrtf (aarch64_get_FP_float (cpu, sn)));
7871 }
7872 
7873 /* Double square root.  */
7874 static void
7875 fsqrtd (sim_cpu *cpu)
7876 {
7877   unsigned sn = INSTR (9, 5);
7878   unsigned sd = INSTR (4, 0);
7879 
7880   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7881   aarch64_set_FP_double (cpu, sd,
7882 			 sqrt (aarch64_get_FP_double (cpu, sn)));
7883 }
7884 
7885 /* Convert double to float.  */
7886 static void
7887 fcvtds (sim_cpu *cpu)
7888 {
7889   unsigned sn = INSTR (9, 5);
7890   unsigned sd = INSTR (4, 0);
7891 
7892   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7893   aarch64_set_FP_float (cpu, sd, (float) aarch64_get_FP_double (cpu, sn));
7894 }
7895 
7896 /* Convert float to double.  */
7897 static void
fcvtsd (sim_cpu *cpu)
7899 {
7900   unsigned sn = INSTR (9, 5);
7901   unsigned sd = INSTR (4, 0);
7902 
7903   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7904   aarch64_set_FP_double (cpu, sd, (double) aarch64_get_FP_float (cpu, sn));
7905 }
7906 
7907 static void
7908 do_FRINT (sim_cpu *cpu)
7909 {
7910   /* instr[31,23] = 0001 1110 0
7911      instr[22]    = single(0)/double(1)
7912      instr[21,18] = 1001
7913      instr[17,15] = rounding mode
7914      instr[14,10] = 10000
7915      instr[9,5]   = source
7916      instr[4,0]   = dest  */
7917 
7918   float val;
7919   unsigned rs = INSTR (9, 5);
7920   unsigned rd = INSTR (4, 0);
7921   unsigned int rmode = INSTR (17, 15);
7922 
7923   NYI_assert (31, 23, 0x03C);
7924   NYI_assert (21, 18, 0x9);
7925   NYI_assert (14, 10, 0x10);
7926 
7927   if (rmode == 6 || rmode == 7)
7928     /* FIXME: Add support for rmode == 6 exactness check.  */
7929     rmode = uimm (aarch64_get_FPSR (cpu), 23, 22);
7930 
7931   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7932   if (INSTR (22, 22))
7933     {
7934       double val = aarch64_get_FP_double (cpu, rs);
7935 
7936       switch (rmode)
7937 	{
	case 0: /* mode N: nearest or even.  */
	  {
	    double rval = round (val);

	    /* round () breaks ties away from zero; FRINTN wants
	       ties-to-even, so step halfway cases to the even
	       neighbour.  */
	    if (fabs (val - rval) == 0.5 && fmod (rval, 2.0) != 0.0)
	      rval -= copysign (1.0, val);

	    aarch64_set_FP_double (cpu, rd, rval);
	    return;
	  }
7951 
	case 1: /* mode P: towards +inf.  */
	  aarch64_set_FP_double (cpu, rd, ceil (val));
	  return;

	case 2: /* mode M: towards -inf.  */
	  aarch64_set_FP_double (cpu, rd, floor (val));
	  return;
7965 
7966 	case 3: /* mode Z: towards 0.  */
7967 	  aarch64_set_FP_double (cpu, rd, trunc (val));
7968 	  return;
7969 
7970 	case 4: /* mode A: away from 0.  */
7971 	  aarch64_set_FP_double (cpu, rd, round (val));
7972 	  return;
7973 
7974 	case 6: /* mode X: use FPCR with exactness check.  */
7975 	case 7: /* mode I: use FPCR mode.  */
7976 	  HALT_NYI;
7977 
7978 	default:
7979 	  HALT_UNALLOC;
7980 	}
7981     }
7982 
7983   val = aarch64_get_FP_float (cpu, rs);
7984 
7985   switch (rmode)
7986     {
    case 0: /* mode N: nearest or even.  */
      {
	float rval = roundf (val);

	/* roundf () breaks ties away from zero; adjust halfway cases
	   to the even neighbour.  */
	if (fabsf (val - rval) == 0.5f && fmodf (rval, 2.0f) != 0.0f)
	  rval -= copysignf (1.0f, val);

	aarch64_set_FP_float (cpu, rd, rval);
	return;
      }
8000 
    case 1: /* mode P: towards +inf.  */
      aarch64_set_FP_float (cpu, rd, ceilf (val));
      return;

    case 2: /* mode M: towards -inf.  */
      aarch64_set_FP_float (cpu, rd, floorf (val));
      return;
8014 
8015     case 3: /* mode Z: towards 0.  */
8016       aarch64_set_FP_float (cpu, rd, truncf (val));
8017       return;
8018 
8019     case 4: /* mode A: away from 0.  */
8020       aarch64_set_FP_float (cpu, rd, roundf (val));
8021       return;
8022 
8023     case 6: /* mode X: use FPCR with exactness check.  */
8024     case 7: /* mode I: use FPCR mode.  */
8025       HALT_NYI;
8026 
8027     default:
8028       HALT_UNALLOC;
8029     }
8030 }
8031 
8032 /* Convert half to float.  */
8033 static void
8034 do_FCVT_half_to_single (sim_cpu *cpu)
8035 {
8036   unsigned rn = INSTR (9, 5);
8037   unsigned rd = INSTR (4, 0);
8038 
8039   NYI_assert (31, 10, 0x7B890);
8040 
8041   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8042   aarch64_set_FP_float (cpu, rd, (float) aarch64_get_FP_half  (cpu, rn));
8043 }
8044 
8045 /* Convert half to double.  */
8046 static void
8047 do_FCVT_half_to_double (sim_cpu *cpu)
8048 {
8049   unsigned rn = INSTR (9, 5);
8050   unsigned rd = INSTR (4, 0);
8051 
8052   NYI_assert (31, 10, 0x7B8B0);
8053 
8054   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8055   aarch64_set_FP_double (cpu, rd, (double) aarch64_get_FP_half  (cpu, rn));
8056 }
8057 
8058 static void
8059 do_FCVT_single_to_half (sim_cpu *cpu)
8060 {
8061   unsigned rn = INSTR (9, 5);
8062   unsigned rd = INSTR (4, 0);
8063 
8064   NYI_assert (31, 10, 0x788F0);
8065 
8066   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8067   aarch64_set_FP_half (cpu, rd, aarch64_get_FP_float  (cpu, rn));
8068 }
8069 
8070 /* Convert double to half.  */
8071 static void
8072 do_FCVT_double_to_half (sim_cpu *cpu)
8073 {
8074   unsigned rn = INSTR (9, 5);
8075   unsigned rd = INSTR (4, 0);
8076 
8077   NYI_assert (31, 10, 0x798F0);
8078 
8079   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8080   aarch64_set_FP_half (cpu, rd, (float) aarch64_get_FP_double  (cpu, rn));
8081 }
8082 
8083 static void
8084 dexSimpleFPDataProc1Source (sim_cpu *cpu)
8085 {
8086   /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
8087      instr[30]    = 0
8088      instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
8089      instr[28,25] = 1111
8090      instr[24]    = 0
8091      instr[23,22] ==> type : 00 ==> source is single,
8092                              01 ==> source is double
8093                              10 ==> UNALLOC
8094                              11 ==> UNALLOC or source is half
8095      instr[21]    = 1
8096      instr[20,15] ==> opcode : with type 00 or 01
8097                                000000 ==> FMOV, 000001 ==> FABS,
8098                                000010 ==> FNEG, 000011 ==> FSQRT,
                               000100 ==> FCVT (double to single),
                               000101 ==> FCVT (single to double),
                               000110 ==> UNALLOC, 000111 ==> FCVT (to half)
8101                                001000 ==> FRINTN, 001001 ==> FRINTP,
8102                                001010 ==> FRINTM, 001011 ==> FRINTZ,
8103                                001100 ==> FRINTA, 001101 ==> UNALLOC
8104                                001110 ==> FRINTX, 001111 ==> FRINTI
8105                                with type 11
8106                                000100 ==> FCVT (half-to-single)
8107                                000101 ==> FCVT (half-to-double)
     instr[14,10] = 10000.  */
8109 
8110   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
8111   uint32_t type   = INSTR (23, 22);
8112   uint32_t opcode = INSTR (20, 15);
8113 
8114   if (M_S != 0)
8115     HALT_UNALLOC;
8116 
8117   if (type == 3)
8118     {
8119       if (opcode == 4)
8120 	do_FCVT_half_to_single (cpu);
8121       else if (opcode == 5)
8122 	do_FCVT_half_to_double (cpu);
8123       else
8124 	HALT_UNALLOC;
8125       return;
8126     }
8127 
8128   if (type == 2)
8129     HALT_UNALLOC;
8130 
8131   switch (opcode)
8132     {
8133     case 0:
8134       if (type)
8135 	ffmovd (cpu);
8136       else
8137 	ffmovs (cpu);
8138       return;
8139 
8140     case 1:
8141       if (type)
	fabsd (cpu);
8143       else
8144 	fabss (cpu);
8145       return;
8146 
8147     case 2:
8148       if (type)
8149 	fnegd (cpu);
8150       else
8151 	fnegs (cpu);
8152       return;
8153 
8154     case 3:
8155       if (type)
8156 	fsqrtd (cpu);
8157       else
8158 	fsqrts (cpu);
8159       return;
8160 
8161     case 4:
8162       if (type)
8163 	fcvtds (cpu);
8164       else
8165 	HALT_UNALLOC;
8166       return;
8167 
8168     case 5:
8169       if (type)
8170 	HALT_UNALLOC;
      fcvtsd (cpu);
8172       return;
8173 
8174     case 8:		/* FRINTN etc.  */
8175     case 9:
8176     case 10:
8177     case 11:
8178     case 12:
8179     case 14:
8180     case 15:
8181        do_FRINT (cpu);
8182        return;
8183 
8184     case 7:
8185       if (INSTR (22, 22))
8186 	do_FCVT_double_to_half (cpu);
8187       else
8188 	do_FCVT_single_to_half (cpu);
8189       return;
8190 
8191     case 13:
8192       HALT_NYI;
8193 
8194     default:
8195       HALT_UNALLOC;
8196     }
8197 }
8198 
8199 /* 32 bit signed int to float.  */
8200 static void
8201 scvtf32 (sim_cpu *cpu)
8202 {
8203   unsigned rn = INSTR (9, 5);
8204   unsigned sd = INSTR (4, 0);
8205 
8206   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8207   aarch64_set_FP_float
8208     (cpu, sd, (float) aarch64_get_reg_s32 (cpu, rn, NO_SP));
8209 }
8210 
8211 /* signed int to float.  */
8212 static void
8213 scvtf (sim_cpu *cpu)
8214 {
8215   unsigned rn = INSTR (9, 5);
8216   unsigned sd = INSTR (4, 0);
8217 
8218   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8219   aarch64_set_FP_float
8220     (cpu, sd, (float) aarch64_get_reg_s64 (cpu, rn, NO_SP));
8221 }
8222 
8223 /* 32 bit signed int to double.  */
8224 static void
8225 scvtd32 (sim_cpu *cpu)
8226 {
8227   unsigned rn = INSTR (9, 5);
8228   unsigned sd = INSTR (4, 0);
8229 
8230   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8231   aarch64_set_FP_double
8232     (cpu, sd, (double) aarch64_get_reg_s32 (cpu, rn, NO_SP));
8233 }
8234 
8235 /* signed int to double.  */
8236 static void
8237 scvtd (sim_cpu *cpu)
8238 {
8239   unsigned rn = INSTR (9, 5);
8240   unsigned sd = INSTR (4, 0);
8241 
8242   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8243   aarch64_set_FP_double
8244     (cpu, sd, (double) aarch64_get_reg_s64 (cpu, rn, NO_SP));
8245 }
8246 
8247 static const float  FLOAT_INT_MAX   = (float)  INT_MAX;
8248 static const float  FLOAT_INT_MIN   = (float)  INT_MIN;
8249 static const double DOUBLE_INT_MAX  = (double) INT_MAX;
8250 static const double DOUBLE_INT_MIN  = (double) INT_MIN;
8251 static const float  FLOAT_LONG_MAX  = (float)  LONG_MAX;
8252 static const float  FLOAT_LONG_MIN  = (float)  LONG_MIN;
8253 static const double DOUBLE_LONG_MAX = (double) LONG_MAX;
8254 static const double DOUBLE_LONG_MIN = (double) LONG_MIN;
8255 
8256 #define UINT_MIN 0
8257 #define ULONG_MIN 0
8258 static const float  FLOAT_UINT_MAX   = (float)  UINT_MAX;
8259 static const float  FLOAT_UINT_MIN   = (float)  UINT_MIN;
8260 static const double DOUBLE_UINT_MAX  = (double) UINT_MAX;
8261 static const double DOUBLE_UINT_MIN  = (double) UINT_MIN;
8262 static const float  FLOAT_ULONG_MAX  = (float)  ULONG_MAX;
8263 static const float  FLOAT_ULONG_MIN  = (float)  ULONG_MIN;
8264 static const double DOUBLE_ULONG_MAX = (double) ULONG_MAX;
8265 static const double DOUBLE_ULONG_MIN = (double) ULONG_MIN;
8266 
8267 /* Check for FP exception conditions:
8268      NaN raises IO
8269      Infinity raises IO
8270      Out of Range raises IO and IX and saturates value
8271      Denormal raises ID and IX and sets to zero.  */
8272 #define RAISE_EXCEPTIONS(F, VALUE, FTYPE, ITYPE)	\
8273   do							\
8274     {							\
8275       switch (fpclassify (F))				\
8276 	{						\
8277 	case FP_INFINITE:				\
8278 	case FP_NAN:					\
8279 	  aarch64_set_FPSR (cpu, IO);			\
	  if (signbit (F))				\
	    VALUE = ITYPE##_MIN;			\
	  else						\
	    VALUE = ITYPE##_MAX;			\
8284 	  break;					\
8285 							\
8286 	case FP_NORMAL:					\
8287 	  if (F >= FTYPE##_##ITYPE##_MAX)		\
8288 	    {						\
8289 	      aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX);	\
8290 	      VALUE = ITYPE##_MAX;			\
8291 	    }						\
8292 	  else if (F <= FTYPE##_##ITYPE##_MIN)		\
8293 	    {						\
8294 	      aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX);	\
8295 	      VALUE = ITYPE##_MIN;			\
8296 	    }						\
8297 	  break;					\
8298 							\
8299 	case FP_SUBNORMAL:				\
8300 	  aarch64_set_FPSR_bits (cpu, IO | IX | ID, IX | ID);	\
8301 	  VALUE = 0;					\
8302 	  break;					\
8303 							\
8304 	default:					\
8305 	case FP_ZERO:					\
8306 	  VALUE = 0;					\
8307 	  break;					\
8308 	}						\
8309     }							\
8310   while (0)
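
/* A worked example of the macro: converting an out-of-range float
   saturates the result and sets the IO and IX cumulative flags.

     float   f = 3.0e9f;               -- above FLOAT_INT_MAX
     int32_t value = (int32_t) f;      -- unspecified host result
     RAISE_EXCEPTIONS (f, value, FLOAT, INT);
     -- now value == INT_MAX and FPSR has IO and IX set.  */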
8311 
8312 /* 32 bit convert float to signed int truncate towards zero.  */
8313 static void
8314 fcvtszs32 (sim_cpu *cpu)
8315 {
8316   unsigned sn = INSTR (9, 5);
8317   unsigned rd = INSTR (4, 0);
  /* The C cast truncates, i.e. rounds toward zero, as required.  */
8319   float   f = aarch64_get_FP_float (cpu, sn);
8320   int32_t value = (int32_t) f;
8321 
8322   RAISE_EXCEPTIONS (f, value, FLOAT, INT);
8323 
8324   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8325   /* Avoid sign extension to 64 bit.  */
8326   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
8327 }
8328 
8329 /* 64 bit convert float to signed int truncate towards zero.  */
8330 static void
8331 fcvtszs (sim_cpu *cpu)
8332 {
8333   unsigned sn = INSTR (9, 5);
8334   unsigned rd = INSTR (4, 0);
8335   float f = aarch64_get_FP_float (cpu, sn);
8336   int64_t value = (int64_t) f;
8337 
8338   RAISE_EXCEPTIONS (f, value, FLOAT, LONG);
8339 
8340   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8341   aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
8342 }
8343 
8344 /* 32 bit convert double to signed int truncate towards zero.  */
8345 static void
8346 fcvtszd32 (sim_cpu *cpu)
8347 {
8348   unsigned sn = INSTR (9, 5);
8349   unsigned rd = INSTR (4, 0);
  /* The C cast truncates, i.e. rounds toward zero, as required.  */
8351   double   d = aarch64_get_FP_double (cpu, sn);
8352   int32_t  value = (int32_t) d;
8353 
8354   RAISE_EXCEPTIONS (d, value, DOUBLE, INT);
8355 
8356   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8357   /* Avoid sign extension to 64 bit.  */
8358   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
8359 }
8360 
8361 /* 64 bit convert double to signed int truncate towards zero.  */
8362 static void
8363 fcvtszd (sim_cpu *cpu)
8364 {
8365   unsigned sn = INSTR (9, 5);
8366   unsigned rd = INSTR (4, 0);
  /* The C cast truncates, i.e. rounds toward zero, as required.  */
8368   double  d = aarch64_get_FP_double (cpu, sn);
8369   int64_t value;
8370 
8371   value = (int64_t) d;
8372 
8373   RAISE_EXCEPTIONS (d, value, DOUBLE, LONG);
8374 
8375   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8376   aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
8377 }
8378 
8379 static void
8380 do_fcvtzu (sim_cpu *cpu)
8381 {
8382   /* instr[31]    = size: 32-bit (0), 64-bit (1)
8383      instr[30,23] = 00111100
     instr[21]    = 1 ==> integer, 0 ==> fixed-point (scaled by instr[15,10])
8385      instr[21]    = enable (0)/disable(1) precision
8386      instr[20,16] = 11001
8387      instr[15,10] = precision
8388      instr[9,5]   = Rs
8389      instr[4,0]   = Rd.  */
8390 
8391   unsigned rs = INSTR (9, 5);
8392   unsigned rd = INSTR (4, 0);
8393 
8394   NYI_assert (30, 23, 0x3C);
8395   NYI_assert (20, 16, 0x19);
8396 
8397   if (INSTR (21, 21) != 1)
8398     /* Convert to fixed point.  */
8399     HALT_NYI;
8400 
8401   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8402   if (INSTR (31, 31))
8403     {
8404       /* Convert to unsigned 64-bit integer.  */
8405       if (INSTR (22, 22))
8406 	{
8407 	  double  d = aarch64_get_FP_double (cpu, rs);
8408 	  uint64_t value = (uint64_t) d;
8409 
8410 	  /* Do not raise an exception if we have reached ULONG_MAX.  */
8411 	  if (value != (1ULL << 63))
8412 	    RAISE_EXCEPTIONS (d, value, DOUBLE, ULONG);
8413 
8414 	  aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8415 	}
8416       else
8417 	{
8418 	  float  f = aarch64_get_FP_float (cpu, rs);
8419 	  uint64_t value = (uint64_t) f;
8420 
8421 	  /* Do not raise an exception if we have reached ULONG_MAX.  */
8422 	  if (value != (1ULL << 63))
8423 	    RAISE_EXCEPTIONS (f, value, FLOAT, ULONG);
8424 
8425 	  aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8426 	}
8427     }
8428   else
8429     {
8430       uint32_t value;
8431 
8432       /* Convert to unsigned 32-bit integer.  */
8433       if (INSTR (22, 22))
8434 	{
8435 	  double  d = aarch64_get_FP_double (cpu, rs);
8436 
8437 	  value = (uint32_t) d;
8438 	  /* Do not raise an exception if we have reached UINT_MAX.  */
8439 	  if (value != (1UL << 31))
8440 	    RAISE_EXCEPTIONS (d, value, DOUBLE, UINT);
8441 	}
8442       else
8443 	{
8444 	  float  f = aarch64_get_FP_float (cpu, rs);
8445 
8446 	  value = (uint32_t) f;
8447 	  /* Do not raise an exception if we have reached UINT_MAX.  */
8448 	  if (value != (1UL << 31))
8449 	    RAISE_EXCEPTIONS (f, value, FLOAT, UINT);
8450 	}
8451 
8452       aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8453     }
8454 }
8455 
8456 static void
8457 do_UCVTF (sim_cpu *cpu)
8458 {
8459   /* instr[31]    = size: 32-bit (0), 64-bit (1)
8460      instr[30,23] = 001 1110 0
     instr[21]    = 1 ==> integer, 0 ==> fixed-point (scaled by instr[15,10])
8462      instr[21]    = enable (0)/disable(1) precision
8463      instr[20,16] = 0 0011
8464      instr[15,10] = precision
8465      instr[9,5]   = Rs
8466      instr[4,0]   = Rd.  */
8467 
8468   unsigned rs = INSTR (9, 5);
8469   unsigned rd = INSTR (4, 0);
8470 
8471   NYI_assert (30, 23, 0x3C);
8472   NYI_assert (20, 16, 0x03);
8473 
8474   if (INSTR (21, 21) != 1)
8475     HALT_NYI;
8476 
8477   /* FIXME: Add exception raising.  */
8478   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8479   if (INSTR (31, 31))
8480     {
8481       uint64_t value = aarch64_get_reg_u64 (cpu, rs, NO_SP);
8482 
8483       if (INSTR (22, 22))
8484 	aarch64_set_FP_double (cpu, rd, (double) value);
8485       else
8486 	aarch64_set_FP_float (cpu, rd, (float) value);
8487     }
8488   else
8489     {
8490       uint32_t value =  aarch64_get_reg_u32 (cpu, rs, NO_SP);
8491 
8492       if (INSTR (22, 22))
8493 	aarch64_set_FP_double (cpu, rd, (double) value);
8494       else
8495 	aarch64_set_FP_float (cpu, rd, (float) value);
8496     }
8497 }
8498 
8499 static void
8500 float_vector_move (sim_cpu *cpu)
8501 {
8502   /* instr[31,17] == 100 1111 0101 0111
8503      instr[16]    ==> direction 0=> to GR, 1=> from GR
     instr[15,10] => 00 0000, ow ==> UNALLOC
8505      instr[9,5]   ==> source
8506      instr[4,0]   ==> dest.  */
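
  /* This is the FMOV Vd.D[1] <--> Xn form: vector element 1 is the
     upper 64 bits of the 128-bit register.  */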
8507 
8508   unsigned rn = INSTR (9, 5);
8509   unsigned rd = INSTR (4, 0);
8510 
8511   NYI_assert (31, 17, 0x4F57);
8512 
8513   if (INSTR (15, 10) != 0)
8514     HALT_UNALLOC;
8515 
8516   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8517   if (INSTR (16, 16))
8518     aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_reg_u64 (cpu, rn, NO_SP));
8519   else
8520     aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, rn, 1));
8521 }
8522 
8523 static void
8524 dexSimpleFPIntegerConvert (sim_cpu *cpu)
8525 {
8526   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30]    = 0
8528      instr[29]    = S :  0 ==> OK, 1 ==> UNALLOC
8529      instr[28,25] = 1111
8530      instr[24]    = 0
8531      instr[23,22] = type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
8532      instr[21]    = 1
8533      instr[20,19] = rmode
8534      instr[18,16] = opcode
8535      instr[15,10] = 10 0000  */
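
  /* The combined rmode:opcode field (instr[20,16]) selects the
     operation, e.g. 00 010 ==> SCVTF, 00 011 ==> UCVTF,
     00 110 ==> FMOV to GR, 00 111 ==> FMOV from GR,
     11 000 ==> FCVTZS and 11 001 ==> FCVTZU.  */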
8536 
8537   uint32_t rmode_opcode;
8538   uint32_t size_type;
8539   uint32_t type;
8540   uint32_t size;
8541   uint32_t S;
8542 
8543   if (INSTR (31, 17) == 0x4F57)
8544     {
8545       float_vector_move (cpu);
8546       return;
8547     }
8548 
8549   size = INSTR (31, 31);
8550   S = INSTR (29, 29);
8551   if (S != 0)
8552     HALT_UNALLOC;
8553 
8554   type = INSTR (23, 22);
8555   if (type > 1)
8556     HALT_UNALLOC;
8557 
8558   rmode_opcode = INSTR (20, 16);
8559   size_type = (size << 1) | type; /* 0==32f, 1==32d, 2==64f, 3==64d.  */
8560 
8561   switch (rmode_opcode)
8562     {
8563     case 2:			/* SCVTF.  */
8564       switch (size_type)
8565 	{
8566 	case 0: scvtf32 (cpu); return;
8567 	case 1: scvtd32 (cpu); return;
8568 	case 2: scvtf (cpu); return;
8569 	case 3: scvtd (cpu); return;
8570 	}
8571 
8572     case 6:			/* FMOV GR, Vec.  */
8573       switch (size_type)
8574 	{
8575 	case 0:  gfmovs (cpu); return;
8576 	case 3:  gfmovd (cpu); return;
8577 	default: HALT_UNALLOC;
8578 	}
8579 
8580     case 7:			/* FMOV vec, GR.  */
8581       switch (size_type)
8582 	{
8583 	case 0:  fgmovs (cpu); return;
8584 	case 3:  fgmovd (cpu); return;
8585 	default: HALT_UNALLOC;
8586 	}
8587 
8588     case 24:			/* FCVTZS.  */
8589       switch (size_type)
8590 	{
8591 	case 0: fcvtszs32 (cpu); return;
8592 	case 1: fcvtszd32 (cpu); return;
8593 	case 2: fcvtszs (cpu); return;
8594 	case 3: fcvtszd (cpu); return;
8595 	}
8596 
8597     case 25: do_fcvtzu (cpu); return;
8598     case 3:  do_UCVTF (cpu); return;
8599 
8600     case 0:	/* FCVTNS.  */
8601     case 1:	/* FCVTNU.  */
8602     case 4:	/* FCVTAS.  */
8603     case 5:	/* FCVTAU.  */
    case 8:	/* FCVTPS.  */
8605     case 9:	/* FCVTPU.  */
8606     case 16:	/* FCVTMS.  */
8607     case 17:	/* FCVTMU.  */
8608     default:
8609       HALT_NYI;
8610     }
8611 }
8612 
8613 static void
8614 set_flags_for_float_compare (sim_cpu *cpu, float fvalue1, float fvalue2)
8615 {
8616   uint32_t flags;
8617 
8618   /* FIXME: Add exception raising.  */
8619   if (isnan (fvalue1) || isnan (fvalue2))
8620     flags = C|V;
8621   else if (isinf (fvalue1) && isinf (fvalue2))
8622     {
      /* Subtracting two infinities may give a NaN.  We only need to
	 compare the signs, which we can get from isinf (this relies on
	 the GNU behaviour of isinf returning +1 or -1 by sign).  */
8625       int result = isinf (fvalue1) - isinf (fvalue2);
8626 
8627       if (result == 0)
8628 	flags = Z|C;
8629       else if (result < 0)
8630 	flags = N;
8631       else /* (result > 0).  */
8632 	flags = C;
8633     }
8634   else
8635     {
8636       float result = fvalue1 - fvalue2;
8637 
8638       if (result == 0.0)
8639 	flags = Z|C;
8640       else if (result < 0)
8641 	flags = N;
8642       else /* (result > 0).  */
8643 	flags = C;
8644     }
8645 
8646   aarch64_set_CPSR (cpu, flags);
8647 }
8648 
8649 static void
8650 fcmps (sim_cpu *cpu)
8651 {
8652   unsigned sm = INSTR (20, 16);
8653   unsigned sn = INSTR ( 9,  5);
8654 
8655   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8656   float fvalue2 = aarch64_get_FP_float (cpu, sm);
8657 
8658   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8659   set_flags_for_float_compare (cpu, fvalue1, fvalue2);
8660 }
8661 
8662 /* Float compare to zero -- Invalid Operation exception
8663    only on signaling NaNs.  */
8664 static void
8665 fcmpzs (sim_cpu *cpu)
8666 {
8667   unsigned sn = INSTR ( 9,  5);
8668   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8669 
8670   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8671   set_flags_for_float_compare (cpu, fvalue1, 0.0f);
8672 }
8673 
8674 /* Float compare -- Invalid Operation exception on all NaNs.  */
8675 static void
8676 fcmpes (sim_cpu *cpu)
8677 {
8678   unsigned sm = INSTR (20, 16);
8679   unsigned sn = INSTR ( 9,  5);
8680 
8681   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8682   float fvalue2 = aarch64_get_FP_float (cpu, sm);
8683 
8684   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8685   set_flags_for_float_compare (cpu, fvalue1, fvalue2);
8686 }
8687 
8688 /* Float compare to zero -- Invalid Operation exception on all NaNs.  */
8689 static void
8690 fcmpzes (sim_cpu *cpu)
8691 {
8692   unsigned sn = INSTR ( 9,  5);
8693   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8694 
8695   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8696   set_flags_for_float_compare (cpu, fvalue1, 0.0f);
8697 }
8698 
8699 static void
8700 set_flags_for_double_compare (sim_cpu *cpu, double dval1, double dval2)
8701 {
8702   uint32_t flags;
8703 
8704   /* FIXME: Add exception raising.  */
8705   if (isnan (dval1) || isnan (dval2))
8706     flags = C|V;
8707   else if (isinf (dval1) && isinf (dval2))
8708     {
      /* Subtracting two infinities may give a NaN.  We only need to
	 compare the signs, which we can get from isinf (this relies on
	 the GNU behaviour of isinf returning +1 or -1 by sign).  */
8711       int result = isinf (dval1) - isinf (dval2);
8712 
8713       if (result == 0)
8714 	flags = Z|C;
8715       else if (result < 0)
8716 	flags = N;
8717       else /* (result > 0).  */
8718 	flags = C;
8719     }
8720   else
8721     {
8722       double result = dval1 - dval2;
8723 
8724       if (result == 0.0)
8725 	flags = Z|C;
8726       else if (result < 0)
8727 	flags = N;
8728       else /* (result > 0).  */
8729 	flags = C;
8730     }
8731 
8732   aarch64_set_CPSR (cpu, flags);
8733 }
8734 
8735 /* Double compare -- Invalid Operation exception only on signaling NaNs.  */
8736 static void
8737 fcmpd (sim_cpu *cpu)
8738 {
8739   unsigned sm = INSTR (20, 16);
8740   unsigned sn = INSTR ( 9,  5);
8741 
8742   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8743   double dvalue2 = aarch64_get_FP_double (cpu, sm);
8744 
8745   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8746   set_flags_for_double_compare (cpu, dvalue1, dvalue2);
8747 }
8748 
8749 /* Double compare to zero -- Invalid Operation exception
8750    only on signaling NaNs.  */
8751 static void
8752 fcmpzd (sim_cpu *cpu)
8753 {
8754   unsigned sn = INSTR ( 9,  5);
8755   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8756 
8757   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8758   set_flags_for_double_compare (cpu, dvalue1, 0.0);
8759 }
8760 
8761 /* Double compare -- Invalid Operation exception on all NaNs.  */
8762 static void
8763 fcmped (sim_cpu *cpu)
8764 {
8765   unsigned sm = INSTR (20, 16);
8766   unsigned sn = INSTR ( 9,  5);
8767 
8768   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8769   double dvalue2 = aarch64_get_FP_double (cpu, sm);
8770 
8771   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8772   set_flags_for_double_compare (cpu, dvalue1, dvalue2);
8773 }
8774 
8775 /* Double compare to zero -- Invalid Operation exception on all NaNs.  */
8776 static void
8777 fcmpzed (sim_cpu *cpu)
8778 {
8779   unsigned sn = INSTR ( 9,  5);
8780   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8781 
8782   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8783   set_flags_for_double_compare (cpu, dvalue1, 0.0);
8784 }
8785 
8786 static void
8787 dexSimpleFPCompare (sim_cpu *cpu)
8788 {
8789   /* assert instr[28,25] == 1111
     instr[30] = 0, instr[24] = 0, instr[21] = 1, instr[13,10] = 1000
8791      instr[31] = M : 0 ==> OK, 1 ==> UNALLOC
8792      instr[29] ==> S :  0 ==> OK, 1 ==> UNALLOC
     instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
8794      instr[15,14] ==> op : 00 ==> OK, ow ==> UNALLOC
8795      instr[4,0] ==> opcode2 : 00000 ==> FCMP, 10000 ==> FCMPE,
8796                               01000 ==> FCMPZ, 11000 ==> FCMPEZ,
8797                               ow ==> UNALLOC  */
8798   uint32_t dispatch;
8799   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
8800   uint32_t type = INSTR (23, 22);
8801   uint32_t op = INSTR (15, 14);
8802   uint32_t op2_2_0 = INSTR (2, 0);
8803 
8804   if (op2_2_0 != 0)
8805     HALT_UNALLOC;
8806 
8807   if (M_S != 0)
8808     HALT_UNALLOC;
8809 
8810   if (type > 1)
8811     HALT_UNALLOC;
8812 
8813   if (op != 0)
8814     HALT_UNALLOC;
8815 
8816   /* dispatch on type and top 2 bits of opcode.  */
8817   dispatch = (type << 2) | INSTR (4, 3);
8818 
8819   switch (dispatch)
8820     {
8821     case 0: fcmps (cpu); return;
8822     case 1: fcmpzs (cpu); return;
8823     case 2: fcmpes (cpu); return;
8824     case 3: fcmpzes (cpu); return;
8825     case 4: fcmpd (cpu); return;
8826     case 5: fcmpzd (cpu); return;
8827     case 6: fcmped (cpu); return;
8828     case 7: fcmpzed (cpu); return;
8829     }
8830 }
8831 
8832 static void
8833 do_scalar_FADDP (sim_cpu *cpu)
8834 {
8835   /* instr [31,23] = 0111 1110 0
8836      instr [22]    = single(0)/double(1)
8837      instr [21,10] = 11 0000 1101 10
8838      instr [9,5]   = Fn
8839      instr [4,0]   = Fd.  */
8840 
8841   unsigned Fn = INSTR (9, 5);
8842   unsigned Fd = INSTR (4, 0);
8843 
8844   NYI_assert (31, 23, 0x0FC);
8845   NYI_assert (21, 10, 0xC36);
8846 
8847   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8848   if (INSTR (22, 22))
8849     {
8850       double val1 = aarch64_get_vec_double (cpu, Fn, 0);
8851       double val2 = aarch64_get_vec_double (cpu, Fn, 1);
8852 
8853       aarch64_set_FP_double (cpu, Fd, val1 + val2);
8854     }
8855   else
8856     {
8857       float val1 = aarch64_get_vec_float (cpu, Fn, 0);
8858       float val2 = aarch64_get_vec_float (cpu, Fn, 1);
8859 
8860       aarch64_set_FP_float (cpu, Fd, val1 + val2);
8861     }
8862 }
8863 
8864 /* Floating point absolute difference.  */
8865 
8866 static void
8867 do_scalar_FABD (sim_cpu *cpu)
8868 {
8869   /* instr [31,23] = 0111 1110 1
8870      instr [22]    = float(0)/double(1)
8871      instr [21]    = 1
8872      instr [20,16] = Rm
8873      instr [15,10] = 1101 01
8874      instr [9, 5]  = Rn
8875      instr [4, 0]  = Rd.  */
8876 
8877   unsigned rm = INSTR (20, 16);
8878   unsigned rn = INSTR (9, 5);
8879   unsigned rd = INSTR (4, 0);
8880 
8881   NYI_assert (31, 23, 0x0FD);
8882   NYI_assert (21, 21, 1);
8883   NYI_assert (15, 10, 0x35);
8884 
8885   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8886   if (INSTR (22, 22))
8887     aarch64_set_FP_double (cpu, rd,
8888 			   fabs (aarch64_get_FP_double (cpu, rn)
8889 				 - aarch64_get_FP_double (cpu, rm)));
8890   else
8891     aarch64_set_FP_float (cpu, rd,
8892 			  fabsf (aarch64_get_FP_float (cpu, rn)
8893 				 - aarch64_get_FP_float (cpu, rm)));
8894 }
8895 
8896 static void
8897 do_scalar_CMGT (sim_cpu *cpu)
8898 {
8899   /* instr [31,21] = 0101 1110 111
8900      instr [20,16] = Rm
8901      instr [15,10] = 00 1101
8902      instr [9, 5]  = Rn
8903      instr [4, 0]  = Rd.  */
8904 
8905   unsigned rm = INSTR (20, 16);
8906   unsigned rn = INSTR (9, 5);
8907   unsigned rd = INSTR (4, 0);
8908 
8909   NYI_assert (31, 21, 0x2F7);
8910   NYI_assert (15, 10, 0x0D);
8911 
8912   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* CMGT is a signed comparison.  */
  aarch64_set_vec_u64 (cpu, rd, 0,
		       aarch64_get_vec_s64 (cpu, rn, 0) >
		       aarch64_get_vec_s64 (cpu, rm, 0) ? -1L : 0L);
8916 }
8917 
8918 static void
8919 do_scalar_USHR (sim_cpu *cpu)
8920 {
8921   /* instr [31,23] = 0111 1111 0
8922      instr [22,16] = shift amount
8923      instr [15,10] = 0000 01
8924      instr [9, 5]  = Rn
8925      instr [4, 0]  = Rd.  */
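
  /* For the 64-bit scalar form the immh:immb field (bits [22,16])
     encodes the shift as 128 - immh:immb, so e.g. 0x7f means a right
     shift by one.  */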
8926 
8927   unsigned amount = 128 - INSTR (22, 16);
8928   unsigned rn = INSTR (9, 5);
8929   unsigned rd = INSTR (4, 0);
8930 
8931   NYI_assert (31, 23, 0x0FE);
8932   NYI_assert (15, 10, 0x01);
8933 
8934   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8935   aarch64_set_vec_u64 (cpu, rd, 0,
8936 		       aarch64_get_vec_u64 (cpu, rn, 0) >> amount);
8937 }
8938 
8939 static void
8940 do_scalar_SSHL (sim_cpu *cpu)
8941 {
8942   /* instr [31,21] = 0101 1110 111
8943      instr [20,16] = Rm
8944      instr [15,10] = 0100 01
8945      instr [9, 5]  = Rn
8946      instr [4, 0]  = Rd.  */
8947 
8948   unsigned rm = INSTR (20, 16);
8949   unsigned rn = INSTR (9, 5);
8950   unsigned rd = INSTR (4, 0);
8951   signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);
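
  /* SSHL takes its shift count from the bottom byte of Rm; a negative
     count shifts right instead of left.  */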
8952 
8953   NYI_assert (31, 21, 0x2F7);
8954   NYI_assert (15, 10, 0x11);
8955 
8956   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8957   if (shift >= 0)
8958     aarch64_set_vec_s64 (cpu, rd, 0,
8959 			 aarch64_get_vec_s64 (cpu, rn, 0) << shift);
8960   else
8961     aarch64_set_vec_s64 (cpu, rd, 0,
8962 			 aarch64_get_vec_s64 (cpu, rn, 0) >> - shift);
8963 }
8964 
8965 /* Floating point scalar compare greater than or equal to 0.  */
8966 static void
8967 do_scalar_FCMGE_zero (sim_cpu *cpu)
8968 {
8969   /* instr [31,23] = 0111 1110 1
8970      instr [22,22] = size
8971      instr [21,16] = 1000 00
8972      instr [15,10] = 1100 10
8973      instr [9, 5]  = Rn
8974      instr [4, 0]  = Rd.  */
8975 
8976   unsigned size = INSTR (22, 22);
8977   unsigned rn = INSTR (9, 5);
8978   unsigned rd = INSTR (4, 0);
8979 
8980   NYI_assert (31, 23, 0x0FD);
8981   NYI_assert (21, 16, 0x20);
8982   NYI_assert (15, 10, 0x32);
8983 
8984   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8985   if (size)
8986     aarch64_set_vec_u64 (cpu, rd, 0,
8987 			 aarch64_get_vec_double (cpu, rn, 0) >= 0.0 ? -1 : 0);
8988   else
8989     aarch64_set_vec_u32 (cpu, rd, 0,
8990 			 aarch64_get_vec_float (cpu, rn, 0) >= 0.0 ? -1 : 0);
8991 }
8992 
8993 /* Floating point scalar compare less than or equal to 0.  */
8994 static void
8995 do_scalar_FCMLE_zero (sim_cpu *cpu)
8996 {
8997   /* instr [31,23] = 0111 1110 1
8998      instr [22,22] = size
8999      instr [21,16] = 1000 00
9000      instr [15,10] = 1101 10
9001      instr [9, 5]  = Rn
9002      instr [4, 0]  = Rd.  */
9003 
9004   unsigned size = INSTR (22, 22);
9005   unsigned rn = INSTR (9, 5);
9006   unsigned rd = INSTR (4, 0);
9007 
9008   NYI_assert (31, 23, 0x0FD);
9009   NYI_assert (21, 16, 0x20);
9010   NYI_assert (15, 10, 0x36);
9011 
9012   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9013   if (size)
9014     aarch64_set_vec_u64 (cpu, rd, 0,
9015 			 aarch64_get_vec_double (cpu, rn, 0) <= 0.0 ? -1 : 0);
9016   else
9017     aarch64_set_vec_u32 (cpu, rd, 0,
9018 			 aarch64_get_vec_float (cpu, rn, 0) <= 0.0 ? -1 : 0);
9019 }
9020 
9021 /* Floating point scalar compare greater than 0.  */
9022 static void
9023 do_scalar_FCMGT_zero (sim_cpu *cpu)
9024 {
9025   /* instr [31,23] = 0101 1110 1
9026      instr [22,22] = size
9027      instr [21,16] = 1000 00
9028      instr [15,10] = 1100 10
9029      instr [9, 5]  = Rn
9030      instr [4, 0]  = Rd.  */
9031 
9032   unsigned size = INSTR (22, 22);
9033   unsigned rn = INSTR (9, 5);
9034   unsigned rd = INSTR (4, 0);
9035 
9036   NYI_assert (31, 23, 0x0BD);
9037   NYI_assert (21, 16, 0x20);
9038   NYI_assert (15, 10, 0x32);
9039 
9040   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9041   if (size)
9042     aarch64_set_vec_u64 (cpu, rd, 0,
9043 			 aarch64_get_vec_double (cpu, rn, 0) > 0.0 ? -1 : 0);
9044   else
9045     aarch64_set_vec_u32 (cpu, rd, 0,
9046 			 aarch64_get_vec_float (cpu, rn, 0) > 0.0 ? -1 : 0);
9047 }
9048 
9049 /* Floating point scalar compare equal to 0.  */
9050 static void
9051 do_scalar_FCMEQ_zero (sim_cpu *cpu)
9052 {
9053   /* instr [31,23] = 0101 1110 1
9054      instr [22,22] = size
9055      instr [21,16] = 1000 00
9056      instr [15,10] = 1101 10
9057      instr [9, 5]  = Rn
9058      instr [4, 0]  = Rd.  */
9059 
9060   unsigned size = INSTR (22, 22);
9061   unsigned rn = INSTR (9, 5);
9062   unsigned rd = INSTR (4, 0);
9063 
9064   NYI_assert (31, 23, 0x0BD);
9065   NYI_assert (21, 16, 0x20);
9066   NYI_assert (15, 10, 0x36);
9067 
9068   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9069   if (size)
9070     aarch64_set_vec_u64 (cpu, rd, 0,
9071 			 aarch64_get_vec_double (cpu, rn, 0) == 0.0 ? -1 : 0);
9072   else
9073     aarch64_set_vec_u32 (cpu, rd, 0,
9074 			 aarch64_get_vec_float (cpu, rn, 0) == 0.0 ? -1 : 0);
9075 }
9076 
9077 /* Floating point scalar compare less than 0.  */
9078 static void
9079 do_scalar_FCMLT_zero (sim_cpu *cpu)
9080 {
9081   /* instr [31,23] = 0101 1110 1
9082      instr [22,22] = size
9083      instr [21,16] = 1000 00
9084      instr [15,10] = 1110 10
9085      instr [9, 5]  = Rn
9086      instr [4, 0]  = Rd.  */
9087 
9088   unsigned size = INSTR (22, 22);
9089   unsigned rn = INSTR (9, 5);
9090   unsigned rd = INSTR (4, 0);
9091 
9092   NYI_assert (31, 23, 0x0BD);
9093   NYI_assert (21, 16, 0x20);
9094   NYI_assert (15, 10, 0x3A);
9095 
9096   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9097   if (size)
9098     aarch64_set_vec_u64 (cpu, rd, 0,
9099 			 aarch64_get_vec_double (cpu, rn, 0) < 0.0 ? -1 : 0);
9100   else
9101     aarch64_set_vec_u32 (cpu, rd, 0,
9102 			 aarch64_get_vec_float (cpu, rn, 0) < 0.0 ? -1 : 0);
9103 }
9104 
9105 static void
9106 do_scalar_shift (sim_cpu *cpu)
9107 {
9108   /* instr [31,23] = 0101 1111 0
9109      instr [22,16] = shift amount
9110      instr [15,10] = 0101 01   [SHL]
9111      instr [15,10] = 0000 01   [SSHR]
9112      instr [9, 5]  = Rn
9113      instr [4, 0]  = Rd.  */
9114 
9115   unsigned rn = INSTR (9, 5);
9116   unsigned rd = INSTR (4, 0);
9117   unsigned amount;
9118 
9119   NYI_assert (31, 23, 0x0BE);
9120 
9121   if (INSTR (22, 22) == 0)
9122     HALT_UNALLOC;
9123 
9124   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9125   switch (INSTR (15, 10))
9126     {
9127     case 0x01: /* SSHR */
9128       amount = 128 - INSTR (22, 16);
9129       aarch64_set_vec_s64 (cpu, rd, 0,
9130 			   aarch64_get_vec_s64 (cpu, rn, 0) >> amount);
9131       return;
9132     case 0x15: /* SHL */
9133       amount = INSTR (22, 16) - 64;
9134       aarch64_set_vec_u64 (cpu, rd, 0,
9135 			   aarch64_get_vec_u64 (cpu, rn, 0) << amount);
9136       return;
9137     default:
9138       HALT_NYI;
9139     }
9140 }
9141 
9142 /* FCMEQ FCMGT FCMGE.  */
9143 static void
9144 do_scalar_FCM (sim_cpu *cpu)
9145 {
9146   /* instr [31,30] = 01
9147      instr [29]    = U
9148      instr [28,24] = 1 1110
9149      instr [23]    = E
9150      instr [22]    = size
9151      instr [21]    = 1
9152      instr [20,16] = Rm
9153      instr [15,12] = 1110
9154      instr [11]    = AC
9155      instr [10]    = 1
9156      instr [9, 5]  = Rn
9157      instr [4, 0]  = Rd.  */
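
  /* The E:U:AC bits select the comparison: 000 ==> FCMEQ,
     010 ==> FCMGE, 011 ==> FACGE (on absolute values),
     110 ==> FCMGT, 111 ==> FACGT.  */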
9158 
9159   unsigned rm = INSTR (20, 16);
9160   unsigned rn = INSTR (9, 5);
9161   unsigned rd = INSTR (4, 0);
9162   unsigned EUac = (INSTR (23, 23) << 2) | (INSTR (29, 29) << 1) | INSTR (11, 11);
9163   unsigned result;
9164   float val1;
9165   float val2;
9166 
9167   NYI_assert (31, 30, 1);
9168   NYI_assert (28, 24, 0x1E);
9169   NYI_assert (21, 21, 1);
9170   NYI_assert (15, 12, 0xE);
9171   NYI_assert (10, 10, 1);
9172 
9173   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9174   if (INSTR (22, 22))
9175     {
9176       double val1 = aarch64_get_FP_double (cpu, rn);
9177       double val2 = aarch64_get_FP_double (cpu, rm);
9178 
9179       switch (EUac)
9180 	{
9181 	case 0: /* 000 */
9182 	  result = val1 == val2;
9183 	  break;
9184 
9185 	case 3: /* 011 */
9186 	  val1 = fabs (val1);
9187 	  val2 = fabs (val2);
9188 	  /* Fall through. */
9189 	case 2: /* 010 */
9190 	  result = val1 >= val2;
9191 	  break;
9192 
9193 	case 7: /* 111 */
9194 	  val1 = fabs (val1);
9195 	  val2 = fabs (val2);
9196 	  /* Fall through. */
9197 	case 6: /* 110 */
9198 	  result = val1 > val2;
9199 	  break;
9200 
9201 	default:
9202 	  HALT_UNALLOC;
9203 	}
9204 
      /* The double form writes a 64-bit mask.  */
      aarch64_set_vec_u64 (cpu, rd, 0, result ? -1ULL : 0ULL);
9206       return;
9207     }
9208 
9209   val1 = aarch64_get_FP_float (cpu, rn);
9210   val2 = aarch64_get_FP_float (cpu, rm);
9211 
9212   switch (EUac)
9213     {
9214     case 0: /* 000 */
9215       result = val1 == val2;
9216       break;
9217 
9218     case 3: /* 011 */
9219       val1 = fabsf (val1);
9220       val2 = fabsf (val2);
9221       /* Fall through. */
9222     case 2: /* 010 */
9223       result = val1 >= val2;
9224       break;
9225 
9226     case 7: /* 111 */
9227       val1 = fabsf (val1);
9228       val2 = fabsf (val2);
9229       /* Fall through. */
9230     case 6: /* 110 */
9231       result = val1 > val2;
9232       break;
9233 
9234     default:
9235       HALT_UNALLOC;
9236     }
9237 
9238   aarch64_set_vec_u32 (cpu, rd, 0, result ? -1 : 0);
9239 }
9240 
9241 /* An alias of DUP.  */
9242 static void
9243 do_scalar_MOV (sim_cpu *cpu)
9244 {
9245   /* instr [31,21] = 0101 1110 000
9246      instr [20,16] = imm5
9247      instr [15,10] = 0000 01
9248      instr [9, 5]  = Rn
9249      instr [4, 0]  = Rd.  */
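
  /* The lowest set bit of imm5 selects the element size (xxxx1 ==> byte,
     xxx10 ==> half, xx100 ==> word, x1000 ==> doubleword) and the bits
     above it give the source element index.  */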
9250 
9251   unsigned rn = INSTR (9, 5);
9252   unsigned rd = INSTR (4, 0);
9253   unsigned index;
9254 
9255   NYI_assert (31, 21, 0x2F0);
9256   NYI_assert (15, 10, 0x01);
9257 
9258   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9259   if (INSTR (16, 16))
9260     {
9261       /* 8-bit.  */
9262       index = INSTR (20, 17);
9263       aarch64_set_vec_u8
9264 	(cpu, rd, 0, aarch64_get_vec_u8 (cpu, rn, index));
9265     }
9266   else if (INSTR (17, 17))
9267     {
9268       /* 16-bit.  */
9269       index = INSTR (20, 18);
9270       aarch64_set_vec_u16
9271 	(cpu, rd, 0, aarch64_get_vec_u16 (cpu, rn, index));
9272     }
9273   else if (INSTR (18, 18))
9274     {
9275       /* 32-bit.  */
9276       index = INSTR (20, 19);
9277       aarch64_set_vec_u32
9278 	(cpu, rd, 0, aarch64_get_vec_u32 (cpu, rn, index));
9279     }
9280   else if (INSTR (19, 19))
9281     {
9282       /* 64-bit.  */
9283       index = INSTR (20, 20);
9284       aarch64_set_vec_u64
9285 	(cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, index));
9286     }
9287   else
9288     HALT_UNALLOC;
9289 }
9290 
9291 static void
9292 do_scalar_NEG (sim_cpu *cpu)
9293 {
9294   /* instr [31,10] = 0111 1110 1110 0000 1011 10
9295      instr [9, 5]  = Rn
9296      instr [4, 0]  = Rd.  */
9297 
9298   unsigned rn = INSTR (9, 5);
9299   unsigned rd = INSTR (4, 0);
9300 
9301   NYI_assert (31, 10, 0x1FB82E);
9302 
9303   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9304   aarch64_set_vec_u64 (cpu, rd, 0, - aarch64_get_vec_u64 (cpu, rn, 0));
9305 }
9306 
9307 static void
9308 do_scalar_USHL (sim_cpu *cpu)
9309 {
9310   /* instr [31,21] = 0111 1110 111
9311      instr [20,16] = Rm
9312      instr [15,10] = 0100 01
9313      instr [9, 5]  = Rn
9314      instr [4, 0]  = Rd.  */
9315 
9316   unsigned rm = INSTR (20, 16);
9317   unsigned rn = INSTR (9, 5);
9318   unsigned rd = INSTR (4, 0);
9319   signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);
9320 
9321   NYI_assert (31, 21, 0x3F7);
9322   NYI_assert (15, 10, 0x11);
9323 
9324   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9325   if (shift >= 0)
9326     aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) << shift);
9327   else
9328     aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) >> - shift);
9329 }
9330 
9331 static void
9332 do_double_add (sim_cpu *cpu)
9333 {
9334   /* instr [31,21] = 0101 1110 111
9335      instr [20,16] = Fn
9336      instr [15,10] = 1000 01
9337      instr [9,5]   = Fm
9338      instr [4,0]   = Fd.  */
9339   unsigned Fd;
9340   unsigned Fm;
9341   unsigned Fn;
9342   double val1;
9343   double val2;
9344 
9345   NYI_assert (31, 21, 0x2F7);
9346   NYI_assert (15, 10, 0x21);
9347 
9348   Fd = INSTR (4, 0);
9349   Fm = INSTR (9, 5);
9350   Fn = INSTR (20, 16);
9351 
9352   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9353   val1 = aarch64_get_FP_double (cpu, Fm);
9354   val2 = aarch64_get_FP_double (cpu, Fn);
9355 
9356   aarch64_set_FP_double (cpu, Fd, val1 + val2);
9357 }
9358 
9359 static void
9360 do_scalar_UCVTF (sim_cpu *cpu)
9361 {
9362   /* instr [31,23] = 0111 1110 0
9363      instr [22]    = single(0)/double(1)
9364      instr [21,10] = 10 0001 1101 10
9365      instr [9,5]   = rn
9366      instr [4,0]   = rd.  */
9367 
9368   unsigned rn = INSTR (9, 5);
9369   unsigned rd = INSTR (4, 0);
9370 
9371   NYI_assert (31, 23, 0x0FC);
9372   NYI_assert (21, 10, 0x876);
9373 
9374   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9375   if (INSTR (22, 22))
9376     {
9377       uint64_t val = aarch64_get_vec_u64 (cpu, rn, 0);
9378 
9379       aarch64_set_vec_double (cpu, rd, 0, (double) val);
9380     }
9381   else
9382     {
9383       uint32_t val = aarch64_get_vec_u32 (cpu, rn, 0);
9384 
9385       aarch64_set_vec_float (cpu, rd, 0, (float) val);
9386     }
9387 }
9388 
9389 static void
9390 do_scalar_vec (sim_cpu *cpu)
9391 {
9392   /* instr [30] = 1.  */
9393   /* instr [28,25] = 1111.  */
9394   switch (INSTR (31, 23))
9395     {
9396     case 0xBC:
9397       switch (INSTR (15, 10))
9398 	{
9399 	case 0x01: do_scalar_MOV (cpu); return;
9400 	case 0x39: do_scalar_FCM (cpu); return;
9401 	case 0x3B: do_scalar_FCM (cpu); return;
9402 	}
9403       break;
9404 
9405     case 0xBE: do_scalar_shift (cpu); return;
9406 
9407     case 0xFC:
9408       switch (INSTR (15, 10))
9409 	{
9410 	case 0x36:
9411 	  switch (INSTR (21, 16))
9412 	    {
9413 	    case 0x30: do_scalar_FADDP (cpu); return;
9414 	    case 0x21: do_scalar_UCVTF (cpu); return;
9415 	    }
9416 	  HALT_NYI;
9417 	case 0x39: do_scalar_FCM (cpu); return;
9418 	case 0x3B: do_scalar_FCM (cpu); return;
9419 	}
9420       break;
9421 
9422     case 0xFD:
9423       switch (INSTR (15, 10))
9424 	{
9425 	case 0x0D: do_scalar_CMGT (cpu); return;
9426 	case 0x11: do_scalar_USHL (cpu); return;
9427 	case 0x2E: do_scalar_NEG (cpu); return;
9428 	case 0x32: do_scalar_FCMGE_zero (cpu); return;
9429 	case 0x35: do_scalar_FABD (cpu); return;
9430 	case 0x36: do_scalar_FCMLE_zero (cpu); return;
9431 	case 0x39: do_scalar_FCM (cpu); return;
9432 	case 0x3B: do_scalar_FCM (cpu); return;
9433 	default:
9434 	  HALT_NYI;
9435 	}
9436 
9437     case 0xFE: do_scalar_USHR (cpu); return;
9438 
9439     case 0xBD:
9440       switch (INSTR (15, 10))
9441 	{
9442 	case 0x21: do_double_add (cpu); return;
9443 	case 0x11: do_scalar_SSHL (cpu); return;
9444 	case 0x32: do_scalar_FCMGT_zero (cpu); return;
9445 	case 0x36: do_scalar_FCMEQ_zero (cpu); return;
9446 	case 0x3A: do_scalar_FCMLT_zero (cpu); return;
9447 	default:
9448 	  HALT_NYI;
9449 	}
9450 
9451     default:
9452       HALT_NYI;
9453     }
9454 }
9455 
9456 static void
9457 dexAdvSIMD1 (sim_cpu *cpu)
9458 {
9459   /* instr [28,25] = 1 111.  */
9460 
  /* Bit 30 set selects the scalar/vector group; the basic scalar
     FP routines handled below all have bit 30 clear.  */
9463   if (INSTR (30, 30))
9464     do_scalar_vec (cpu);
9465 
9466   /* instr[24] is set for FP data processing 3-source and clear for
9467      all other basic scalar fp instruction groups.  */
9468   else if (INSTR (24, 24))
9469     dexSimpleFPDataProc3Source (cpu);
9470 
9471   /* instr[21] is clear for floating <-> fixed conversions and set for
9472      all other basic scalar fp instruction groups.  */
9473   else if (!INSTR (21, 21))
9474     dexSimpleFPFixedConvert (cpu);
9475 
9476   /* instr[11,10] : 01 ==> cond compare, 10 ==> Data Proc 2 Source
9477      11 ==> cond select,  00 ==> other.  */
9478   else
9479     switch (INSTR (11, 10))
9480       {
9481       case 1: dexSimpleFPCondCompare (cpu); return;
9482       case 2: dexSimpleFPDataProc2Source (cpu); return;
9483       case 3: dexSimpleFPCondSelect (cpu); return;
9484 
9485       default:
9486 	/* Now an ordered cascade of tests.
9487 	   FP immediate has instr [12] == 1.
9488 	   FP compare has   instr [13] == 1.
9489 	   FP Data Proc 1 Source has instr [14] == 1.
9490 	   FP floating <--> integer conversions has instr [15] == 0.  */
9491 	if (INSTR (12, 12))
9492 	  dexSimpleFPImmediate (cpu);
9493 
9494 	else if (INSTR (13, 13))
9495 	  dexSimpleFPCompare (cpu);
9496 
9497 	else if (INSTR (14, 14))
9498 	  dexSimpleFPDataProc1Source (cpu);
9499 
9500 	else if (!INSTR (15, 15))
9501 	  dexSimpleFPIntegerConvert (cpu);
9502 
9503 	else
9504 	  /* If we get here then instr[15] == 1 which means UNALLOC.  */
9505 	  HALT_UNALLOC;
9506       }
9507 }
9508 
9509 /* PC relative addressing.  */
9510 
9511 static void
9512 pcadr (sim_cpu *cpu)
9513 {
9514   /* instr[31] = op : 0 ==> ADR, 1 ==> ADRP
9515      instr[30,29] = immlo
9516      instr[23,5] = immhi.  */
9517   uint64_t address;
9518   unsigned rd = INSTR (4, 0);
9519   uint32_t isPage = INSTR (31, 31);
9520   union { int64_t s64; uint64_t u64; } imm;
9521   uint64_t offset;
9522 
9523   imm.s64 = simm64 (aarch64_get_instr (cpu), 23, 5);
9524   offset = imm.u64;
9525   offset = (offset << 2) | INSTR (30, 29);
9526 
9527   address = aarch64_get_PC (cpu);
9528 
9529   if (isPage)
9530     {
9531       offset <<= 12;
9532       address &= ~0xfff;
9533     }
9534 
9535   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9536   aarch64_set_reg_u64 (cpu, rd, NO_SP, address + offset);
9537 }
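
/* A worked example (illustrative): with PC = 0x400123, ADR with a
   combined immediate of 0x10 yields 0x400133; ADRP with a combined
   immediate of 1 masks the PC to 0x400000 and shifts the immediate
   to 0x1000, yielding 0x401000.  */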
9538 
9539 /* Specific decode and execute for group Data Processing Immediate.  */
9540 
9541 static void
9542 dexPCRelAddressing (sim_cpu *cpu)
9543 {
9544   /* assert instr[28,24] = 10000.  */
9545   pcadr (cpu);
9546 }
9547 
9548 /* Immediate logical.
9549    The bimm32/64 argument is constructed by replicating a 2, 4, 8,
9550    16, 32 or 64 bit sequence pulled out at decode and possibly
9551    inverting it.
9552 
9553    N.B. the output register (dest) can normally be Xn or SP;
9554    the exception occurs for flag setting instructions, which may
9555    only use Xn for the output (dest).  The input register can
9556    never be SP.  */
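
/* A worked example (illustrative): with N = 0, immr = 0 and
   imms = 0x3C (pattern 11110x, so the element size is 2 bits and
   S = 0), the element is 0b01 and replication gives
   bimm64 = 0x5555555555555555; the 32 bit forms use the low half,
   0x55555555.  */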
9557 
9558 /* 32 bit and immediate.  */
9559 static void
9560 and32 (sim_cpu *cpu, uint32_t bimm)
9561 {
9562   unsigned rn = INSTR (9, 5);
9563   unsigned rd = INSTR (4, 0);
9564 
9565   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9566   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9567 		       aarch64_get_reg_u32 (cpu, rn, NO_SP) & bimm);
9568 }
9569 
9570 /* 64 bit and immediate.  */
9571 static void
9572 and64 (sim_cpu *cpu, uint64_t bimm)
9573 {
9574   unsigned rn = INSTR (9, 5);
9575   unsigned rd = INSTR (4, 0);
9576 
9577   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9578   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9579 		       aarch64_get_reg_u64 (cpu, rn, NO_SP) & bimm);
9580 }
9581 
9582 /* 32 bit and immediate set flags.  */
9583 static void
9584 ands32 (sim_cpu *cpu, uint32_t bimm)
9585 {
9586   unsigned rn = INSTR (9, 5);
9587   unsigned rd = INSTR (4, 0);
9588 
9589   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9590   uint32_t value2 = bimm;
9591 
9592   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9593   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9594   set_flags_for_binop32 (cpu, value1 & value2);
9595 }
9596 
9597 /* 64 bit and immediate set flags.  */
9598 static void
9599 ands64 (sim_cpu *cpu, uint64_t bimm)
9600 {
9601   unsigned rn = INSTR (9, 5);
9602   unsigned rd = INSTR (4, 0);
9603 
9604   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9605   uint64_t value2 = bimm;
9606 
9607   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9608   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9609   set_flags_for_binop64 (cpu, value1 & value2);
9610 }
9611 
9612 /* 32 bit exclusive or immediate.  */
9613 static void
9614 eor32 (sim_cpu *cpu, uint32_t bimm)
9615 {
9616   unsigned rn = INSTR (9, 5);
9617   unsigned rd = INSTR (4, 0);
9618 
9619   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9620   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9621 		       aarch64_get_reg_u32 (cpu, rn, NO_SP) ^ bimm);
9622 }
9623 
9624 /* 64 bit exclusive or immediate.  */
9625 static void
9626 eor64 (sim_cpu *cpu, uint64_t bimm)
9627 {
9628   unsigned rn = INSTR (9, 5);
9629   unsigned rd = INSTR (4, 0);
9630 
9631   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9632   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9633 		       aarch64_get_reg_u64 (cpu, rn, NO_SP) ^ bimm);
9634 }
9635 
9636 /* 32 bit or immediate.  */
9637 static void
9638 orr32 (sim_cpu *cpu, uint32_t bimm)
9639 {
9640   unsigned rn = INSTR (9, 5);
9641   unsigned rd = INSTR (4, 0);
9642 
9643   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9644   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9645 		       aarch64_get_reg_u32 (cpu, rn, NO_SP) | bimm);
9646 }
9647 
9648 /* 64 bit or immediate.  */
9649 static void
9650 orr64 (sim_cpu *cpu, uint64_t bimm)
9651 {
9652   unsigned rn = INSTR (9, 5);
9653   unsigned rd = INSTR (4, 0);
9654 
9655   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9656   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9657 		       aarch64_get_reg_u64 (cpu, rn, NO_SP) | bimm);
9658 }
9659 
9660 /* Logical shifted register.
9661    These allow an optional LSL, ASR, LSR or ROR to the second source
9662    register with a count up to the register bit count.
9663    N.B register args may not be SP.  */
9664 
9665 /* 32 bit AND shifted register.  */
9666 static void
9667 and32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9668 {
9669   unsigned rm = INSTR (20, 16);
9670   unsigned rn = INSTR (9, 5);
9671   unsigned rd = INSTR (4, 0);
9672 
9673   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9674   aarch64_set_reg_u64
9675     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9676      & shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9677 }
9678 
9679 /* 64 bit AND shifted register.  */
9680 static void
9681 and64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9682 {
9683   unsigned rm = INSTR (20, 16);
9684   unsigned rn = INSTR (9, 5);
9685   unsigned rd = INSTR (4, 0);
9686 
9687   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9688   aarch64_set_reg_u64
9689     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9690      & shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9691 }
9692 
9693 /* 32 bit AND shifted register setting flags.  */
9694 static void
9695 ands32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9696 {
9697   unsigned rm = INSTR (20, 16);
9698   unsigned rn = INSTR (9, 5);
9699   unsigned rd = INSTR (4, 0);
9700 
9701   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9702   uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
9703 			       shift, count);
9704 
9705   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9706   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9707   set_flags_for_binop32 (cpu, value1 & value2);
9708 }
9709 
9710 /* 64 bit AND shifted register setting flags.  */
9711 static void
9712 ands64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9713 {
9714   unsigned rm = INSTR (20, 16);
9715   unsigned rn = INSTR (9, 5);
9716   unsigned rd = INSTR (4, 0);
9717 
9718   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9719   uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
9720 			       shift, count);
9721 
9722   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9723   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9724   set_flags_for_binop64 (cpu, value1 & value2);
9725 }
9726 
9727 /* 32 bit BIC shifted register.  */
9728 static void
9729 bic32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9730 {
9731   unsigned rm = INSTR (20, 16);
9732   unsigned rn = INSTR (9, 5);
9733   unsigned rd = INSTR (4, 0);
9734 
9735   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9736   aarch64_set_reg_u64
9737     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9738      & ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9739 }
9740 
9741 /* 64 bit BIC shifted register.  */
9742 static void
9743 bic64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9744 {
9745   unsigned rm = INSTR (20, 16);
9746   unsigned rn = INSTR (9, 5);
9747   unsigned rd = INSTR (4, 0);
9748 
9749   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9750   aarch64_set_reg_u64
9751     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9752      & ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9753 }
9754 
9755 /* 32 bit BIC shifted register setting flags.  */
9756 static void
9757 bics32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9758 {
9759   unsigned rm = INSTR (20, 16);
9760   unsigned rn = INSTR (9, 5);
9761   unsigned rd = INSTR (4, 0);
9762 
9763   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9764   uint32_t value2 = ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
9765 				 shift, count);
9766 
9767   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9768   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9769   set_flags_for_binop32 (cpu, value1 & value2);
9770 }
9771 
9772 /* 64 bit BIC shifted register setting flags.  */
9773 static void
9774 bics64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9775 {
9776   unsigned rm = INSTR (20, 16);
9777   unsigned rn = INSTR (9, 5);
9778   unsigned rd = INSTR (4, 0);
9779 
9780   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9781   uint64_t value2 = ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
9782 				 shift, count);
9783 
9784   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9785   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9786   set_flags_for_binop64 (cpu, value1 & value2);
9787 }
9788 
9789 /* 32 bit EON shifted register.  */
9790 static void
9791 eon32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9792 {
9793   unsigned rm = INSTR (20, 16);
9794   unsigned rn = INSTR (9, 5);
9795   unsigned rd = INSTR (4, 0);
9796 
9797   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9798   aarch64_set_reg_u64
9799     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9800      ^ ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9801 }
9802 
9803 /* 64 bit EON shifted register.  */
9804 static void
9805 eon64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9806 {
9807   unsigned rm = INSTR (20, 16);
9808   unsigned rn = INSTR (9, 5);
9809   unsigned rd = INSTR (4, 0);
9810 
9811   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9812   aarch64_set_reg_u64
9813     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9814      ^ ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9815 }
9816 
9817 /* 32 bit EOR shifted register.  */
9818 static void
9819 eor32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9820 {
9821   unsigned rm = INSTR (20, 16);
9822   unsigned rn = INSTR (9, 5);
9823   unsigned rd = INSTR (4, 0);
9824 
9825   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9826   aarch64_set_reg_u64
9827     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9828      ^ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9829 }
9830 
9831 /* 64 bit EOR shifted register.  */
9832 static void
9833 eor64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9834 {
9835   unsigned rm = INSTR (20, 16);
9836   unsigned rn = INSTR (9, 5);
9837   unsigned rd = INSTR (4, 0);
9838 
9839   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9840   aarch64_set_reg_u64
9841     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9842      ^ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9843 }
9844 
9845 /* 32 bit ORR shifted register.  */
9846 static void
9847 orr32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9848 {
9849   unsigned rm = INSTR (20, 16);
9850   unsigned rn = INSTR (9, 5);
9851   unsigned rd = INSTR (4, 0);
9852 
9853   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9854   aarch64_set_reg_u64
9855     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9856      | shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9857 }
9858 
9859 /* 64 bit ORR shifted register.  */
9860 static void
9861 orr64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9862 {
9863   unsigned rm = INSTR (20, 16);
9864   unsigned rn = INSTR (9, 5);
9865   unsigned rd = INSTR (4, 0);
9866 
9867   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9868   aarch64_set_reg_u64
9869     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9870      | shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9871 }
9872 
9873 /* 32 bit ORN shifted register.  */
9874 static void
9875 orn32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9876 {
9877   unsigned rm = INSTR (20, 16);
9878   unsigned rn = INSTR (9, 5);
9879   unsigned rd = INSTR (4, 0);
9880 
9881   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9882   aarch64_set_reg_u64
9883     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9884      | ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9885 }
9886 
9887 /* 64 bit ORN shifted register.  */
9888 static void
9889 orn64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9890 {
9891   unsigned rm = INSTR (20, 16);
9892   unsigned rn = INSTR (9, 5);
9893   unsigned rd = INSTR (4, 0);
9894 
9895   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9896   aarch64_set_reg_u64
9897     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9898      | ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9899 }
9900 
9901 static void
9902 dexLogicalImmediate (sim_cpu *cpu)
9903 {
9904   /* assert instr[28,23] = 100100
9905      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
9906      instr[30,29] = op : 0 ==> AND, 1 ==> ORR, 2 ==> EOR, 3 ==> ANDS
9907      instr[22] = N : used to construct immediate mask
9908      instr[21,16] = immr
9909      instr[15,10] = imms
9910      instr[9,5] = Rn
9911      instr[4,0] = Rd  */
9912 
9913   /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
9914   uint32_t size = INSTR (31, 31);
9915   uint32_t N = INSTR (22, 22);
9916   /* uint32_t immr = INSTR (21, 16);  */
9917   /* uint32_t imms = INSTR (15, 10);  */
9918   uint32_t index = INSTR (22, 10);
9919   uint64_t bimm64 = LITable [index];
9920   uint32_t dispatch = INSTR (30, 29);
9921 
9922   if (~size & N)
9923     HALT_UNALLOC;
9924 
9925   if (!bimm64)
9926     HALT_UNALLOC;
9927 
9928   if (size == 0)
9929     {
9930       uint32_t bimm = (uint32_t) bimm64;
9931 
9932       switch (dispatch)
9933 	{
9934 	case 0: and32 (cpu, bimm); return;
9935 	case 1: orr32 (cpu, bimm); return;
9936 	case 2: eor32 (cpu, bimm); return;
9937 	case 3: ands32 (cpu, bimm); return;
9938 	}
9939     }
9940   else
9941     {
9942       switch (dispatch)
9943 	{
9944 	case 0: and64 (cpu, bimm64); return;
9945 	case 1: orr64 (cpu, bimm64); return;
9946 	case 2: eor64 (cpu, bimm64); return;
9947 	case 3: ands64 (cpu, bimm64); return;
9948 	}
9949     }
9950   HALT_UNALLOC;
9951 }
9952 
9953 /* Immediate move.
9954    The uimm argument is a 16 bit value to be inserted into the
9955    target register.  The pos argument locates the 16 bit word in the
9956    dest register, i.e. it is in {0, 1} for 32 bit and {0, 1, 2,
9957    3} for 64 bit.
9958    N.B. the register arg may not be SP, so it should be
9959    accessed using the setGZRegisterXXX accessors.  */
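
/* A worked example (illustrative): MOVZ Xd, #0x1234, LSL #16 calls
   movz64 with val = 0x1234 and pos = 1, leaving Xd =
   0x0000000012340000; a following MOVK Xd, #0x5678 (pos = 0) merges
   the bottom half word, giving Xd = 0x0000000012345678.  */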
9960 
9961 /* 32 bit move 16 bit immediate zero remaining shorts.  */
9962 static void
9963 movz32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9964 {
9965   unsigned rd = INSTR (4, 0);
9966 
9967   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9968   aarch64_set_reg_u64 (cpu, rd, NO_SP, val << (pos * 16));
9969 }
9970 
9971 /* 64 bit move 16 bit immediate zero remaining shorts.  */
9972 static void
9973 movz64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9974 {
9975   unsigned rd = INSTR (4, 0);
9976 
9977   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9978   aarch64_set_reg_u64 (cpu, rd, NO_SP, ((uint64_t) val) << (pos * 16));
9979 }
9980 
9981 /* 32 bit move 16 bit immediate negated.  */
9982 static void
9983 movn32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9984 {
9985   unsigned rd = INSTR (4, 0);
9986 
9987   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9988   aarch64_set_reg_u64 (cpu, rd, NO_SP, ((val << (pos * 16)) ^ 0xffffffffU));
9989 }
9990 
9991 /* 64 bit move 16 bit immediate negated.  */
9992 static void
9993 movn64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9994 {
9995   unsigned rd = INSTR (4, 0);
9996 
9997   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9998   aarch64_set_reg_u64
9999     (cpu, rd, NO_SP, ((((uint64_t) val) << (pos * 16))
10000 		      ^ 0xffffffffffffffffULL));
10001 }
10002 
10003 /* 32 bit move 16 bit immediate keep remaining shorts.  */
10004 static void
10005 movk32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
10006 {
10007   unsigned rd = INSTR (4, 0);
10008   uint32_t current = aarch64_get_reg_u32 (cpu, rd, NO_SP);
10009   uint32_t value = val << (pos * 16);
10010   uint32_t mask = ~(0xffffU << (pos * 16));
10011 
10012   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10013   aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask)));
10014 }
10015 
10016 /* 64 bit move 16 bit immediate keep remaining shorts.  */
10017 static void
10018 movk64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
10019 {
10020   unsigned rd = INSTR (4, 0);
10021   uint64_t current = aarch64_get_reg_u64 (cpu, rd, NO_SP);
10022   uint64_t value = (uint64_t) val << (pos * 16);
10023   uint64_t mask = ~(0xffffULL << (pos * 16));
10024 
10025   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10026   aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask)));
10027 }
10028 
10029 static void
10030 dexMoveWideImmediate (sim_cpu *cpu)
10031 {
10032   /* assert instr[28:23] = 100101
10033      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
10034      instr[30,29] = op : 0 ==> MOVN, 1 ==> UNALLOC, 2 ==> MOVZ, 3 ==> MOVK
10035      instr[22,21] = shift : 00 == LSL#0, 01 = LSL#16, 10 = LSL#32, 11 = LSL#48
10036      instr[20,5] = uimm16
10037      instr[4,0] = Rd  */
10038 
10039   /* N.B. the (multiple of 16) shift is applied by the called routine,
10040      we just pass the multiplier.  */
10041 
10042   uint32_t imm;
10043   uint32_t size = INSTR (31, 31);
10044   uint32_t op = INSTR (30, 29);
10045   uint32_t shift = INSTR (22, 21);
10046 
10047   /* A 32 bit move can only shift by 0 or 16 (one lot of 16);
10048      anything else is an unallocated instruction.  */
10049   if (size == 0 && (shift > 1))
10050     HALT_UNALLOC;
10051 
10052   if (op == 1)
10053     HALT_UNALLOC;
10054 
10055   imm = INSTR (20, 5);
10056 
10057   if (size == 0)
10058     {
10059       if (op == 0)
10060 	movn32 (cpu, imm, shift);
10061       else if (op == 2)
10062 	movz32 (cpu, imm, shift);
10063       else
10064 	movk32 (cpu, imm, shift);
10065     }
10066   else
10067     {
10068       if (op == 0)
10069 	movn64 (cpu, imm, shift);
10070       else if (op == 2)
10071 	movz64 (cpu, imm, shift);
10072       else
10073 	movk64 (cpu, imm, shift);
10074     }
10075 }
10076 
10077 /* Bitfield operations.
10078    These take a pair of bit positions r and s which are in {0..31}
10079    or {0..63} depending on the instruction word size.
10080    N.B register args may not be SP.  */
10081 
10082 /* OK, we start with ubfm which just needs to pick
10083    some bits out of the source, zero the rest and write
10084    the result to dest.  Just need two logical shifts.  */
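
/* A worked example (illustrative): UBFM Wd, Wn, #16, #31 (r = 16,
   s = 31, the encoding of LSR Wd, Wn, #16) takes the r <= s path:
   with Wn = 0x12345678 the value is shifted left by 0 and right by
   16, giving Wd = 0x00001234.  */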
10085 
10086 /* 32 bit bitfield move, left and right of affected zeroed
10087    if r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
10088 static void
10089 ubfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
10090 {
10091   unsigned rd;
10092   unsigned rn = INSTR (9, 5);
10093   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
10094 
10095   /* Pick either s+1-r or s+1 consecutive bits out of the original word.  */
10096   if (r <= s)
10097     {
10098       /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
10099          We want only bits s:xxx:r at the bottom of the word
10100          so we LSL bit s up to bit 31 i.e. by 31 - s
10101          and then we LSR to bring bit 31 down to bit s - r
10102 	 i.e. by 31 + r - s.  */
10103       value <<= 31 - s;
10104       value >>= 31 + r - s;
10105     }
10106   else
10107     {
10108       /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0
10109          We want only bits s:xxx:0 starting at bit 31-(r-1)
10110          so we LSL bit s up to bit 31 i.e. by 31 - s
10111          and then we LSR to bring bit 31 down to 31-(r-1)+s
10112 	 i.e. by r - (s + 1).  */
10113       value <<= 31 - s;
10114       value >>= r - (s + 1);
10115     }
10116 
10117   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10118   rd = INSTR (4, 0);
10119   aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
10120 }
10121 
10122 /* 64 bit bitfield move, left and right of affected zeroed
10123    if r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>.  */
10124 static void
10125 ubfm (sim_cpu *cpu, uint32_t r, uint32_t s)
10126 {
10127   unsigned rd;
10128   unsigned rn = INSTR (9, 5);
10129   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
10130 
10131   if (r <= s)
10132     {
10133       /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
10134          We want only bits s:xxx:r at the bottom of the word.
10135          So we LSL bit s up to bit 63 i.e. by 63 - s
10136          and then we LSR to bring bit 63 down to bit s - r
10137 	 i.e. by 63 + r - s.  */
10138       value <<= 63 - s;
10139       value >>= 63 + r - s;
10140     }
10141   else
10142     {
10143       /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0.
10144          We want only bits s:xxx:0 starting at bit 63-(r-1).
10145          So we LSL bit s up to bit 63 i.e. by 63 - s
10146          and then we LSR to bring bit 63 down to 63-(r-1)+s
10147 	 i.e. by r - (s + 1).  */
10148       value <<= 63 - s;
10149       value >>= r - (s + 1);
10150     }
10151 
10152   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10153   rd = INSTR (4, 0);
10154   aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
10155 }
10156 
10157 /* The signed versions need to insert sign bits
10158    on the left of the inserted bit field, so we do
10159    much the same as the unsigned version except we
10160    use an arithmetic shift right -- this just means
10161    we need to operate on signed values.  */
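
/* A worked example (illustrative): SBFM Wd, Wn, #0, #7 (r = 0,
   s = 7, the encoding of SXTB Wd, Wn) with Wn = 0x80 shifts left
   by 24 to 0x80000000 and then arithmetically right by 24, giving
   Wd = 0xFFFFFF80.  */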
10162 
10163 /* 32 bit bitfield move, left of affected sign-extended, right zeroed.  */
10164 /* If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
10165 static void
10166 sbfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
10167 {
10168   unsigned rd;
10169   unsigned rn = INSTR (9, 5);
10170   /* As per ubfm32 but use an ASR instead of an LSR.  */
10171   int32_t value = aarch64_get_reg_s32 (cpu, rn, NO_SP);
10172 
10173   if (r <= s)
10174     {
10175       value <<= 31 - s;
10176       value >>= 31 + r - s;
10177     }
10178   else
10179     {
10180       value <<= 31 - s;
10181       value >>= r - (s + 1);
10182     }
10183 
10184   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10185   rd = INSTR (4, 0);
10186   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
10187 }
10188 
10189 /* 64 bit bitfield move, left of affected sign-extended, right zeroed.  */
10190 /* If r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>.  */
10191 static void
10192 sbfm (sim_cpu *cpu, uint32_t r, uint32_t s)
10193 {
10194   unsigned rd;
10195   unsigned rn = INSTR (9, 5);
10196   /* As per ubfm but use an ASR instead of an LSR.  */
10197   int64_t value = aarch64_get_reg_s64 (cpu, rn, NO_SP);
10198 
10199   if (r <= s)
10200     {
10201       value <<= 63 - s;
10202       value >>= 63 + r - s;
10203     }
10204   else
10205     {
10206       value <<= 63 - s;
10207       value >>= r - (s + 1);
10208     }
10209 
10210   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10211   rd = INSTR (4, 0);
10212   aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
10213 }
10214 
10215 /* Finally, these versions leave non-affected bits
10216    as is, so we need to generate the bits as per
10217    ubfm and also generate a mask to pick the
10218    bits from the original and computed values.  */
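
/* A worked example (illustrative): BFM Wd, Wn, #28, #3 (r = 28,
   s = 3, the encoding of BFI Wd, Wn, #4, #4) takes the r > s path:
   Wn<3:0> is placed at bits <7:4> and the mask becomes 0x000000F0,
   so only those four bits of Wd are replaced.  */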
10219 
10220 /* 32 bit bitfield move, non-affected bits left as is.
10221    If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
10222 static void
10223 bfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
10224 {
10225   unsigned rn = INSTR (9, 5);
10226   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
10227   uint32_t mask = -1;
10228   unsigned rd;
10229   uint32_t value2;
10230 
10231   /* Pick either s+1-r or s+1 consecutive bits out of the original word.  */
10232   if (r <= s)
10233     {
10234       /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
10235          We want only bits s:xxx:r at the bottom of the word
10236          so we LSL bit s up to bit 31 i.e. by 31 - s
10237          and then we LSR to bring bit 31 down to bit s - r
10238 	 i.e. by 31 + r - s.  */
10239       value <<= 31 - s;
10240       value >>= 31 + r - s;
10241       /* the mask must include the same bits.  */
10242       mask <<= 31 - s;
10243       mask >>= 31 + r - s;
10244     }
10245   else
10246     {
10247       /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0.
10248          We want only bits s:xxx:0 starting at bit 31-(r-1)
10249          so we LSL bit s up to bit 31 i.e. by 31 - s
10250          and then we LSR to bring bit 31 down to 31-(r-1)+s
10251 	 i.e. by r - (s + 1).  */
10252       value <<= 31 - s;
10253       value >>= r - (s + 1);
10254       /* The mask must include the same bits.  */
10255       mask <<= 31 - s;
10256       mask >>= r - (s + 1);
10257     }
10258 
10259   rd = INSTR (4, 0);
10260   value2 = aarch64_get_reg_u32 (cpu, rd, NO_SP);
10261 
10262   value2 &= ~mask;
10263   value2 |= value;
10264 
10265   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10266   /* value2 already holds (Rd & ~mask) | value.  */
10267   aarch64_set_reg_u64 (cpu, rd, NO_SP, value2);
10268 }
10269 
10270 /* 64 bit bitfield move, non-affected bits left as is.
10271    If r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>.  */
10272 static void
10273 bfm (sim_cpu *cpu, uint32_t r, uint32_t s)
10274 {
10275   unsigned rd;
10276   unsigned rn = INSTR (9, 5);
10277   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
10278   uint64_t mask = 0xffffffffffffffffULL;
10279 
10280   if (r <= s)
10281     {
10282       /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
10283          We want only bits s:xxx:r at the bottom of the word
10284          so we LSL bit s up to bit 63 i.e. by 63 - s
10285          and then we LSR to bring bit 63 down to bit s - r
10286 	 i.e. by 63 + r - s.  */
10287       value <<= 63 - s;
10288       value >>= 63 + r - s;
10289       /* The mask must include the same bits.  */
10290       mask <<= 63 - s;
10291       mask >>= 63 + r - s;
10292     }
10293   else
10294     {
10295       /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0
10296          We want only bits s:xxx:0 starting at bit 63-(r-1)
10297          so we LSL bit s up to bit 63 i.e. by 63 - s
10298          and then we LSR to bring bit 63 down to 63-(r-1)+s
10299 	 i.e. by r - (s + 1).  */
10300       value <<= 63 - s;
10301       value >>= r - (s + 1);
10302       /* The mask must include the same bits.  */
10303       mask <<= 63 - s;
10304       mask >>= r - (s + 1);
10305     }
10306 
10307   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10308   rd = INSTR (4, 0);
10309   aarch64_set_reg_u64
10310     (cpu, rd, NO_SP, (aarch64_get_reg_u64 (cpu, rd, NO_SP) & ~mask) | value);
10311 }
10312 
10313 static void
10314 dexBitfieldImmediate (sim_cpu *cpu)
10315 {
10316   /* assert instr[28:23] = 100110
10317      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
10318      instr[30,29] = op : 0 ==> SBFM, 1 ==> BFM, 2 ==> UBFM, 3 ==> UNALLOC
10319      instr[22] = N : must be 0 for 32 bit, 1 for 64 bit ow UNALLOC
10320      instr[21,16] = immr : 0xxxxx for 32 bit, xxxxxx for 64 bit
10321      instr[15,10] = imms :  0xxxxx for 32 bit, xxxxxx for 64 bit
10322      instr[9,5] = Rn
10323      instr[4,0] = Rd  */
10324 
10325   /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
10326   uint32_t dispatch;
10327   uint32_t imms;
10328   uint32_t size = INSTR (31, 31);
10329   uint32_t N = INSTR (22, 22);
10330   /* 32 bit operations must have immr[5] = 0 and imms[5] = 0
10331      or else we have an UNALLOC.  */
10332   uint32_t immr = INSTR (21, 16);
10333 
10334   if (~size & N)
10335     HALT_UNALLOC;
10336 
10337   if (!size && uimm (immr, 5, 5))
10338     HALT_UNALLOC;
10339 
10340   imms = INSTR (15, 10);
10341   if (!size && uimm (imms, 5, 5))
10342     HALT_UNALLOC;
10343 
10344   /* Switch on combined size and op.  */
10345   dispatch = INSTR (31, 29);
10346   switch (dispatch)
10347     {
10348     case 0: sbfm32 (cpu, immr, imms); return;
10349     case 1: bfm32 (cpu, immr, imms); return;
10350     case 2: ubfm32 (cpu, immr, imms); return;
10351     case 4: sbfm (cpu, immr, imms); return;
10352     case 5: bfm (cpu, immr, imms); return;
10353     case 6: ubfm (cpu, immr, imms); return;
10354     default: HALT_UNALLOC;
10355     }
10356 }
10357 
10358 static void
10359 do_EXTR_32 (sim_cpu *cpu)
10360 {
10361   /* instr[31:21] = 00010011100
10362      instr[20,16] = Rm
10363      instr[15,10] = imms :  0xxxxx for 32 bit
10364      instr[9,5]   = Rn
10365      instr[4,0]   = Rd  */
10366   unsigned rm   = INSTR (20, 16);
10367   unsigned imms = INSTR (15, 10) & 31;
10368   unsigned rn   = INSTR ( 9,  5);
10369   unsigned rd   = INSTR ( 4,  0);
10370   uint64_t val1;
10371   uint64_t val2;
10372 
10373   val1 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
10374   val1 >>= imms;
10375   val2 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
10376   val2 <<= (32 - imms);
10377 
10378   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10379   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) (val1 | val2));
10380 }
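
/* The (uint32_t) truncation in do_EXTR_32 keeps the upper half of Rd
   zero and, when imms is zero, discards the shifted Rn bits so the
   result is just Rm, as EXTR's ROR semantics require; do_EXTR_64
   below guards the corresponding imms == 0 shift.  A worked example
   (illustrative): EXTR Wd, Wn, Wm, #8 with Wn = 0x11223344 and
   Wm = 0xAABBCCDD extracts bits <39:8> of Wn:Wm, giving
   Wd = 0x44AABBCC.  */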
10381 
10382 static void
10383 do_EXTR_64 (sim_cpu *cpu)
10384 {
10385   /* instr[31:21] = 10010011110
10386      instr[20,16] = Rm
10387      instr[15,10] = imms
10388      instr[9,5]   = Rn
10389      instr[4,0]   = Rd  */
10390   unsigned rm   = INSTR (20, 16);
10391   unsigned imms = INSTR (15, 10) & 63;
10392   unsigned rn   = INSTR ( 9,  5);
10393   unsigned rd   = INSTR ( 4,  0);
10394   uint64_t val;
10395 
10396   val = aarch64_get_reg_u64 (cpu, rm, NO_SP);
10397   val >>= imms;
10398   val |= imms ? (aarch64_get_reg_u64 (cpu, rn, NO_SP) << (64 - imms)) : 0;
10399 
10400   aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
10401 }
10402 
10403 static void
10404 dexExtractImmediate (sim_cpu *cpu)
10405 {
10406   /* assert instr[28:23] = 100111
10407      instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
10408      instr[30,29] = op21 : 0 ==> EXTR, 1,2,3 ==> UNALLOC
10409      instr[22]    = N : must be 0 for 32 bit, 1 for 64 bit or UNALLOC
10410      instr[21]    = op0 : must be 0 or UNALLOC
10411      instr[20,16] = Rm
10412      instr[15,10] = imms :  0xxxxx for 32 bit, xxxxxx for 64 bit
10413      instr[9,5]   = Rn
10414      instr[4,0]   = Rd  */
10415 
10416   /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
10417   /* 64 bit operations must have N = 1 or else we have an UNALLOC.  */
10418   uint32_t dispatch;
10419   uint32_t size = INSTR (31, 31);
10420   uint32_t N = INSTR (22, 22);
10421   /* 32 bit operations must have imms[5] = 0
10422      or else we have an UNALLOC.  */
10423   uint32_t imms = INSTR (15, 10);
10424 
10425   if (size ^ N)
10426     HALT_UNALLOC;
10427 
10428   if (!size && uimm (imms, 5, 5))
10429     HALT_UNALLOC;
10430 
10431   /* Switch on combined size and op.  */
10432   dispatch = INSTR (31, 29);
10433 
10434   if (dispatch == 0)
10435     do_EXTR_32 (cpu);
10436 
10437   else if (dispatch == 4)
10438     do_EXTR_64 (cpu);
10439 
10440   else if (dispatch == 1)
10441     HALT_NYI;
10442   else
10443     HALT_UNALLOC;
10444 }
10445 
10446 static void
10447 dexDPImm (sim_cpu *cpu)
10448 {
10449   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
10450      assert  group == GROUP_DPIMM_1000 || group == GROUP_DPIMM_1001
10451      bits [25,23] of a DPImm are the secondary dispatch vector.  */
10452   uint32_t group2 = dispatchDPImm (aarch64_get_instr (cpu));
10453 
10454   switch (group2)
10455     {
10456     case DPIMM_PCADR_000:
10457     case DPIMM_PCADR_001:
10458       dexPCRelAddressing (cpu);
10459       return;
10460 
10461     case DPIMM_ADDSUB_010:
10462     case DPIMM_ADDSUB_011:
10463       dexAddSubtractImmediate (cpu);
10464       return;
10465 
10466     case DPIMM_LOG_100:
10467       dexLogicalImmediate (cpu);
10468       return;
10469 
10470     case DPIMM_MOV_101:
10471       dexMoveWideImmediate (cpu);
10472       return;
10473 
10474     case DPIMM_BITF_110:
10475       dexBitfieldImmediate (cpu);
10476       return;
10477 
10478     case DPIMM_EXTR_111:
10479       dexExtractImmediate (cpu);
10480       return;
10481 
10482     default:
10483       /* Should never reach here.  */
10484       HALT_NYI;
10485     }
10486 }
10487 
10488 static void
10489 dexLoadUnscaledImmediate (sim_cpu *cpu)
10490 {
10491   /* instr[29,24] == 111_00
10492      instr[21] == 0
10493      instr[11,10] == 00
10494      instr[31,30] = size
10495      instr[26] = V
10496      instr[23,22] = opc
10497      instr[20,12] = simm9
10498      instr[9,5] = rn may be SP.  */
10499   /* unsigned rt = INSTR (4, 0);  */
10500   uint32_t V = INSTR (26, 26);
10501   uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10502   int32_t imm = simm32 (aarch64_get_instr (cpu), 20, 12);
10503 
10504   if (!V)
10505     {
10506       /* GReg operations.  */
10507       switch (dispatch)
10508 	{
10509 	case 0:	 sturb (cpu, imm); return;
10510 	case 1:	 ldurb32 (cpu, imm); return;
10511 	case 2:	 ldursb64 (cpu, imm); return;
10512 	case 3:	 ldursb32 (cpu, imm); return;
10513 	case 4:	 sturh (cpu, imm); return;
10514 	case 5:	 ldurh32 (cpu, imm); return;
10515 	case 6:	 ldursh64 (cpu, imm); return;
10516 	case 7:	 ldursh32 (cpu, imm); return;
10517 	case 8:	 stur32 (cpu, imm); return;
10518 	case 9:	 ldur32 (cpu, imm); return;
10519 	case 10: ldursw (cpu, imm); return;
10520 	case 12: stur64 (cpu, imm); return;
10521 	case 13: ldur64 (cpu, imm); return;
10522 
10523 	case 14:
10524 	  /* PRFUM NYI.  */
10525 	  HALT_NYI;
10526 
10527 	default:
10528 	case 11:
10529 	case 15:
10530 	  HALT_UNALLOC;
10531 	}
10532     }
10533 
10534   /* FReg operations.  */
10535   switch (dispatch)
10536     {
10537     case 2:  fsturq (cpu, imm); return;
10538     case 3:  fldurq (cpu, imm); return;
10539     case 8:  fsturs (cpu, imm); return;
10540     case 9:  fldurs (cpu, imm); return;
10541     case 12: fsturd (cpu, imm); return;
10542     case 13: fldurd (cpu, imm); return;
10543 
10544     case 0: /* STUR 8 bit FP.  */
10545     case 1: /* LDUR 8 bit FP.  */
10546     case 4: /* STUR 16 bit FP.  */
10547     case 5: /* LDUR 16 bit FP.  */
10548       HALT_NYI;
10549 
10550     default:
10551     case 6:
10552     case 7:
10553     case 10:
10554     case 11:
10555     case 14:
10556     case 15:
10557       HALT_UNALLOC;
10558     }
10559 }
10560 
10561 /*  N.B. A preliminary note regarding all the ldrs<x>32
10562     instructions.
10563 
10564    The signed value loaded by these instructions is cast to unsigned
10565    before being passed to aarch64_set_reg_u64 (cpu, N, v), i.e. to the
10566    64 bit element of the GReg union.  This performs a 32 bit sign extension
10567    (as required) but avoids 64 bit sign extension, thus ensuring that the
10568    top half of the register word is zero.  This is what the spec demands
10569    when a 32 bit load occurs.  */
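
/* A worked example (illustrative): a 32 bit ldrsb of the byte 0x80
   yields (uint32_t) -128 = 0xFFFFFF80, which reaches the 64 bit
   register element as 0x00000000FFFFFF80: sign extended to 32 bits,
   zero extended to 64.  */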
10570 
10571 /* 32 bit load sign-extended byte scaled unsigned 12 bit.  */
10572 static void
10573 ldrsb32_abs (sim_cpu *cpu, uint32_t offset)
10574 {
10575   unsigned int rn = INSTR (9, 5);
10576   unsigned int rt = INSTR (4, 0);
10577 
10578   /* The target register may not be SP but the source may be;
10579      there is no scaling required for a byte load.  */
10580   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
10581   aarch64_set_reg_u64 (cpu, rt, NO_SP,
10582 		       (uint32_t) aarch64_get_mem_s8 (cpu, address));
10583 }
10584 
10585 /* 32 bit load sign-extended byte scaled or unscaled zero-
10586    or sign-extended 32-bit register offset.  */
10587 static void
10588 ldrsb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10589 {
10590   unsigned int rm = INSTR (20, 16);
10591   unsigned int rn = INSTR (9, 5);
10592   unsigned int rt = INSTR (4, 0);
10593 
10594   /* rn may reference SP, rm and rt must reference ZR.  */
10595 
10596   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10597   int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10598 				 extension);
10599 
10600   /* There is no scaling required for a byte load.  */
10601   aarch64_set_reg_u64
10602     (cpu, rt, NO_SP, (uint32_t) aarch64_get_mem_s8 (cpu, address
10603 						   + displacement));
10604 }
10605 
10606 /* 32 bit load sign-extended byte unscaled signed 9 bit with
10607    pre- or post-writeback.  */
10608 static void
10609 ldrsb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10610 {
10611   uint64_t address;
10612   unsigned int rn = INSTR (9, 5);
10613   unsigned int rt = INSTR (4, 0);
10614 
10615   if (rn == rt && wb != NoWriteBack)
10616     HALT_UNALLOC;
10617 
10618   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10619 
10620   if (wb == Pre)
10621       address += offset;
10622 
10623   aarch64_set_reg_u64 (cpu, rt, NO_SP,
10624 		       (uint32_t) aarch64_get_mem_s8 (cpu, address));
10625 
10626   if (wb == Post)
10627     address += offset;
10628 
10629   if (wb != NoWriteBack)
10630     aarch64_set_reg_u64 (cpu, rn, NO_SP, address);
10631 }
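
/* A worked example (illustrative): with Rn = 0x1000 and
   offset = -16, a pre-indexed access (wb == Pre) uses address 0xFF0
   and writes 0xFF0 back to Rn, while a post-indexed access
   (wb == Post) uses 0x1000 and then writes 0xFF0 back to Rn.  */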
10632 
10633 /* 8 bit store scaled.  */
10634 static void
10635 fstrb_abs (sim_cpu *cpu, uint32_t offset)
10636 {
10637   unsigned st = INSTR (4, 0);
10638   unsigned rn = INSTR (9, 5);
10639 
10640   aarch64_set_mem_u8 (cpu,
10641 		      aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
10642 		      aarch64_get_vec_u8 (cpu, st, 0));
10643 }
10644 
10645 /* 8 bit store scaled or unscaled zero- or
10646    sign-extended 32-bit register offset.  */
10647 static void
10648 fstrb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10649 {
10650   unsigned rm = INSTR (20, 16);
10651   unsigned rn = INSTR (9, 5);
10652   unsigned st = INSTR (4, 0);
10653 
10654   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10655   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10656 			       extension);
10657   uint64_t  displacement = extended; /* No scaling for a byte access.  */
10658 
10659   aarch64_set_mem_u8
10660     (cpu, address + displacement, aarch64_get_vec_u8 (cpu, st, 0));
10661 }
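
/* A worked example (illustrative, assuming OPT_SCALE applies the
   element size shift only when scaling == Scaled, as its uses here
   suggest): for a 16 bit access the index is shifted left by 1, so
   STR Ht, [Xn, Xm, LSL #1] with Xm = 3 stores at Xn + 6; when
   unscaled the index is used as is.  */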
10662 
10663 /* 16 bit store scaled.  */
10664 static void
10665 fstrh_abs (sim_cpu *cpu, uint32_t offset)
10666 {
10667   unsigned st = INSTR (4, 0);
10668   unsigned rn = INSTR (9, 5);
10669 
10670   aarch64_set_mem_u16
10671     (cpu,
10672      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16),
10673      aarch64_get_vec_u16 (cpu, st, 0));
10674 }
10675 
10676 /* 16 bit store scaled or unscaled zero-
10677    or sign-extended 32-bit register offset.  */
10678 static void
10679 fstrh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10680 {
10681   unsigned rm = INSTR (20, 16);
10682   unsigned rn = INSTR (9, 5);
10683   unsigned st = INSTR (4, 0);
10684 
10685   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10686   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10687 			       extension);
10688   uint64_t  displacement = OPT_SCALE (extended, 16, scaling);
10689 
10690   aarch64_set_mem_u16
10691     (cpu, address + displacement, aarch64_get_vec_u16 (cpu, st, 0));
10692 }
10693 
10694 /* 32 bit store scaled unsigned 12 bit.  */
10695 static void
10696 fstrs_abs (sim_cpu *cpu, uint32_t offset)
10697 {
10698   unsigned st = INSTR (4, 0);
10699   unsigned rn = INSTR (9, 5);
10700 
10701   aarch64_set_mem_u32
10702     (cpu,
10703      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32),
10704      aarch64_get_vec_u32 (cpu, st, 0));
10705 }
10706 
10707 /* 32 bit store unscaled signed 9 bit with pre- or post-writeback.  */
10708 static void
10709 fstrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10710 {
10711   unsigned rn = INSTR (9, 5);
10712   unsigned st = INSTR (4, 0);
10713 
10714   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10715 
10716   if (wb != Post)
10717     address += offset;
10718 
10719   aarch64_set_mem_u32 (cpu, address, aarch64_get_vec_u32 (cpu, st, 0));
10720 
10721   if (wb == Post)
10722     address += offset;
10723 
10724   if (wb != NoWriteBack)
10725     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10726 }
10727 
10728 /* 32 bit store scaled or unscaled zero-
10729    or sign-extended 32-bit register offset.  */
10730 static void
10731 fstrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10732 {
10733   unsigned rm = INSTR (20, 16);
10734   unsigned rn = INSTR (9, 5);
10735   unsigned st = INSTR (4, 0);
10736 
10737   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10738   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10739 			       extension);
10740   uint64_t  displacement = OPT_SCALE (extended, 32, scaling);
10741 
10742   aarch64_set_mem_u32
10743     (cpu, address + displacement, aarch64_get_vec_u32 (cpu, st, 0));
10744 }
10745 
10746 /* 64 bit store scaled unsigned 12 bit.  */
10747 static void
10748 fstrd_abs (sim_cpu *cpu, uint32_t offset)
10749 {
10750   unsigned st = INSTR (4, 0);
10751   unsigned rn = INSTR (9, 5);
10752 
10753   aarch64_set_mem_u64
10754     (cpu,
10755      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64),
10756      aarch64_get_vec_u64 (cpu, st, 0));
10757 }
10758 
10759 /* 64 bit store unscaled signed 9 bit with pre- or post-writeback.  */
10760 static void
10761 fstrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10762 {
10763   unsigned rn = INSTR (9, 5);
10764   unsigned st = INSTR (4, 0);
10765 
10766   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10767 
10768   if (wb != Post)
10769     address += offset;
10770 
10771   aarch64_set_mem_u64 (cpu, address, aarch64_get_vec_u64 (cpu, st, 0));
10772 
10773   if (wb == Post)
10774     address += offset;
10775 
10776   if (wb != NoWriteBack)
10777     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10778 }
10779 
10780 /* 64 bit store scaled or unscaled zero-
10781    or sign-extended 32-bit register offset.  */
10782 static void
10783 fstrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10784 {
10785   unsigned rm = INSTR (20, 16);
10786   unsigned rn = INSTR (9, 5);
10787   unsigned st = INSTR (4, 0);
10788 
10789   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10790   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10791 			       extension);
10792   uint64_t  displacement = OPT_SCALE (extended, 64, scaling);
10793 
10794   aarch64_set_mem_u64
10795     (cpu, address + displacement, aarch64_get_vec_u64 (cpu, st, 0));
10796 }
10797 
10798 /* 128 bit store scaled unsigned 12 bit.  */
10799 static void
10800 fstrq_abs (sim_cpu *cpu, uint32_t offset)
10801 {
10802   FRegister a;
10803   unsigned st = INSTR (4, 0);
10804   unsigned rn = INSTR (9, 5);
10805   uint64_t addr;
10806 
10807   aarch64_get_FP_long_double (cpu, st, & a);
10808 
10809   addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128);
10810   aarch64_set_mem_long_double (cpu, addr, a);
10811 }
10812 
10813 /* 128 bit store unscaled signed 9 bit with pre- or post-writeback.  */
10814 static void
10815 fstrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10816 {
10817   FRegister a;
10818   unsigned rn = INSTR (9, 5);
10819   unsigned st = INSTR (4, 0);
10820   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10821 
10822   if (wb != Post)
10823     address += offset;
10824 
10825   aarch64_get_FP_long_double (cpu, st, & a);
10826   aarch64_set_mem_long_double (cpu, address, a);
10827 
10828   if (wb == Post)
10829     address += offset;
10830 
10831   if (wb != NoWriteBack)
10832     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10833 }
10834 
10835 /* 128 bit store scaled or unscaled zero-
10836    or sign-extended 32-bit register offset.  */
10837 static void
10838 fstrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10839 {
10840   unsigned rm = INSTR (20, 16);
10841   unsigned rn = INSTR (9, 5);
10842   unsigned st = INSTR (4, 0);
10843 
10844   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10845   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10846 			       extension);
10847   uint64_t  displacement = OPT_SCALE (extended, 128, scaling);
10848 
10849   FRegister a;
10850 
10851   aarch64_get_FP_long_double (cpu, st, & a);
10852   aarch64_set_mem_long_double (cpu, address + displacement, a);
10853 }
10854 
10855 static void
10856 dexLoadImmediatePrePost (sim_cpu *cpu)
10857 {
10858   /* instr[31,30] = size
10859      instr[29,27] = 111
10860      instr[26]    = V
10861      instr[25,24] = 00
10862      instr[23,22] = opc
10863      instr[21]    = 0
10864      instr[20,12] = simm9
10865      instr[11]    = wb : 0 ==> Post, 1 ==> Pre
10866      instr[10]    = 0
10867      instr[9,5]   = Rn may be SP.
10868      instr[4,0]   = Rt */
10869 
10870   uint32_t  V        = INSTR (26, 26);
10871   uint32_t  dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10872   int32_t   imm      = simm32 (aarch64_get_instr (cpu), 20, 12);
10873   WriteBack wb       = INSTR (11, 11);
10874 
10875   if (!V)
10876     {
10877       /* GReg operations.  */
10878       switch (dispatch)
10879 	{
10880 	case 0:	 strb_wb (cpu, imm, wb); return;
10881 	case 1:	 ldrb32_wb (cpu, imm, wb); return;
10882 	case 2:	 ldrsb_wb (cpu, imm, wb); return;
10883 	case 3:	 ldrsb32_wb (cpu, imm, wb); return;
10884 	case 4:	 strh_wb (cpu, imm, wb); return;
10885 	case 5:	 ldrh32_wb (cpu, imm, wb); return;
10886 	case 6:	 ldrsh64_wb (cpu, imm, wb); return;
10887 	case 7:	 ldrsh32_wb (cpu, imm, wb); return;
10888 	case 8:	 str32_wb (cpu, imm, wb); return;
10889 	case 9:	 ldr32_wb (cpu, imm, wb); return;
10890 	case 10: ldrsw_wb (cpu, imm, wb); return;
10891 	case 12: str_wb (cpu, imm, wb); return;
10892 	case 13: ldr_wb (cpu, imm, wb); return;
10893 
10894 	default:
10895 	case 11:
10896 	case 14:
10897 	case 15:
10898 	  HALT_UNALLOC;
10899 	}
10900     }
10901 
10902   /* FReg operations.  */
10903   switch (dispatch)
10904     {
10905     case 2:  fstrq_wb (cpu, imm, wb); return;
10906     case 3:  fldrq_wb (cpu, imm, wb); return;
10907     case 8:  fstrs_wb (cpu, imm, wb); return;
10908     case 9:  fldrs_wb (cpu, imm, wb); return;
10909     case 12: fstrd_wb (cpu, imm, wb); return;
10910     case 13: fldrd_wb (cpu, imm, wb); return;
10911 
10912     case 0:	  /* STUR 8 bit FP.  */
10913     case 1:	  /* LDUR 8 bit FP.  */
10914     case 4:	  /* STUR 16 bit FP.  */
10915     case 5:	  /* LDUR 16 bit FP.  */
10916       HALT_NYI;
10917 
10918     default:
10919     case 6:
10920     case 7:
10921     case 10:
10922     case 11:
10923     case 14:
10924     case 15:
10925       HALT_UNALLOC;
10926     }
10927 }
10928 
10929 static void
10930 dexLoadRegisterOffset (sim_cpu *cpu)
10931 {
10932   /* instr[31,30] = size
10933      instr[29,27] = 111
10934      instr[26]    = V
10935      instr[25,24] = 00
10936      instr[23,22] = opc
10937      instr[21]    = 1
10938      instr[20,16] = rm
10939      instr[15,13] = option : 010 ==> UXTW, 011 ==> UXTX/LSL,
10940                              110 ==> SXTW, 111 ==> SXTX,
10941                              ow ==> RESERVED
10942      instr[12]    = scaled
10943      instr[11,10] = 10
10944      instr[9,5]   = rn
10945      instr[4,0]   = rt.  */
10946 
10947   uint32_t  V = INSTR (26, 26);
10948   uint32_t  dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10949   Scaling   scale = INSTR (12, 12);
10950   Extension extensionType = INSTR (15, 13);
10951 
10952   /* Check for illegal extension types.  */
10953   if (uimm (extensionType, 1, 1) == 0)
10954     HALT_UNALLOC;
10955 
10956   if (extensionType == UXTX || extensionType == SXTX)
10957     extensionType = NoExtension;
10958 
10959   if (!V)
10960     {
10961       /* GReg operations.  */
10962       switch (dispatch)
10963 	{
10964 	case 0:	 strb_scale_ext (cpu, scale, extensionType); return;
10965 	case 1:	 ldrb32_scale_ext (cpu, scale, extensionType); return;
10966 	case 2:	 ldrsb_scale_ext (cpu, scale, extensionType); return;
10967 	case 3:	 ldrsb32_scale_ext (cpu, scale, extensionType); return;
10968 	case 4:	 strh_scale_ext (cpu, scale, extensionType); return;
10969 	case 5:	 ldrh32_scale_ext (cpu, scale, extensionType); return;
10970 	case 6:	 ldrsh_scale_ext (cpu, scale, extensionType); return;
10971 	case 7:	 ldrsh32_scale_ext (cpu, scale, extensionType); return;
10972 	case 8:	 str32_scale_ext (cpu, scale, extensionType); return;
10973 	case 9:	 ldr32_scale_ext (cpu, scale, extensionType); return;
10974 	case 10: ldrsw_scale_ext (cpu, scale, extensionType); return;
10975 	case 12: str_scale_ext (cpu, scale, extensionType); return;
10976 	case 13: ldr_scale_ext (cpu, scale, extensionType); return;
10977 	case 14: prfm_scale_ext (cpu, scale, extensionType); return;
10978 
10979 	default:
10980 	case 11:
10981 	case 15:
10982 	  HALT_UNALLOC;
10983 	}
10984     }
10985 
10986   /* FReg operations.  */
10987   switch (dispatch)
10988     {
10989     case 1: /* LDUR 8 bit FP.  */
10990       HALT_NYI;
10991     case 3:  fldrq_scale_ext (cpu, scale, extensionType); return;
10992     case 5: /* LDUR 16 bit FP.  */
10993       HALT_NYI;
10994     case 9:  fldrs_scale_ext (cpu, scale, extensionType); return;
10995     case 13: fldrd_scale_ext (cpu, scale, extensionType); return;
10996 
10997     case 0:  fstrb_scale_ext (cpu, scale, extensionType); return;
10998     case 2:  fstrq_scale_ext (cpu, scale, extensionType); return;
10999     case 4:  fstrh_scale_ext (cpu, scale, extensionType); return;
11000     case 8:  fstrs_scale_ext (cpu, scale, extensionType); return;
11001     case 12: fstrd_scale_ext (cpu, scale, extensionType); return;
11002 
11003     default:
11004     case 6:
11005     case 7:
11006     case 10:
11007     case 11:
11008     case 14:
11009     case 15:
11010       HALT_UNALLOC;
11011     }
11012 }
11013 
11014 static void
11015 dexLoadUnsignedImmediate (sim_cpu *cpu)
11016 {
11017   /* instr[29,24] == 111_01
11018      instr[31,30] = size
11019      instr[26]    = V
11020      instr[23,22] = opc
11021      instr[21,10] = uimm12 : unsigned immediate offset
11022      instr[9,5]   = rn may be SP.
11023      instr[4,0]   = rt.  */
11024 
11025   uint32_t V = INSTR (26,26);
11026   uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
11027   uint32_t imm = INSTR (21, 10);
11028 
11029   if (!V)
11030     {
11031       /* GReg operations.  */
11032       switch (dispatch)
11033 	{
11034 	case 0:  strb_abs (cpu, imm); return;
11035 	case 1:  ldrb32_abs (cpu, imm); return;
11036 	case 2:  ldrsb_abs (cpu, imm); return;
11037 	case 3:  ldrsb32_abs (cpu, imm); return;
11038 	case 4:  strh_abs (cpu, imm); return;
11039 	case 5:  ldrh32_abs (cpu, imm); return;
11040 	case 6:  ldrsh_abs (cpu, imm); return;
11041 	case 7:  ldrsh32_abs (cpu, imm); return;
11042 	case 8:  str32_abs (cpu, imm); return;
11043 	case 9:  ldr32_abs (cpu, imm); return;
11044 	case 10: ldrsw_abs (cpu, imm); return;
11045 	case 12: str_abs (cpu, imm); return;
11046 	case 13: ldr_abs (cpu, imm); return;
11047 	case 14: prfm_abs (cpu, imm); return;
11048 
11049 	default:
11050 	case 11:
11051 	case 15:
11052 	  HALT_UNALLOC;
11053 	}
11054     }
11055 
11056   /* FReg operations.  */
11057   switch (dispatch)
11058     {
11059     case 0:  fstrb_abs (cpu, imm); return;
11060     case 4:  fstrh_abs (cpu, imm); return;
11061     case 8:  fstrs_abs (cpu, imm); return;
11062     case 12: fstrd_abs (cpu, imm); return;
11063     case 2:  fstrq_abs (cpu, imm); return;
11064 
11065     case 1:  fldrb_abs (cpu, imm); return;
11066     case 5:  fldrh_abs (cpu, imm); return;
11067     case 9:  fldrs_abs (cpu, imm); return;
11068     case 13: fldrd_abs (cpu, imm); return;
11069     case 3:  fldrq_abs (cpu, imm); return;
11070 
11071     default:
11072     case 6:
11073     case 7:
11074     case 10:
11075     case 11:
11076     case 14:
11077     case 15:
11078       HALT_UNALLOC;
11079     }
11080 }
11081 
11082 static void
11083 dexLoadExclusive (sim_cpu *cpu)
11084 {
11085   /* assert instr[29:24] = 001000;
11086      instr[31,30] = size
11087      instr[23] = 0 if exclusive
11088      instr[22] = L : 1 if load, 0 if store
11089      instr[21] = 1 if pair
11090      instr[20,16] = Rs
11091      instr[15] = o0 : 1 if ordered
11092      instr[14,10] = Rt2
11093      instr[9,5] = Rn
11094      instr[4,0] = Rt.  */
11095 
11096   switch (INSTR (22, 21))
11097     {
11098     case 2:   ldxr (cpu); return;
11099     case 0:   stxr (cpu); return;
11100     default:  HALT_NYI;
11101     }
11102 }
11103 
11104 static void
11105 dexLoadOther (sim_cpu *cpu)
11106 {
11107   uint32_t dispatch;
11108 
11109   /* instr[29,25] = 111_0
11110      instr[24] == 0 ==> dispatch, 1 ==> ldst reg unsigned immediate
11111      instr[21] and instr[11,10] form the secondary dispatch.  */
11112   if (INSTR (24, 24))
11113     {
11114       dexLoadUnsignedImmediate (cpu);
11115       return;
11116     }
11117 
11118   dispatch = ((INSTR (21, 21) << 2) | INSTR (11, 10));
11119   switch (dispatch)
11120     {
11121     case 0: dexLoadUnscaledImmediate (cpu); return;
11122     case 1: dexLoadImmediatePrePost (cpu); return;
11123     case 3: dexLoadImmediatePrePost (cpu); return;
11124     case 6: dexLoadRegisterOffset (cpu); return;
11125 
11126     default:
11127     case 2:
11128     case 4:
11129     case 5:
11130     case 7:
11131       HALT_NYI;
11132     }
11133 }
11134 
11135 static void
11136 store_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
11137 {
11138   unsigned rn = INSTR (14, 10);
11139   unsigned rd = INSTR (9, 5);
11140   unsigned rm = INSTR (4, 0);
11141   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11142 
11143   if ((rn == rd || rm == rd) && wb != NoWriteBack)
11144     HALT_UNALLOC; /* Base overlaps a source: UNPREDICTABLE, treat as UNALLOC.  */
11145 
11146   offset <<= 2;
11147 
11148   if (wb != Post)
11149     address += offset;
11150 
11151   aarch64_set_mem_u32 (cpu, address,
11152 		       aarch64_get_reg_u32 (cpu, rm, NO_SP));
11153   aarch64_set_mem_u32 (cpu, address + 4,
11154 		       aarch64_get_reg_u32 (cpu, rn, NO_SP));
11155 
11156   if (wb == Post)
11157     address += offset;
11158 
11159   if (wb != NoWriteBack)
11160     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11161 }
11162 
11163 static void
11164 store_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
11165 {
11166   unsigned rn = INSTR (14, 10);
11167   unsigned rd = INSTR (9, 5);
11168   unsigned rm = INSTR (4, 0);
11169   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11170 
11171   if ((rn == rd || rm == rd) && wb != NoWriteBack)
11172     HALT_UNALLOC; /* Base overlaps a source: UNPREDICTABLE, treat as UNALLOC.  */
11173 
11174   offset <<= 3;
11175 
11176   if (wb != Post)
11177     address += offset;
11178 
11179   aarch64_set_mem_u64 (cpu, address,
11180 		       aarch64_get_reg_u64 (cpu, rm, NO_SP));
11181   aarch64_set_mem_u64 (cpu, address + 8,
11182 		       aarch64_get_reg_u64 (cpu, rn, NO_SP));
11183 
11184   if (wb == Post)
11185     address += offset;
11186 
11187   if (wb != NoWriteBack)
11188     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11189 }
11190 
11191 static void
11192 load_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
11193 {
11194   unsigned rn = INSTR (14, 10);
11195   unsigned rd = INSTR (9, 5);
11196   unsigned rm = INSTR (4, 0);
11197   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11198 
11199   /* Loading a pair with Rt == Rt2 is UNPREDICTABLE; treat it as UNALLOC.  */
11200   if (rn == rm)
11201     HALT_UNALLOC;
11202 
11203   offset <<= 2;
11204 
11205   if (wb != Post)
11206     address += offset;
11207 
11208   aarch64_set_reg_u64 (cpu, rm, SP_OK, aarch64_get_mem_u32 (cpu, address));
11209   aarch64_set_reg_u64 (cpu, rn, SP_OK, aarch64_get_mem_u32 (cpu, address + 4));
11210 
11211   if (wb == Post)
11212     address += offset;
11213 
11214   if (wb != NoWriteBack)
11215     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11216 }
11217 
11218 static void
11219 load_pair_s32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
11220 {
11221   unsigned rn = INSTR (14, 10);
11222   unsigned rd = INSTR (9, 5);
11223   unsigned rm = INSTR (4, 0);
11224   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11225 
11226   /* A load pair with Rt == Rt2 is CONSTRAINED UNPREDICTABLE; treat it as unallocated.  */
11227   if (rn == rm)
11228     HALT_UNALLOC;
11229 
11230   offset <<= 2;
11231 
11232   if (wb != Post)
11233     address += offset;
11234 
11235   aarch64_set_reg_s64 (cpu, rm, SP_OK, aarch64_get_mem_s32 (cpu, address));
11236   aarch64_set_reg_s64 (cpu, rn, SP_OK, aarch64_get_mem_s32 (cpu, address + 4));
11237 
11238   if (wb == Post)
11239     address += offset;
11240 
11241   if (wb != NoWriteBack)
11242     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11243 }
11244 
11245 static void
11246 load_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
11247 {
11248   unsigned rn = INSTR (14, 10);
11249   unsigned rd = INSTR (9, 5);
11250   unsigned rm = INSTR (4, 0);
11251   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11252 
11253   /* A load pair with Rt == Rt2 is CONSTRAINED UNPREDICTABLE; treat it as unallocated.  */
11254   if (rn == rm)
11255     HALT_UNALLOC;
11256 
11257   offset <<= 3;
11258 
11259   if (wb != Post)
11260     address += offset;
11261 
11262   aarch64_set_reg_u64 (cpu, rm, SP_OK, aarch64_get_mem_u64 (cpu, address));
11263   aarch64_set_reg_u64 (cpu, rn, SP_OK, aarch64_get_mem_u64 (cpu, address + 8));
11264 
11265   if (wb == Post)
11266     address += offset;
11267 
11268   if (wb != NoWriteBack)
11269     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11270 }
11271 
11272 static void
11273 dex_load_store_pair_gr (sim_cpu *cpu)
11274 {
11275   /* instr[31,30] = size (10=> 64-bit, 01=> signed 32-bit, 00=> 32-bit)
11276      instr[29,27] = 101, instr[25] = 0
11277      instr[26]    = V : 1 if fp 0 if gp (0 here)
11278      instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
11279      instr[22]    = load/store (1=> load)
11280      instr[21,15] = signed, scaled, offset
11281      instr[14,10] = Rn
11282      instr[ 9, 5] = Rd
11283      instr[ 4, 0] = Rm.  */
11284 
11285   uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22));
11286   int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
11287 
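  /* dispatch = size : addressing mode : L.  E.g. 5 = 00:10:1 is a
     32 bit LDP with simple offset (no writeback), 22 = 10:11:0 is a
     64 bit STP with pre-indexing.  */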
11288   switch (dispatch)
11289     {
11290     case 2: store_pair_u32 (cpu, offset, Post); return;
11291     case 3: load_pair_u32  (cpu, offset, Post); return;
11292     case 4: store_pair_u32 (cpu, offset, NoWriteBack); return;
11293     case 5: load_pair_u32  (cpu, offset, NoWriteBack); return;
11294     case 6: store_pair_u32 (cpu, offset, Pre); return;
11295     case 7: load_pair_u32  (cpu, offset, Pre); return;
11296 
11297     case 11: load_pair_s32  (cpu, offset, Post); return;
11298     case 13: load_pair_s32  (cpu, offset, NoWriteBack); return;
11299     case 15: load_pair_s32  (cpu, offset, Pre); return;
11300 
11301     case 18: store_pair_u64 (cpu, offset, Post); return;
11302     case 19: load_pair_u64  (cpu, offset, Post); return;
11303     case 20: store_pair_u64 (cpu, offset, NoWriteBack); return;
11304     case 21: load_pair_u64  (cpu, offset, NoWriteBack); return;
11305     case 22: store_pair_u64 (cpu, offset, Pre); return;
11306     case 23: load_pair_u64  (cpu, offset, Pre); return;
11307 
11308     default:
11309       HALT_UNALLOC;
11310     }
11311 }
11312 
11313 static void
11314 store_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
11315 {
11316   unsigned rn = INSTR (14, 10);
11317   unsigned rd = INSTR (9, 5);
11318   unsigned rm = INSTR (4, 0);
11319   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11320 
11321   offset <<= 2;
11322 
11323   if (wb != Post)
11324     address += offset;
11325 
11326   aarch64_set_mem_u32 (cpu, address,     aarch64_get_vec_u32 (cpu, rm, 0));
11327   aarch64_set_mem_u32 (cpu, address + 4, aarch64_get_vec_u32 (cpu, rn, 0));
11328 
11329   if (wb == Post)
11330     address += offset;
11331 
11332   if (wb != NoWriteBack)
11333     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11334 }
11335 
11336 static void
11337 store_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11338 {
11339   unsigned rn = INSTR (14, 10);
11340   unsigned rd = INSTR (9, 5);
11341   unsigned rm = INSTR (4, 0);
11342   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11343 
11344   offset <<= 3;
11345 
11346   if (wb != Post)
11347     address += offset;
11348 
11349   aarch64_set_mem_u64 (cpu, address,     aarch64_get_vec_u64 (cpu, rm, 0));
11350   aarch64_set_mem_u64 (cpu, address + 8, aarch64_get_vec_u64 (cpu, rn, 0));
11351 
11352   if (wb == Post)
11353     address += offset;
11354 
11355   if (wb != NoWriteBack)
11356     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11357 }
11358 
11359 static void
11360 store_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11361 {
11362   FRegister a;
11363   unsigned rn = INSTR (14, 10);
11364   unsigned rd = INSTR (9, 5);
11365   unsigned rm = INSTR (4, 0);
11366   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11367 
11368   offset <<= 4;
11369 
11370   if (wb != Post)
11371     address += offset;
11372 
11373   aarch64_get_FP_long_double (cpu, rm, & a);
11374   aarch64_set_mem_long_double (cpu, address, a);
11375   aarch64_get_FP_long_double (cpu, rn, & a);
11376   aarch64_set_mem_long_double (cpu, address + 16, a);
11377 
11378   if (wb == Post)
11379     address += offset;
11380 
11381   if (wb != NoWriteBack)
11382     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11383 }
11384 
11385 static void
11386 load_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
11387 {
11388   unsigned rn = INSTR (14, 10);
11389   unsigned rd = INSTR (9, 5);
11390   unsigned rm = INSTR (4, 0);
11391   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11392 
11393   if (rm == rn)
11394     HALT_UNALLOC;
11395 
11396   offset <<= 2;
11397 
11398   if (wb != Post)
11399     address += offset;
11400 
11401   aarch64_set_vec_u32 (cpu, rm, 0, aarch64_get_mem_u32 (cpu, address));
11402   aarch64_set_vec_u32 (cpu, rn, 0, aarch64_get_mem_u32 (cpu, address + 4));
11403 
11404   if (wb == Post)
11405     address += offset;
11406 
11407   if (wb != NoWriteBack)
11408     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11409 }
11410 
11411 static void
11412 load_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11413 {
11414   unsigned rn = INSTR (14, 10);
11415   unsigned rd = INSTR (9, 5);
11416   unsigned rm = INSTR (4, 0);
11417   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11418 
11419   if (rm == rn)
11420     HALT_UNALLOC;
11421 
11422   offset <<= 3;
11423 
11424   if (wb != Post)
11425     address += offset;
11426 
11427   aarch64_set_vec_u64 (cpu, rm, 0, aarch64_get_mem_u64 (cpu, address));
11428   aarch64_set_vec_u64 (cpu, rn, 0, aarch64_get_mem_u64 (cpu, address + 8));
11429 
11430   if (wb == Post)
11431     address += offset;
11432 
11433   if (wb != NoWriteBack)
11434     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11435 }
11436 
11437 static void
11438 load_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11439 {
11440   FRegister a;
11441   unsigned rn = INSTR (14, 10);
11442   unsigned rd = INSTR (9, 5);
11443   unsigned rm = INSTR (4, 0);
11444   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11445 
11446   if (rm == rn)
11447     HALT_UNALLOC;
11448 
11449   offset <<= 4;
11450 
11451   if (wb != Post)
11452     address += offset;
11453 
11454   aarch64_get_mem_long_double (cpu, address, & a);
11455   aarch64_set_FP_long_double (cpu, rm, a);
11456   aarch64_get_mem_long_double (cpu, address + 16, & a);
11457   aarch64_set_FP_long_double (cpu, rn, a);
11458 
11459   if (wb == Post)
11460     address += offset;
11461 
11462   if (wb != NoWriteBack)
11463     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11464 }
11465 
11466 static void
11467 dex_load_store_pair_fp (sim_cpu *cpu)
11468 {
11469   /* instr[31,30] = size (10=> 128-bit, 01=> 64-bit, 00=> 32-bit)
11470      instr[29,27] = 101, instr[26] = V = 1 (fp), instr[25] = 0
11471      instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
11472      instr[22]    = load/store (1=> load)
11473      instr[21,15] = signed, scaled, offset
11474      instr[14,10] = Rn
11475      instr[ 9, 5] = Rd
11476      instr[ 4, 0] = Rm  */
11477 
11478   uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22));
11479   int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
11480 
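  /* As for the GP pairs: dispatch = size : addressing mode : L, with
     size here selecting 32 bit (S), 64 bit (D) or 128 bit (Q)
     transfers.  */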
11481   switch (dispatch)
11482     {
11483     case 2: store_pair_float (cpu, offset, Post); return;
11484     case 3: load_pair_float  (cpu, offset, Post); return;
11485     case 4: store_pair_float (cpu, offset, NoWriteBack); return;
11486     case 5: load_pair_float  (cpu, offset, NoWriteBack); return;
11487     case 6: store_pair_float (cpu, offset, Pre); return;
11488     case 7: load_pair_float  (cpu, offset, Pre); return;
11489 
11490     case 10: store_pair_double (cpu, offset, Post); return;
11491     case 11: load_pair_double  (cpu, offset, Post); return;
11492     case 12: store_pair_double (cpu, offset, NoWriteBack); return;
11493     case 13: load_pair_double  (cpu, offset, NoWriteBack); return;
11494     case 14: store_pair_double (cpu, offset, Pre); return;
11495     case 15: load_pair_double  (cpu, offset, Pre); return;
11496 
11497     case 18: store_pair_long_double (cpu, offset, Post); return;
11498     case 19: load_pair_long_double  (cpu, offset, Post); return;
11499     case 20: store_pair_long_double (cpu, offset, NoWriteBack); return;
11500     case 21: load_pair_long_double  (cpu, offset, NoWriteBack); return;
11501     case 22: store_pair_long_double (cpu, offset, Pre); return;
11502     case 23: load_pair_long_double  (cpu, offset, Pre); return;
11503 
11504     default:
11505       HALT_UNALLOC;
11506     }
11507 }
11508 
11509 static inline unsigned
11510 vec_reg (unsigned v, unsigned o)
11511 {
11512   return (v + o) & 0x1F;	/* Wrap modulo the 32 vector registers.  */
11513 }
11514 
11515 /* Load multiple N-element structures to M consecutive registers.  */
11516 static void
11517 vec_load (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M)
11518 {
11519   int      all  = INSTR (30, 30);
11520   unsigned size = INSTR (11, 10);
11521   unsigned vd   = INSTR (4, 0);
11522   unsigned rpt = (N == M) ? 1 : M;
11523   unsigned selem = N;
11524   unsigned i, j, k;
11525 
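  /* LDn (N == M) makes one pass, interleaving N elements across the
     registers; LD1 into M registers (N == 1) instead repeats a simple
     element copy M times.  */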
11526   switch (size)
11527     {
11528     case 0: /* 8-bit operations.  */
11529       for (i = 0; i < rpt; i++)
11530 	for (j = 0; j < (8 + (8 * all)); j++)
11531 	  for (k = 0; k < selem; k++)
11532 	    {
11533 	      aarch64_set_vec_u8 (cpu, vec_reg (vd, i + k), j,
11534 				  aarch64_get_mem_u8 (cpu, address));
11535 	      address += 1;
11536 	    }
11537       return;
11538 
11539     case 1: /* 16-bit operations.  */
11540       for (i = 0; i < rpt; i++)
11541 	for (j = 0; j < (4 + (4 * all)); j++)
11542 	  for (k = 0; k < selem; k++)
11543 	    {
11544 	      aarch64_set_vec_u16 (cpu, vec_reg (vd, i + k), j,
11545 				   aarch64_get_mem_u16 (cpu, address));
11546 	      address += 2;
11547 	    }
11548       return;
11549 
11550     case 2: /* 32-bit operations.  */
11551       for (i = 0; i < rpt; i++)
11552 	for (j = 0; j < (2 + (2 * all)); j++)
11553 	  for (k = 0; k < selem; k++)
11554 	    {
11555 	      aarch64_set_vec_u32 (cpu, vec_reg (vd, i + k), j,
11556 				   aarch64_get_mem_u32 (cpu, address));
11557 	      address += 4;
11558 	    }
11559       return;
11560 
11561     case 3: /* 64-bit operations.  */
11562       for (i = 0; i < rpt; i++)
11563 	for (j = 0; j < (1 + all); j++)
11564 	  for (k = 0; k < selem; k++)
11565 	    {
11566 	      aarch64_set_vec_u64 (cpu, vec_reg (vd, i + k), j,
11567 				   aarch64_get_mem_u64 (cpu, address));
11568 	      address += 8;
11569 	    }
11570       return;
11571     }
11572 }
11573 
11574 /* Load multiple 4-element structures into four consecutive registers.  */
11575 static void
11576 LD4 (sim_cpu *cpu, uint64_t address)
11577 {
11578   vec_load (cpu, address, 4, 4);
11579 }
11580 
11581 /* Load multiple 3-element structures into three consecutive registers.  */
11582 static void
11583 LD3 (sim_cpu *cpu, uint64_t address)
11584 {
11585   vec_load (cpu, address, 3, 3);
11586 }
11587 
11588 /* Load multiple 2-element structures into two consecutive registers.  */
11589 static void
11590 LD2 (sim_cpu *cpu, uint64_t address)
11591 {
11592   vec_load (cpu, address, 2, 2);
11593 }
11594 
11595 /* Load multiple 1-element structures into one register.  */
11596 static void
11597 LD1_1 (sim_cpu *cpu, uint64_t address)
11598 {
11599   vec_load (cpu, address, 1, 1);
11600 }
11601 
11602 /* Load multiple 1-element structures into two registers.  */
11603 static void
11604 LD1_2 (sim_cpu *cpu, uint64_t address)
11605 {
11606   vec_load (cpu, address, 1, 2);
11607 }
11608 
11609 /* Load multiple 1-element structures into three registers.  */
11610 static void
11611 LD1_3 (sim_cpu *cpu, uint64_t address)
11612 {
11613   vec_load (cpu, address, 1, 3);
11614 }
11615 
11616 /* Load multiple 1-element structures into four registers.  */
11617 static void
11618 LD1_4 (sim_cpu *cpu, uint64_t address)
11619 {
11620   vec_load (cpu, address, 1, 4);
11621 }
11622 
11623 /* Store multiple N-element structures from M consecutive registers.  */
11624 static void
11625 vec_store (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M)
11626 {
11627   int      all  = INSTR (30, 30);
11628   unsigned size = INSTR (11, 10);
11629   unsigned vd   = INSTR (4, 0);
11630   unsigned rpt = (N == M) ? 1 : M;
11631   unsigned selem = N;
11632   unsigned i, j, k;
11633 
11634   switch (size)
11635     {
11636     case 0: /* 8-bit operations.  */
11637       for (i = 0; i < rpt; i++)
11638 	for (j = 0; j < (8 + (8 * all)); j++)
11639 	  for (k = 0; k < selem; k++)
11640 	    {
11641 	      aarch64_set_mem_u8
11642 		(cpu, address,
11643 		 aarch64_get_vec_u8 (cpu, vec_reg (vd, i + k), j));
11644 	      address += 1;
11645 	    }
11646       return;
11647 
11648     case 1: /* 16-bit operations.  */
11649       for (i = 0; i < rpt; i++)
11650 	for (j = 0; j < (4 + (4 * all)); j++)
11651 	  for (k = 0; k < selem; k++)
11652 	    {
11653 	      aarch64_set_mem_u16
11654 		(cpu, address,
11655 		 aarch64_get_vec_u16 (cpu, vec_reg (vd, i + k), j));
11656 	      address += 2;
11657 	    }
11658       return;
11659 
11660     case 2: /* 32-bit operations.  */
11661       for (i = 0; i < rpt; i++)
11662 	for (j = 0; j < (2 + (2 * all)); j++)
11663 	  for (k = 0; k < selem; k++)
11664 	    {
11665 	      aarch64_set_mem_u32
11666 		(cpu, address,
11667 		 aarch64_get_vec_u32 (cpu, vec_reg (vd, i + k), j));
11668 	      address += 4;
11669 	    }
11670       return;
11671 
11672     case 3: /* 64-bit operations.  */
11673       for (i = 0; i < rpt; i++)
11674 	for (j = 0; j < (1 + all); j++)
11675 	  for (k = 0; k < selem; k++)
11676 	    {
11677 	      aarch64_set_mem_u64
11678 		(cpu, address,
11679 		 aarch64_get_vec_u64 (cpu, vec_reg (vd, i + k), j));
11680 	      address += 8;
11681 	    }
11682       return;
11683     }
11684 }
11685 
11686 /* Store multiple 4-element structure from four consecutive registers.  */
11687 static void
11688 ST4 (sim_cpu *cpu, uint64_t address)
11689 {
11690   vec_store (cpu, address, 4, 4);
11691 }
11692 
11693 /* Store multiple 3-element structures from three consecutive registers.  */
11694 static void
11695 ST3 (sim_cpu *cpu, uint64_t address)
11696 {
11697   vec_store (cpu, address, 3, 3);
11698 }
11699 
11700 /* Store multiple 2-element structures from two consecutive registers.  */
11701 static void
11702 ST2 (sim_cpu *cpu, uint64_t address)
11703 {
11704   vec_store (cpu, address, 2, 2);
11705 }
11706 
11707 /* Store multiple 1-element structures from one register.  */
11708 static void
11709 ST1_1 (sim_cpu *cpu, uint64_t address)
11710 {
11711   vec_store (cpu, address, 1, 1);
11712 }
11713 
11714 /* Store multiple 1-element structures from two registers.  */
11715 static void
11716 ST1_2 (sim_cpu *cpu, uint64_t address)
11717 {
11718   vec_store (cpu, address, 1, 2);
11719 }
11720 
11721 /* Store multiple 1-element structures from three registers.  */
11722 static void
11723 ST1_3 (sim_cpu *cpu, uint64_t address)
11724 {
11725   vec_store (cpu, address, 1, 3);
11726 }
11727 
11728 /* Store multiple 1-element structures from four registers.  */
11729 static void
11730 ST1_4 (sim_cpu *cpu, uint64_t address)
11731 {
11732   vec_store (cpu, address, 1, 4);
11733 }
11734 
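/* Shared helper for the LD/ST single structure forms: instr[15,14]
   selects the element width, while the remaining low bits of
   Q (full) : S : size give the lane number.  On exit SIZE is the log2
   of the element width in bytes and LANE is the selected lane.  */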
11735 #define LDn_STn_SINGLE_LANE_AND_SIZE()				\
11736   do								\
11737     {								\
11738       switch (INSTR (15, 14))					\
11739 	{							\
11740 	case 0:							\
11741 	  lane = (full << 3) | (s << 2) | size;			\
11742 	  size = 0;						\
11743 	  break;						\
11744 								\
11745 	case 1:							\
11746 	  if ((size & 1) == 1)					\
11747 	    HALT_UNALLOC;					\
11748 	  lane = (full << 2) | (s << 1) | (size >> 1);		\
11749 	  size = 1;						\
11750 	  break;						\
11751 								\
11752 	case 2:							\
11753 	  if ((size & 2) == 2)					\
11754 	    HALT_UNALLOC;					\
11755 								\
11756 	  if ((size & 1) == 0)					\
11757 	    {							\
11758 	      lane = (full << 1) | s;				\
11759 	      size = 2;						\
11760 	    }							\
11761 	  else							\
11762 	    {							\
11763 	      if (s)						\
11764 		HALT_UNALLOC;					\
11765 	      lane = full;					\
11766 	      size = 3;						\
11767 	    }							\
11768 	  break;						\
11769 								\
11770 	default:						\
11771 	  HALT_UNALLOC;						\
11772 	}							\
11773     }								\
11774   while (0)
11775 
11776 /* Load single structure into one lane of N registers.  */
11777 static void
11778 do_vec_LDn_single (sim_cpu *cpu, uint64_t address)
11779 {
11780   /* instr[31]    = 0
11781      instr[30]    = element selector 0=>half, 1=>all elements
11782      instr[29,24] = 00 1101
11783      instr[23]    = 0=>simple, 1=>post
11784      instr[22]    = 1
11785      instr[21]    = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
11786      instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11787                       11111 (immediate post inc)
11788      instr[15,13] = opcode
11789      instr[12]    = S, used for lane number
11790      instr[11,10] = size, also used for lane number
11791      instr[9,5]   = address
11792      instr[4,0]   = Vd  */
11793 
11794   unsigned full = INSTR (30, 30);
11795   unsigned vd = INSTR (4, 0);
11796   unsigned size = INSTR (11, 10);
11797   unsigned s = INSTR (12, 12);
11798   int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11799   int lane = 0;
11800   int i;
11801 
11802   NYI_assert (29, 24, 0x0D);
11803   NYI_assert (22, 22, 1);
11804 
11805   /* Compute the lane number first (using size), and then compute size.  */
11806   LDn_STn_SINGLE_LANE_AND_SIZE ();
11807 
11808   for (i = 0; i < nregs; i++)
11809     switch (size)
11810       {
11811       case 0:
11812 	{
11813 	  uint8_t val = aarch64_get_mem_u8 (cpu, address + i);
11814 	  aarch64_set_vec_u8 (cpu, vd + i, lane, val);
11815 	  break;
11816 	}
11817 
11818       case 1:
11819 	{
11820 	  uint16_t val = aarch64_get_mem_u16 (cpu, address + (i * 2));
11821 	  aarch64_set_vec_u16 (cpu, vd + i, lane, val);
11822 	  break;
11823 	}
11824 
11825       case 2:
11826 	{
11827 	  uint32_t val = aarch64_get_mem_u32 (cpu, address + (i * 4));
11828 	  aarch64_set_vec_u32 (cpu, vd + i, lane, val);
11829 	  break;
11830 	}
11831 
11832       case 3:
11833 	{
11834 	  uint64_t val = aarch64_get_mem_u64 (cpu, address + (i * 8));
11835 	  aarch64_set_vec_u64 (cpu, vd + i, lane, val);
11836 	  break;
11837 	}
11838       }
11839 }
11840 
11841 /* Store single structure from one lane from N registers.  */
11842 static void
11843 do_vec_STn_single (sim_cpu *cpu, uint64_t address)
11844 {
11845   /* instr[31]    = 0
11846      instr[30]    = element selector 0=>half, 1=>all elements
11847      instr[29,24] = 00 1101
11848      instr[23]    = 0=>simple, 1=>post
11849      instr[22]    = 0
11850      instr[21]    = width: ST1-or-ST3 (0) / ST2-or-ST4 (1)
11851      instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11852                       11111 (immediate post inc)
11853      instr[15,13] = opcode
11854      instr[12]    = S, used for lane number
11855      instr[11,10] = size, also used for lane number
11856      instr[9,5]   = address
11857      instr[4,0]   = Vd  */
11858 
11859   unsigned full = INSTR (30, 30);
11860   unsigned vd = INSTR (4, 0);
11861   unsigned size = INSTR (11, 10);
11862   unsigned s = INSTR (12, 12);
11863   int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11864   int lane = 0;
11865   int i;
11866 
11867   NYI_assert (29, 24, 0x0D);
11868   NYI_assert (22, 22, 0);
11869 
11870   /* Compute the lane number first (using size), and then compute size.  */
11871   LDn_STn_SINGLE_LANE_AND_SIZE ();
11872 
11873   for (i = 0; i < nregs; i++)
11874     switch (size)
11875       {
11876       case 0:
11877 	{
11878 	  uint8_t val = aarch64_get_vec_u8 (cpu, vd + i, lane);
11879 	  aarch64_set_mem_u8 (cpu, address + i, val);
11880 	  break;
11881 	}
11882 
11883       case 1:
11884 	{
11885 	  uint16_t val = aarch64_get_vec_u16 (cpu, vd + i, lane);
11886 	  aarch64_set_mem_u16 (cpu, address + (i * 2), val);
11887 	  break;
11888 	}
11889 
11890       case 2:
11891 	{
11892 	  uint32_t val = aarch64_get_vec_u32 (cpu, vd + i, lane);
11893 	  aarch64_set_mem_u32 (cpu, address + (i * 4), val);
11894 	  break;
11895 	}
11896 
11897       case 3:
11898 	{
11899 	  uint64_t val = aarch64_get_vec_u64 (cpu, vd + i, lane);
11900 	  aarch64_set_mem_u64 (cpu, address + (i * 8), val);
11901 	  break;
11902 	}
11903       }
11904 }
11905 
11906 /* Load single structure into all lanes of N registers.  */
11907 static void
11908 do_vec_LDnR (sim_cpu *cpu, uint64_t address)
11909 {
11910   /* instr[31]    = 0
11911      instr[30]    = element selector 0=>half, 1=>all elements
11912      instr[29,24] = 00 1101
11913      instr[23]    = 0=>simple, 1=>post
11914      instr[22]    = 1
11915      instr[21]    = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1)
11916      instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11917                       11111 (immediate post inc)
11918      instr[15,14] = 11
11919      instr[13]    = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1)
11920      instr[12]    = 0
11921      instr[11,10] = element size 00=> byte(b), 01=> half(h),
11922                                  10=> word(s), 11=> double(d)
11923      instr[9,5]   = address
11924      instr[4,0]   = Vd  */
11925 
11926   unsigned full = INSTR (30, 30);
11927   unsigned vd = INSTR (4, 0);
11928   unsigned size = INSTR (11, 10);
11929   int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11930   int i, n;
11931 
11932   NYI_assert (29, 24, 0x0D);
11933   NYI_assert (22, 22, 1);
11934   NYI_assert (15, 14, 3);
11935   NYI_assert (12, 12, 0);
11936 
11937   for (n = 0; n < nregs; n++)
11938     switch (size)
11939       {
11940       case 0:
11941 	{
11942 	  uint8_t val = aarch64_get_mem_u8 (cpu, address + n);
11943 	  for (i = 0; i < (full ? 16 : 8); i++)
11944 	    aarch64_set_vec_u8 (cpu, vd + n, i, val);
11945 	  break;
11946 	}
11947 
11948       case 1:
11949 	{
11950 	  uint16_t val = aarch64_get_mem_u16 (cpu, address + (n * 2));
11951 	  for (i = 0; i < (full ? 8 : 4); i++)
11952 	    aarch64_set_vec_u16 (cpu, vd + n, i, val);
11953 	  break;
11954 	}
11955 
11956       case 2:
11957 	{
11958 	  uint32_t val = aarch64_get_mem_u32 (cpu, address + (n * 4));
11959 	  for (i = 0; i < (full ? 4 : 2); i++)
11960 	    aarch64_set_vec_u32 (cpu, vd + n, i, val);
11961 	  break;
11962 	}
11963 
11964       case 3:
11965 	{
11966 	  uint64_t val = aarch64_get_mem_u64 (cpu, address + (n * 8));
11967 	  for (i = 0; i < (full ? 2 : 1); i++)
11968 	    aarch64_set_vec_u64 (cpu, vd + n, i, val);
11969 	  break;
11970 	}
11971 
11972       default:
11973 	HALT_UNALLOC;
11974       }
11975 }
11976 
11977 static void
11978 do_vec_load_store (sim_cpu *cpu)
11979 {
11980   /* {LD|ST}<N>   {Vd..Vd+N}, vaddr
11981 
11982      instr[31]    = 0
11983      instr[30]    = element selector 0=>half, 1=>all elements
11984      instr[29,25] = 00110
11985      instr[24]    = 0=>multiple struct, 1=>single struct
11986      instr[23]    = 0=>simple, 1=>post
11987      instr[22]    = 0=>store, 1=>load
11988      instr[21]    = 0 (LDn) / small(0)-large(1) selector (LDnR)
11989      instr[20,16] = 00000 (simple), Vinc (reg-post-inc, no SP),
11990                     11111 (immediate post inc)
11991      instr[15,12] = elements and destinations.  eg for load:
11992                      0000=>LD4 => load multiple 4-element to
11993 		     four consecutive registers
11994                      0100=>LD3 => load multiple 3-element to
11995 		     three consecutive registers
11996                      1000=>LD2 => load multiple 2-element to
11997 		     two consecutive registers
11998                      0010=>LD1 => load multiple 1-element to
11999 		     four consecutive registers
12000                      0110=>LD1 => load multiple 1-element to
12001 		     three consecutive registers
12002                      1010=>LD1 => load multiple 1-element to
12003 		     two consecutive registers
12004                      0111=>LD1 => load multiple 1-element to
12005 		     one register
12006                      1100=>LD1R,LD2R
12007                      1110=>LD3R,LD4R
12008      instr[11,10] = element size 00=> byte(b), 01=> half(h),
12009                                  10=> word(s), 11=> double(d)
12010      instr[9,5]   = Vn, can be SP
12011      instr[4,0]   = Vd  */
12012 
12013   int single;
12014   int post;
12015   int load;
12016   unsigned vn;
12017   uint64_t address;
12018   int type;
12019 
12020   if (INSTR (31, 31) != 0 || INSTR (29, 25) != 0x06)
12021     HALT_NYI;
12022 
12023   single = INSTR (24, 24);
12024   post = INSTR (23, 23);
12025   load = INSTR (22, 22);
12026   type = INSTR (15, 12);
12027   vn = INSTR (9, 5);
12028   address = aarch64_get_reg_u64 (cpu, vn, SP_OK);
12029 
12030   if (! single && INSTR (21, 21) != 0)
12031     HALT_UNALLOC;
12032 
12033   if (post)
12034     {
12035       unsigned vm = INSTR (20, 16);
12036 
12037       if (vm == R31)
12038 	{
12039 	  unsigned sizeof_operation;
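	  /* Post-index by immediate (Rm == 0b11111): the base register
	     is advanced by the total number of bytes transferred.  */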
12040 
12041 	  if (single)
12042 	    {
12043 	      if ((type >= 0) && (type <= 11))
12044 		{
12045 		  int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
12046 		  switch (INSTR (15, 14))
12047 		    {
12048 		    case 0:
12049 		      sizeof_operation = nregs * 1;
12050 		      break;
12051 		    case 1:
12052 		      sizeof_operation = nregs * 2;
12053 		      break;
12054 		    case 2:
12055 		      if (INSTR (10, 10) == 0)
12056 			sizeof_operation = nregs * 4;
12057 		      else
12058 			sizeof_operation = nregs * 8;
12059 		      break;
12060 		    default:
12061 		      HALT_UNALLOC;
12062 		    }
12063 		}
12064 	      else if (type == 0xC)
12065 		{
12066 		  sizeof_operation = INSTR (21, 21) ? 2 : 1;
12067 		  sizeof_operation <<= INSTR (11, 10);
12068 		}
12069 	      else if (type == 0xE)
12070 		{
12071 		  sizeof_operation = INSTR (21, 21) ? 4 : 3;
12072 		  sizeof_operation <<= INSTR (11, 10);
12073 		}
12074 	      else
12075 		HALT_UNALLOC;
12076 	    }
12077 	  else
12078 	    {
12079 	      switch (type)
12080 		{
12081 		case 0: sizeof_operation = 32; break;
12082 		case 4: sizeof_operation = 24; break;
12083 		case 8: sizeof_operation = 16; break;
12084 
12085 		case 7:
12086 		  /* One register, immediate offset variant.  */
12087 		  sizeof_operation = 8;
12088 		  break;
12089 
12090 		case 10:
12091 		  /* Two registers, immediate offset variant.  */
12092 		  sizeof_operation = 16;
12093 		  break;
12094 
12095 		case 6:
12096 		  /* Three registers, immediate offset variant.  */
12097 		  sizeof_operation = 24;
12098 		  break;
12099 
12100 		case 2:
12101 		  /* Four registers, immediate offset variant.  */
12102 		  sizeof_operation = 32;
12103 		  break;
12104 
12105 		default:
12106 		  HALT_UNALLOC;
12107 		}
12108 
12109 	      if (INSTR (30, 30))
12110 		sizeof_operation *= 2;
12111 	    }
12112 
12113 	  aarch64_set_reg_u64 (cpu, vn, SP_OK, address + sizeof_operation);
12114 	}
12115       else
12116 	aarch64_set_reg_u64 (cpu, vn, SP_OK,
12117 			     address + aarch64_get_reg_u64 (cpu, vm, NO_SP));
12118     }
12119   else
12120     {
12121       NYI_assert (20, 16, 0);
12122     }
12123 
12124   if (single)
12125     {
12126       if (load)
12127 	{
12128 	  if ((type >= 0) && (type <= 11))
12129 	    do_vec_LDn_single (cpu, address);
12130 	  else if ((type == 0xC) || (type == 0xE))
12131 	    do_vec_LDnR (cpu, address);
12132 	  else
12133 	    HALT_UNALLOC;
12134 	  return;
12135 	}
12136 
12137       /* Stores.  */
12138       if ((type >= 0) && (type <= 11))
12139 	{
12140 	  do_vec_STn_single (cpu, address);
12141 	  return;
12142 	}
12143 
12144       HALT_UNALLOC;
12145     }
12146 
12147   if (load)
12148     {
12149       switch (type)
12150 	{
12151 	case 0:  LD4 (cpu, address); return;
12152 	case 4:  LD3 (cpu, address); return;
12153 	case 8:  LD2 (cpu, address); return;
12154 	case 2:  LD1_4 (cpu, address); return;
12155 	case 6:  LD1_3 (cpu, address); return;
12156 	case 10: LD1_2 (cpu, address); return;
12157 	case 7:  LD1_1 (cpu, address); return;
12158 
12159 	default:
12160 	  HALT_UNALLOC;
12161 	}
12162     }
12163 
12164   /* Stores.  */
12165   switch (type)
12166     {
12167     case 0:  ST4 (cpu, address); return;
12168     case 4:  ST3 (cpu, address); return;
12169     case 8:  ST2 (cpu, address); return;
12170     case 2:  ST1_4 (cpu, address); return;
12171     case 6:  ST1_3 (cpu, address); return;
12172     case 10: ST1_2 (cpu, address); return;
12173     case 7:  ST1_1 (cpu, address); return;
12174     default:
12175       HALT_UNALLOC;
12176     }
12177 }
12178 
12179 static void
12180 dexLdSt (sim_cpu *cpu)
12181 {
12182   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
12183      assert  group == GROUP_LDST_0100 || group == GROUP_LDST_0110 ||
12184              group == GROUP_LDST_1100 || group == GROUP_LDST_1110
12185      bits [29,28:26] of a LS are the secondary dispatch vector.  */
12186   uint32_t group2 = dispatchLS (aarch64_get_instr (cpu));
12187 
12188   switch (group2)
12189     {
12190     case LS_EXCL_000:
12191       dexLoadExclusive (cpu); return;
12192 
12193     case LS_LIT_010:
12194     case LS_LIT_011:
12195       dexLoadLiteral (cpu); return;
12196 
12197     case LS_OTHER_110:
12198     case LS_OTHER_111:
12199       dexLoadOther (cpu); return;
12200 
12201     case LS_ADVSIMD_001:
12202       do_vec_load_store (cpu); return;
12203 
12204     case LS_PAIR_100:
12205       dex_load_store_pair_gr (cpu); return;
12206 
12207     case LS_PAIR_101:
12208       dex_load_store_pair_fp (cpu); return;
12209 
12210     default:
12211       /* Should never reach here.  */
12212       HALT_NYI;
12213     }
12214 }
12215 
12216 /* Specific decode and execute for group Data Processing Register.  */
12217 
12218 static void
12219 dexLogicalShiftedRegister (sim_cpu *cpu)
12220 {
12221   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
12222      instr[30,29] = op
12223      instr[28,24] = 01010
12224      instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> ROR
12225      instr[21]    = N
12226      instr[20,16] = Rm
12227      instr[15,10] = count : must be 0xxxxx for 32 bit
12228      instr[9,5]   = Rn
12229      instr[4,0]   = Rd  */
12230 
12231   uint32_t size      = INSTR (31, 31);
12232   Shift    shiftType = INSTR (23, 22);
12233   uint32_t count     = INSTR (15, 10);
12234 
12235   /* 32 bit operations must have count[5] = 0,
12236      otherwise the instruction is UNALLOC.  */
12237   if (size == 0 && uimm (count, 5, 5))
12238     HALT_UNALLOC;
12239 
12240   /* Dispatch on size:op:N.  */
12241   switch ((INSTR (31, 29) << 1) | INSTR (21, 21))
12242     {
12243     case 0: and32_shift  (cpu, shiftType, count); return;
12244     case 1: bic32_shift  (cpu, shiftType, count); return;
12245     case 2: orr32_shift  (cpu, shiftType, count); return;
12246     case 3: orn32_shift  (cpu, shiftType, count); return;
12247     case 4: eor32_shift  (cpu, shiftType, count); return;
12248     case 5: eon32_shift  (cpu, shiftType, count); return;
12249     case 6: ands32_shift (cpu, shiftType, count); return;
12250     case 7: bics32_shift (cpu, shiftType, count); return;
12251     case 8: and64_shift  (cpu, shiftType, count); return;
12252     case 9: bic64_shift  (cpu, shiftType, count); return;
12253     case 10:orr64_shift  (cpu, shiftType, count); return;
12254     case 11:orn64_shift  (cpu, shiftType, count); return;
12255     case 12:eor64_shift  (cpu, shiftType, count); return;
12256     case 13:eon64_shift  (cpu, shiftType, count); return;
12257     case 14:ands64_shift (cpu, shiftType, count); return;
12258     case 15:bics64_shift (cpu, shiftType, count); return;
12259     }
12260 }
12261 
12262 /* 32 bit conditional select.  */
12263 static void
12264 csel32 (sim_cpu *cpu, CondCode cc)
12265 {
12266   unsigned rm = INSTR (20, 16);
12267   unsigned rn = INSTR (9, 5);
12268   unsigned rd = INSTR (4, 0);
12269 
12270   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12271 		       testConditionCode (cpu, cc)
12272 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12273 		       : aarch64_get_reg_u32 (cpu, rm, NO_SP));
12274 }
12275 
12276 /* 64 bit conditional select.  */
12277 static void
12278 csel64 (sim_cpu *cpu, CondCode cc)
12279 {
12280   unsigned rm = INSTR (20, 16);
12281   unsigned rn = INSTR (9, 5);
12282   unsigned rd = INSTR (4, 0);
12283 
12284   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12285 		       testConditionCode (cpu, cc)
12286 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12287 		       : aarch64_get_reg_u64 (cpu, rm, NO_SP));
12288 }
12289 
12290 /* 32 bit conditional increment.  */
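/* N.B. CSET and CINC are assembler aliases of CSINC: CSET uses XZR for
   both sources and CINC repeats Rn, each with the condition inverted.  */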
12291 static void
12292 csinc32 (sim_cpu *cpu, CondCode cc)
12293 {
12294   unsigned rm = INSTR (20, 16);
12295   unsigned rn = INSTR (9, 5);
12296   unsigned rd = INSTR (4, 0);
12297 
12298   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12299 		       testConditionCode (cpu, cc)
12300 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12301 		       : aarch64_get_reg_u32 (cpu, rm, NO_SP) + 1);
12302 }
12303 
12304 /* 64 bit conditional increment.  */
12305 static void
12306 csinc64 (sim_cpu *cpu, CondCode cc)
12307 {
12308   unsigned rm = INSTR (20, 16);
12309   unsigned rn = INSTR (9, 5);
12310   unsigned rd = INSTR (4, 0);
12311 
12312   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12313 		       testConditionCode (cpu, cc)
12314 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12315 		       : aarch64_get_reg_u64 (cpu, rm, NO_SP) + 1);
12316 }
12317 
12318 /* 32 bit conditional invert.  */
12319 static void
12320 csinv32 (sim_cpu *cpu, CondCode cc)
12321 {
12322   unsigned rm = INSTR (20, 16);
12323   unsigned rn = INSTR (9, 5);
12324   unsigned rd = INSTR (4, 0);
12325 
12326   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12327 		       testConditionCode (cpu, cc)
12328 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12329 		       : ~ aarch64_get_reg_u32 (cpu, rm, NO_SP));
12330 }
12331 
12332 /* 64 bit conditional invert.  */
12333 static void
12334 csinv64 (sim_cpu *cpu, CondCode cc)
12335 {
12336   unsigned rm = INSTR (20, 16);
12337   unsigned rn = INSTR (9, 5);
12338   unsigned rd = INSTR (4, 0);
12339 
12340   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12341 		       testConditionCode (cpu, cc)
12342 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12343 		       : ~ aarch64_get_reg_u64 (cpu, rm, NO_SP));
12344 }
12345 
12346 /* 32 bit conditional negate.  */
12347 static void
12348 csneg32 (sim_cpu *cpu, CondCode cc)
12349 {
12350   unsigned rm = INSTR (20, 16);
12351   unsigned rn = INSTR (9, 5);
12352   unsigned rd = INSTR (4, 0);
12353 
12354   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12355 		       testConditionCode (cpu, cc)
12356 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12357 		       : - aarch64_get_reg_u32 (cpu, rm, NO_SP));
12358 }
12359 
12360 /* 64 bit conditional negate.  */
12361 static void
12362 csneg64 (sim_cpu *cpu, CondCode cc)
12363 {
12364   unsigned rm = INSTR (20, 16);
12365   unsigned rn = INSTR (9, 5);
12366   unsigned rd = INSTR (4, 0);
12367 
12368   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12369 		       testConditionCode (cpu, cc)
12370 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12371 		       : - aarch64_get_reg_u64 (cpu, rm, NO_SP));
12372 }
12373 
12374 static void
12375 dexCondSelect (sim_cpu *cpu)
12376 {
12377   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
12378      instr[30],instr[11,10] = op : 000 ==> CSEL, 001 ==> CSINC,
12379                                    100 ==> CSINV, 101 ==> CSNEG,
12380                                    _1_ ==> UNALLOC
12381      instr[29]    = S : 0 ==> ok, 1 ==> UNALLOC
12382      instr[28,21] = 11010100
12383      instr[15,12] = cond
12384      instr[9,5]   = Rn, instr[4,0] = Rd  */
12385 
12386   CondCode cc = INSTR (15, 12);
12387   uint32_t S = INSTR (29, 29);
12388   uint32_t op2 = INSTR (11, 10);
12389 
12390   if (S == 1)
12391     HALT_UNALLOC;
12392 
12393   if (op2 & 0x2)
12394     HALT_UNALLOC;
12395 
12396   switch ((INSTR (31, 30) << 1) | op2)
12397     {
12398     case 0: csel32  (cpu, cc); return;
12399     case 1: csinc32 (cpu, cc); return;
12400     case 2: csinv32 (cpu, cc); return;
12401     case 3: csneg32 (cpu, cc); return;
12402     case 4: csel64  (cpu, cc); return;
12403     case 5: csinc64 (cpu, cc); return;
12404     case 6: csinv64 (cpu, cc); return;
12405     case 7: csneg64 (cpu, cc); return;
12406     }
12407 }
12408 
12409 /* Some helpers for counting leading 1 or 0 bits.  */
12410 
12411 /* Counts the number of leading bits which are the same
12412    in a 32 bit value in the range 1 to 32.  */
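/* The count is found by binary search, narrowing a mask of candidate
   sign bits.  E.g. leading32 (0) == 32, leading32 (1) == 31 and
   leading32 (0x80000000) == 1.  */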
12413 static uint32_t
12414 leading32 (uint32_t value)
12415 {
12416   int32_t mask = 0xffff0000;
12417   uint32_t count = 16; /* Counts number of bits set in mask.  */
12418   uint32_t lo = 1;    /* Lower bound for number of sign bits.  */
12419   uint32_t hi = 32;   /* Upper bound for number of sign bits.  */
12420 
12421   while (lo + 1 < hi)
12422     {
12423       int32_t test = (value & mask);
12424 
12425       if (test == 0 || test == mask)
12426 	{
12427 	  lo = count;
12428 	  count = (lo + hi) / 2;
12429 	  mask >>= (count - lo);
12430 	}
12431       else
12432 	{
12433 	  hi = count;
12434 	  count = (lo + hi) / 2;
12435 	  mask <<= hi - count;
12436 	}
12437     }
12438 
12439   if (lo != hi)
12440     {
12441       int32_t test;
12442 
12443       mask >>= 1;
12444       test = (value & mask);
12445 
12446       if (test == 0 || test == mask)
12447 	count = hi;
12448       else
12449 	count = lo;
12450     }
12451 
12452   return count;
12453 }
12454 
12455 /* Counts the number of leading bits which are the same
12456    in a 64 bit value in the range 1 to 64.  */
12457 static uint64_t
12458 leading64 (uint64_t value)
12459 {
12460   int64_t mask = 0xffffffff00000000LL;
12461   uint64_t count = 32; /* Counts number of bits set in mask.  */
12462   uint64_t lo = 1;     /* Lower bound for number of sign bits.  */
12463   uint64_t hi = 64;    /* Upper bound for number of sign bits.  */
12464 
12465   while (lo + 1 < hi)
12466     {
12467       int64_t test = (value & mask);
12468 
12469       if (test == 0 || test == mask)
12470 	{
12471 	  lo = count;
12472 	  count = (lo + hi) / 2;
12473 	  mask >>= (count - lo);
12474 	}
12475       else
12476 	{
12477 	  hi = count;
12478 	  count = (lo + hi) / 2;
12479 	  mask <<= hi - count;
12480 	}
12481     }
12482 
12483   if (lo != hi)
12484     {
12485       int64_t test;
12486 
12487       mask >>= 1;
12488       test = (value & mask);
12489 
12490       if (test == 0 || test == mask)
12491 	count = hi;
12492       else
12493 	count = lo;
12494     }
12495 
12496   return count;
12497 }
12498 
12499 /* Bit operations.  */
12500 /* N.B register args may not be SP.  */
12501 
12502 /* 32 bit count leading sign bits.  */
12503 static void
12504 cls32 (sim_cpu *cpu)
12505 {
12506   unsigned rn = INSTR (9, 5);
12507   unsigned rd = INSTR (4, 0);
12508 
12509   /* N.B. the result needs to exclude the leading bit.  */
12510   aarch64_set_reg_u64
12511     (cpu, rd, NO_SP, leading32 (aarch64_get_reg_u32 (cpu, rn, NO_SP)) - 1);
12512 }
12513 
12514 /* 64 bit count leading sign bits.  */
12515 static void
12516 cls64 (sim_cpu *cpu)
12517 {
12518   unsigned rn = INSTR (9, 5);
12519   unsigned rd = INSTR (4, 0);
12520 
12521   /* N.B. the result needs to exclude the leading bit.  */
12522   aarch64_set_reg_u64
12523     (cpu, rd, NO_SP, leading64 (aarch64_get_reg_u64 (cpu, rn, NO_SP)) - 1);
12524 }
12525 
12526 /* 32 bit count leading zero bits.  */
12527 static void
12528 clz32 (sim_cpu *cpu)
12529 {
12530   unsigned rn = INSTR (9, 5);
12531   unsigned rd = INSTR (4, 0);
12532   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12533 
12534   /* if the sign (top) bit is set then the count is 0.  */
12535   if (pick32 (value, 31, 31))
12536     aarch64_set_reg_u64 (cpu, rd, NO_SP, 0L);
12537   else
12538     aarch64_set_reg_u64 (cpu, rd, NO_SP, leading32 (value));
12539 }
12540 
12541 /* 64 bit count leading zero bits.  */
12542 static void
12543 clz64 (sim_cpu *cpu)
12544 {
12545   unsigned rn = INSTR (9, 5);
12546   unsigned rd = INSTR (4, 0);
12547   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12548 
12549   /* if the sign (top) bit is set then the count is 0.  */
12550   if (pick64 (value, 63, 63))
12551     aarch64_set_reg_u64 (cpu, rd, NO_SP, 0L);
12552   else
12553     aarch64_set_reg_u64 (cpu, rd, NO_SP, leading64 (value));
12554 }
12555 
12556 /* 32 bit reverse bits.  */
12557 static void
12558 rbit32 (sim_cpu *cpu)
12559 {
12560   unsigned rn = INSTR (9, 5);
12561   unsigned rd = INSTR (4, 0);
12562   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12563   uint32_t result = 0;
12564   int i;
12565 
12566   for (i = 0; i < 32; i++)
12567     {
12568       result <<= 1;
12569       result |= (value & 1);
12570       value >>= 1;
12571     }
12572   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12573 }
12574 
12575 /* 64 bit reverse bits.  */
12576 static void
12577 rbit64 (sim_cpu *cpu)
12578 {
12579   unsigned rn = INSTR (9, 5);
12580   unsigned rd = INSTR (4, 0);
12581   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12582   uint64_t result = 0;
12583   int i;
12584 
12585   for (i = 0; i < 64; i++)
12586     {
12587       result <<= 1;
12588       result |= (value & 1UL);
12589       value >>= 1;
12590     }
12591   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12592 }
12593 
12594 /* 32 bit reverse bytes.  */
12595 static void
12596 rev32 (sim_cpu *cpu)
12597 {
12598   unsigned rn = INSTR (9, 5);
12599   unsigned rd = INSTR (4, 0);
12600   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12601   uint32_t result = 0;
12602   int i;
12603 
12604   for (i = 0; i < 4; i++)
12605     {
12606       result <<= 8;
12607       result |= (value & 0xff);
12608       value >>= 8;
12609     }
12610   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12611 }
12612 
12613 /* 64 bit reverse bytes.  */
12614 static void
12615 rev64 (sim_cpu *cpu)
12616 {
12617   unsigned rn = INSTR (9, 5);
12618   unsigned rd = INSTR (4, 0);
12619   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12620   uint64_t result = 0;
12621   int i;
12622 
12623   for (i = 0; i < 8; i++)
12624     {
12625       result <<= 8;
12626       result |= (value & 0xffULL);
12627       value >>= 8;
12628     }
12629   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12630 }
12631 
12632 /* 32 bit reverse shorts.  */
12633 /* N.B. this reverses the order of the bytes in each half word.  */
12634 static void
12635 revh32 (sim_cpu *cpu)
12636 {
12637   unsigned rn = INSTR (9, 5);
12638   unsigned rd = INSTR (4, 0);
12639   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12640   uint32_t result = 0;
12641   int i;
12642 
12643   for (i = 0; i < 2; i++)
12644     {
12645       result <<= 8;
12646       result |= (value & 0x00ff00ff);
12647       value >>= 8;
12648     }
12649   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12650 }
12651 
12652 /* 64 bit reverse shorts.  */
12653 /* N.B. this reverses the order of the bytes in each half word.  */
12654 static void
12655 revh64 (sim_cpu *cpu)
12656 {
12657   unsigned rn = INSTR (9, 5);
12658   unsigned rd = INSTR (4, 0);
12659   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12660   uint64_t result = 0;
12661   int i;
12662 
12663   for (i = 0; i < 2; i++)
12664     {
12665       result <<= 8;
12666       result |= (value & 0x00ff00ff00ff00ffULL);
12667       value >>= 8;
12668     }
12669   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12670 }
12671 
12672 static void
12673 dexDataProc1Source (sim_cpu *cpu)
12674 {
12675   /* instr[30]    = 1
12676      instr[28,21] = 11010110
12677      instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
12678      instr[29]    = S : 0 ==> ok, 1 ==> UNALLOC
12679      instr[20,16] = opcode2 : 00000 ==> ok, ow ==> UNALLOC
12680      instr[15,10] = opcode : 000000 ==> RBIT, 000001 ==> REV16,
12681                              000010 ==> REV (REV32 for 64 bit), 000011 ==> REV (64 bit only),
12682                              000100 ==> CLZ, 000101 ==> CLS
12683                              ow ==> UNALLOC
12684      instr[9,5]   = rn : may not be SP
12685      instr[4,0]   = rd : may not be SP.  */
12686 
12687   uint32_t S = INSTR (29, 29);
12688   uint32_t opcode2 = INSTR (20, 16);
12689   uint32_t opcode = INSTR (15, 10);
12690   uint32_t dispatch = ((INSTR (31, 31) << 3) | opcode);
12691 
12692   if (S == 1)
12693     HALT_UNALLOC;
12694 
12695   if (opcode2 != 0)
12696     HALT_UNALLOC;
12697 
12698   if (opcode & 0x38)
12699     HALT_UNALLOC;
12700 
12701   switch (dispatch)
12702     {
12703     case 0: rbit32 (cpu); return;
12704     case 1: revh32 (cpu); return;
12705     case 2: rev32 (cpu); return;
12706     case 4: clz32 (cpu); return;
12707     case 5: cls32 (cpu); return;
12708     case 8: rbit64 (cpu); return;
12709     case 9: revh64 (cpu); return;
12710     case 10:rev32 (cpu); return; /* FIXME: REV32 should reverse the bytes in each word of Xn, but rev32 only handles the low word.  */
12711     case 11:rev64 (cpu); return;
12712     case 12:clz64 (cpu); return;
12713     case 13:cls64 (cpu); return;
12714     default: HALT_UNALLOC;
12715     }
12716 }
12717 
12718 /* Variable shift.
12719    Shifts by count supplied in register.
12720    N.B register args may not be SP.
12721    These all use the shifted auxiliary function for
12722    simplicity and clarity.  Writing the actual shift
12723    inline would avoid a branch and so be faster but
12724    would also necessitate getting signs right.  */
12725 
12726 /* 32 bit arithmetic shift right.  */
12727 static void
12728 asrv32 (sim_cpu *cpu)
12729 {
12730   unsigned rm = INSTR (20, 16);
12731   unsigned rn = INSTR (9, 5);
12732   unsigned rd = INSTR (4, 0);
12733 
12734   aarch64_set_reg_u64
12735     (cpu, rd, NO_SP,
12736      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), ASR,
12737 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12738 }
12739 
12740 /* 64 bit arithmetic shift right.  */
12741 static void
12742 asrv64 (sim_cpu *cpu)
12743 {
12744   unsigned rm = INSTR (20, 16);
12745   unsigned rn = INSTR (9, 5);
12746   unsigned rd = INSTR (4, 0);
12747 
12748   aarch64_set_reg_u64
12749     (cpu, rd, NO_SP,
12750      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), ASR,
12751 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12752 }
12753 
12754 /* 32 bit logical shift left.  */
12755 static void
12756 lslv32 (sim_cpu *cpu)
12757 {
12758   unsigned rm = INSTR (20, 16);
12759   unsigned rn = INSTR (9, 5);
12760   unsigned rd = INSTR (4, 0);
12761 
12762   aarch64_set_reg_u64
12763     (cpu, rd, NO_SP,
12764      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), LSL,
12765 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12766 }
12767 
12768 /* 64 bit logical shift left.  */
12769 static void
12770 lslv64 (sim_cpu *cpu)
12771 {
12772   unsigned rm = INSTR (20, 16);
12773   unsigned rn = INSTR (9, 5);
12774   unsigned rd = INSTR (4, 0);
12775 
12776   aarch64_set_reg_u64
12777     (cpu, rd, NO_SP,
12778      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), LSL,
12779 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12780 }
12781 
12782 /* 32 bit logical shift right.  */
12783 static void
12784 lsrv32 (sim_cpu *cpu)
12785 {
12786   unsigned rm = INSTR (20, 16);
12787   unsigned rn = INSTR (9, 5);
12788   unsigned rd = INSTR (4, 0);
12789 
12790   aarch64_set_reg_u64
12791     (cpu, rd, NO_SP,
12792      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), LSR,
12793 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12794 }
12795 
12796 /* 64 bit logical shift right.  */
12797 static void
12798 lsrv64 (sim_cpu *cpu)
12799 {
12800   unsigned rm = INSTR (20, 16);
12801   unsigned rn = INSTR (9, 5);
12802   unsigned rd = INSTR (4, 0);
12803 
12804   aarch64_set_reg_u64
12805     (cpu, rd, NO_SP,
12806      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), LSR,
12807 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12808 }
12809 
12810 /* 32 bit rotate right.  */
12811 static void
12812 rorv32 (sim_cpu *cpu)
12813 {
12814   unsigned rm = INSTR (20, 16);
12815   unsigned rn = INSTR (9, 5);
12816   unsigned rd = INSTR (4, 0);
12817 
12818   aarch64_set_reg_u64
12819     (cpu, rd, NO_SP,
12820      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), ROR,
12821 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12822 }
12823 
12824 /* 64 bit rotate right.  */
12825 static void
12826 rorv64 (sim_cpu *cpu)
12827 {
12828   unsigned rm = INSTR (20, 16);
12829   unsigned rn = INSTR (9, 5);
12830   unsigned rd = INSTR (4, 0);
12831 
12832   aarch64_set_reg_u64
12833     (cpu, rd, NO_SP,
12834      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), ROR,
12835 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12836 }
12837 
12838 
12839 /* Divide.  */
12840 
12841 /* 32 bit signed divide.  */
12842 static void
12843 sdiv32 (sim_cpu *cpu)
12844 {
12845   unsigned rm = INSTR (20, 16);
12846   unsigned rn = INSTR (9, 5);
12847   unsigned rd = INSTR (4, 0);
12848   /* N.B. the pseudo-code does the divide using 64 bit data.  */
12849   /* C99 integer division truncates towards zero, as the architecture requires.  */
12850   int64_t dividend = aarch64_get_reg_s32 (cpu, rn, NO_SP);
12851   int64_t divisor = aarch64_get_reg_s32 (cpu, rm, NO_SP);
12852 
12853   aarch64_set_reg_s64 (cpu, rd, NO_SP,
12854 		       divisor ? ((int32_t) (dividend / divisor)) : 0);
12855 }
12856 
12857 /* 64 bit signed divide.  */
12858 static void
12859 sdiv64 (sim_cpu *cpu)
12860 {
12861   unsigned rm = INSTR (20, 16);
12862   unsigned rn = INSTR (9, 5);
12863   unsigned rd = INSTR (4, 0);
12864 
12865   /* C99 integer division truncates towards zero, as the architecture requires.  */
12866   int64_t divisor = aarch64_get_reg_s64 (cpu, rm, NO_SP);
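  /* Note: for INT64_MIN / -1 the AArch64 SDIV result is defined to
     wrap to INT64_MIN, but the C division below is undefined
     behaviour in that case and may trap on the host.  */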
12867 
12868   aarch64_set_reg_s64
12869     (cpu, rd, NO_SP,
12870      divisor ? (aarch64_get_reg_s64 (cpu, rn, NO_SP) / divisor) : 0);
12871 }
12872 
12873 /* 32 bit unsigned divide.  */
12874 static void
12875 udiv32 (sim_cpu *cpu)
12876 {
12877   unsigned rm = INSTR (20, 16);
12878   unsigned rn = INSTR (9, 5);
12879   unsigned rd = INSTR (4, 0);
12880 
12881   /* N.B. the pseudo-code does the divide using 64 bit data.  */
12882   uint64_t dividend = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12883   uint64_t divisor  = aarch64_get_reg_u32 (cpu, rm, NO_SP);
12884 
12885   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12886 		       divisor ? (uint32_t) (dividend / divisor) : 0);
12887 }
12888 
12889 /* 64 bit unsigned divide.  */
12890 static void
12891 udiv64 (sim_cpu *cpu)
12892 {
12893   unsigned rm = INSTR (20, 16);
12894   unsigned rn = INSTR (9, 5);
12895   unsigned rd = INSTR (4, 0);
12896 
12897   /* Unsigned division truncates towards zero, as required.  */
12898   uint64_t divisor = aarch64_get_reg_u64 (cpu, rm, NO_SP);
12899 
12900   aarch64_set_reg_u64
12901     (cpu, rd, NO_SP,
12902      divisor ? (aarch64_get_reg_u64 (cpu, rn, NO_SP) / divisor) : 0);
12903 }
12904 
12905 static void
12906 dexDataProc2Source (sim_cpu *cpu)
12907 {
12908   /* assert instr[30] == 0
12909      instr[28,21] == 11010110
12910      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12911      instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
12912      instr[15,10] = opcode : 000010 ==> UDIV, 000011 ==> SDIV,
12913                              001000 ==> LSLV, 001001 ==> LSRV
12914                              001010 ==> ASRV, 001011 ==> RORV
12915                              ow ==> UNALLOC.  */
12916 
12917   uint32_t dispatch;
12918   uint32_t S = INSTR (29, 29);
12919   uint32_t opcode = INSTR (15, 10);
12920 
12921   if (S == 1)
12922     HALT_UNALLOC;
12923 
12924   if (opcode & 0x34)
12925     HALT_UNALLOC;
12926 
12927   dispatch = (  (INSTR (31, 31) << 3)
12928 	      | (uimm (opcode, 3, 3) << 2)
12929 	      |  uimm (opcode, 1, 0));
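  /* dispatch = size : opcode[3] : opcode[1,0], so 2-3 are the divides,
     4-7 the variable shifts, and bit 3 selects the 64 bit forms.  */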
12930   switch (dispatch)
12931     {
12932     case 2:  udiv32 (cpu); return;
12933     case 3:  sdiv32 (cpu); return;
12934     case 4:  lslv32 (cpu); return;
12935     case 5:  lsrv32 (cpu); return;
12936     case 6:  asrv32 (cpu); return;
12937     case 7:  rorv32 (cpu); return;
12938     case 10: udiv64 (cpu); return;
12939     case 11: sdiv64 (cpu); return;
12940     case 12: lslv64 (cpu); return;
12941     case 13: lsrv64 (cpu); return;
12942     case 14: asrv64 (cpu); return;
12943     case 15: rorv64 (cpu); return;
12944     default: HALT_UNALLOC;
12945     }
12946 }
12947 
12948 
12949 /* Multiply.  */
12950 
12951 /* 32 bit multiply and add.  */
12952 static void
12953 madd32 (sim_cpu *cpu)
12954 {
12955   unsigned rm = INSTR (20, 16);
12956   unsigned ra = INSTR (14, 10);
12957   unsigned rn = INSTR (9, 5);
12958   unsigned rd = INSTR (4, 0);
12959 
12960   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12961   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12962 		       aarch64_get_reg_u32 (cpu, ra, NO_SP)
12963 		       + aarch64_get_reg_u32 (cpu, rn, NO_SP)
12964 		       * aarch64_get_reg_u32 (cpu, rm, NO_SP));
12965 }
12966 
12967 /* 64 bit multiply and add.  */
12968 static void
12969 madd64 (sim_cpu *cpu)
12970 {
12971   unsigned rm = INSTR (20, 16);
12972   unsigned ra = INSTR (14, 10);
12973   unsigned rn = INSTR (9, 5);
12974   unsigned rd = INSTR (4, 0);
12975 
12976   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12977   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12978 		       aarch64_get_reg_u64 (cpu, ra, NO_SP)
12979 		       + (aarch64_get_reg_u64 (cpu, rn, NO_SP)
12980 			  * aarch64_get_reg_u64 (cpu, rm, NO_SP)));
12981 }
12982 
12983 /* 32 bit multiply and sub.  */
12984 static void
12985 msub32 (sim_cpu *cpu)
12986 {
12987   unsigned rm = INSTR (20, 16);
12988   unsigned ra = INSTR (14, 10);
12989   unsigned rn = INSTR (9, 5);
12990   unsigned rd = INSTR (4, 0);
12991 
12992   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12993   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12994 		       aarch64_get_reg_u32 (cpu, ra, NO_SP)
12995 		       - aarch64_get_reg_u32 (cpu, rn, NO_SP)
12996 		       * aarch64_get_reg_u32 (cpu, rm, NO_SP));
12997 }
12998 
12999 /* 64 bit multiply and sub.  */
13000 static void
13001 msub64 (sim_cpu *cpu)
13002 {
13003   unsigned rm = INSTR (20, 16);
13004   unsigned ra = INSTR (14, 10);
13005   unsigned rn = INSTR (9, 5);
13006   unsigned rd = INSTR (4, 0);
13007 
13008   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13009   aarch64_set_reg_u64 (cpu, rd, NO_SP,
13010 		       aarch64_get_reg_u64 (cpu, ra, NO_SP)
13011 		       - aarch64_get_reg_u64 (cpu, rn, NO_SP)
13012 		       * aarch64_get_reg_u64 (cpu, rm, NO_SP));
13013 }
13014 
13015 /* Signed multiply add long -- source, source2 : 32 bit, source3 : 64 bit.  */
13016 static void
13017 smaddl (sim_cpu *cpu)
13018 {
13019   unsigned rm = INSTR (20, 16);
13020   unsigned ra = INSTR (14, 10);
13021   unsigned rn = INSTR (9, 5);
13022   unsigned rd = INSTR (4, 0);
13023 
13024   /* N.B. we need to multiply the signed 32 bit values in rn, rm to
13025      obtain a 64 bit product.  */
13026   aarch64_set_reg_s64
13027     (cpu, rd, NO_SP,
13028      aarch64_get_reg_s64 (cpu, ra, NO_SP)
13029      + ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
13030      * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP)));
13031 }
13032 
13033 /* Signed multiply sub long -- source, source2 : 32 bit, source3 : 64 bit.  */
13034 static void
13035 smsubl (sim_cpu *cpu)
13036 {
13037   unsigned rm = INSTR (20, 16);
13038   unsigned ra = INSTR (14, 10);
13039   unsigned rn = INSTR (9, 5);
13040   unsigned rd = INSTR (4, 0);
13041 
13042   /* N.B. we need to multiply the signed 32 bit values in rn, rm to
13043      obtain a 64 bit product.  */
13044   aarch64_set_reg_s64
13045     (cpu, rd, NO_SP,
13046      aarch64_get_reg_s64 (cpu, ra, NO_SP)
13047      - ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
13048      * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP)));
13049 }
13050 
13051 /* Integer Multiply/Divide.  */
13052 
13053 /* First some macros and a helper function.  */
13054 /* Macros to test or access elements of 64 bit words.  */
13055 
13056 /* Mask used to access lo 32 bits of 64 bit unsigned int.  */
13057 #define LOW_WORD_MASK ((1ULL << 32) - 1)
13058 /* Return the lo 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int.  */
13059 #define lowWordToU64(_value_u64) ((_value_u64) & LOW_WORD_MASK)
13060 /* Return the hi 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int.  */
13061 #define highWordToU64(_value_u64) ((_value_u64) >> 32)
13062 
13063 /* Offset of sign bit in 64 bit signed integer.  */
13064 #define SIGN_SHIFT_U64 63
13065 /* The sign bit itself -- also identifies the minimum negative int value.  */
13066 #define SIGN_BIT_U64 (1ULL << SIGN_SHIFT_U64)
13067 /* Return true if a 64 bit signed int presented as an unsigned int is the
13068    most negative value.  */
13069 #define isMinimumU64(_value_u64) ((_value_u64) == SIGN_BIT_U64)
13070 /* Return true (non-zero) if a 64 bit signed int presented as an unsigned
13071    int has its sign bit set.  */
13072 #define isSignSetU64(_value_u64) ((_value_u64) & SIGN_BIT_U64)
13073 /* Return 1L or -1L according to whether a 64 bit signed int presented as
13074    an unsigned int has its sign bit set or not.  */
13075 #define signOfU64(_value_u64) (1L + (((_value_u64) >> SIGN_SHIFT_U64) * -2L))
13076 /* Clear the sign bit of a 64 bit signed int presented as an unsigned int.  */
13077 #define clearSignU64(_value_u64) ((_value_u64) &= ~SIGN_BIT_U64)
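
/* Worked example of the word-access macros above (editor's note): with
   _value_u64 == 0x0123456789ABCDEFULL, lowWordToU64 yields 0x89ABCDEF
   and highWordToU64 yields 0x01234567; isSignSetU64 is false since
   bit 63 is clear.  */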
13078 
13079 /* Multiply two 64 bit ints and return
13080    the hi 64 bits of the 128 bit product.  */
13081 
13082 static uint64_t
13083 mul64hi (uint64_t value1, uint64_t value2)
13084 {
13085   uint64_t resultmid1;
13086   uint64_t result;
13087   uint64_t value1_lo = lowWordToU64 (value1);
13088   uint64_t value1_hi = highWordToU64 (value1);
13089   uint64_t value2_lo = lowWordToU64 (value2);
13090   uint64_t value2_hi = highWordToU64 (value2);
13091 
13092   /* Cross-multiply and collect results.  */
13093   uint64_t xproductlo = value1_lo * value2_lo;
13094   uint64_t xproductmid1 = value1_lo * value2_hi;
13095   uint64_t xproductmid2 = value1_hi * value2_lo;
13096   uint64_t xproducthi = value1_hi * value2_hi;
13097   uint64_t carry = 0;
13098   /* Start accumulating 64 bit results.  */
13099   /* Drop bottom half of lowest cross-product.  */
13100   uint64_t resultmid = xproductlo >> 32;
13101   /* Add in middle products.  */
13102   resultmid = resultmid + xproductmid1;
13103 
13104   /* Check for overflow.  */
13105   if (resultmid < xproductmid1)
13106     /* Carry over 1 into top cross-product.  */
13107     carry++;
13108 
13109   resultmid1  = resultmid + xproductmid2;
13110 
13111   /* Check for overflow.  */
13112   if (resultmid1 < xproductmid2)
13113     /* Carry over 1 into top cross-product.  */
13114     carry++;
13115 
13116   /* Drop lowest 32 bits of middle cross-product.  */
13117   result = resultmid1 >> 32;
13118   /* Move carry bit to just above middle cross-product highest bit.  */
13119   carry = carry << 32;
13120 
13121   /* Add in the top cross-product and any carry.  */
13122   result += xproducthi + carry;
13123 
13124   return result;
13125 }
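
/* Editor's sketch of a sanity check for mul64hi; it is not part of the
   simulator, assumes a compiler providing the unsigned __int128
   extension (e.g. GCC or Clang on a 64 bit host), and is guarded out
   of the build.  */
#if 0
#include <assert.h>

static void
check_mul64hi (uint64_t a, uint64_t b)
{
  /* Compute the full 128 bit product directly and require the
     long-multiplication above to agree on the high 64 bits.  */
  unsigned __int128 wide = (unsigned __int128) a * b;
  assert (mul64hi (a, b) == (uint64_t) (wide >> 64));
}
#endif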
13126 
13127 /* Signed multiply high, source, source2 :
13128    64 bit, dest <-- high 64-bit of result.  */
13129 static void
13130 smulh (sim_cpu *cpu)
13131 {
13132   uint64_t uresult;
13133   int64_t  result;
13134   unsigned rm = INSTR (20, 16);
13135   unsigned rn = INSTR (9, 5);
13136   unsigned rd = INSTR (4, 0);
13137   GReg     ra = INSTR (14, 10);
13138   int64_t  value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
13139   int64_t  value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
13140   uint64_t uvalue1;
13141   uint64_t uvalue2;
13142   int  negate = 0;
13143 
13144   if (ra != R31)
13145     HALT_UNALLOC;
13146 
13147   /* Convert to unsigned and use the unsigned mul64hi routine,
13148      then fix the sign up afterwards.  */
13149   if (value1 < 0)
13150     {
13151       negate = !negate;
13152       uvalue1 = -value1;
13153     }
13154   else
13155     {
13156       uvalue1 = value1;
13157     }
13158 
13159   if (value2 < 0)
13160     {
13161       negate = !negate;
13162       uvalue2 = -value2;
13163     }
13164   else
13165     {
13166       uvalue2 = value2;
13167     }
13168 
13169   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13170 
13171   uresult = mul64hi (uvalue1, uvalue2);
13172   result = uresult;
13173 
13174   if (negate)
13175     {
13176       /* Multiply 128-bit result by -1, which means highpart gets inverted,
13177 	 and has carry in added only if low part is 0.  */
13178       result = ~result;
13179       if ((uvalue1 * uvalue2) == 0)
13180 	result += 1;
13181     }
13182 
13183   aarch64_set_reg_s64 (cpu, rd, NO_SP, result);
13184 }
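
/* Editor's sketch checking the negate-and-fix-up scheme above against
   a direct 128 bit signed multiply; assumes the GCC/Clang __int128
   extension and is guarded out of the build.  */
#if 0
#include <assert.h>

static void
check_smulh_fixup (int64_t value1, int64_t value2)
{
  uint64_t u1 = value1 < 0 ? - (uint64_t) value1 : (uint64_t) value1;
  uint64_t u2 = value2 < 0 ? - (uint64_t) value2 : (uint64_t) value2;
  int64_t result = (int64_t) mul64hi (u1, u2);

  if ((value1 < 0) != (value2 < 0))
    /* Negate the 128 bit product: invert the high part, carrying in 1
       only when the low 64 bits are zero.  */
    result = ~result + (u1 * u2 == 0);

  assert (result == (int64_t) (((__int128) value1 * value2) >> 64));
}
#endif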
13185 
13186 /* Unsigned multiply add long -- source, source2 :
13187    32 bit, source3 : 64 bit.  */
13188 static void
13189 umaddl (sim_cpu *cpu)
13190 {
13191   unsigned rm = INSTR (20, 16);
13192   unsigned ra = INSTR (14, 10);
13193   unsigned rn = INSTR (9, 5);
13194   unsigned rd = INSTR (4, 0);
13195 
13196   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13197   /* N.B. we need to multiply the unsigned 32 bit values in rn, rm to
13198      obtain a 64 bit product.  */
13199   aarch64_set_reg_u64
13200     (cpu, rd, NO_SP,
13201      aarch64_get_reg_u64 (cpu, ra, NO_SP)
13202      + ((uint64_t) aarch64_get_reg_u32 (cpu, rn, NO_SP))
13203      * ((uint64_t) aarch64_get_reg_u32 (cpu, rm, NO_SP)));
13204 }
13205 
13206 /* Unsigned multiply sub long -- source, source2 : 32 bit, source3 : 64 bit.  */
13207 static void
13208 umsubl (sim_cpu *cpu)
13209 {
13210   unsigned rm = INSTR (20, 16);
13211   unsigned ra = INSTR (14, 10);
13212   unsigned rn = INSTR (9, 5);
13213   unsigned rd = INSTR (4, 0);
13214 
13215   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13216   /* N.B. we need to multiply the unsigned 32 bit values in rn, rm to
13217      obtain a 64 bit product.  */
13218   aarch64_set_reg_u64
13219     (cpu, rd, NO_SP,
13220      aarch64_get_reg_u64 (cpu, ra, NO_SP)
13221      - ((uint64_t) aarch64_get_reg_u32 (cpu, rn, NO_SP))
13222      * ((uint64_t) aarch64_get_reg_u32 (cpu, rm, NO_SP)));
13223 }
13224 
13225 /* Unsigned multiply high, source, source2 :
13226    64 bit, dest <-- high 64-bit of result.  */
13227 static void
13228 umulh (sim_cpu *cpu)
13229 {
13230   unsigned rm = INSTR (20, 16);
13231   unsigned rn = INSTR (9, 5);
13232   unsigned rd = INSTR (4, 0);
13233   GReg     ra = INSTR (14, 10);
13234 
13235   if (ra != R31)
13236     HALT_UNALLOC;
13237 
13238   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13239   aarch64_set_reg_u64 (cpu, rd, NO_SP,
13240 		       mul64hi (aarch64_get_reg_u64 (cpu, rn, NO_SP),
13241 				aarch64_get_reg_u64 (cpu, rm, NO_SP)));
13242 }
13243 
13244 static void
13245 dexDataProc3Source (sim_cpu *cpu)
13246 {
13247   /* assert instr[28,24] == 11011.  */
13248   /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit (for rd at least)
13249      instr[30,29] = op54 : 00 ==> ok, ow ==> UNALLOC
13250      instr[23,21] = op31 : 111 ==> UNALLOC, ow ==> ok
13251      instr[15] = o0 : 0/1 ==> ok
13252      instr[23,21:15] ==> op : 0000 ==> MADD, 0001 ==> MSUB,     (32/64 bit)
13253                               0010 ==> SMADDL, 0011 ==> SMSUBL, (64 bit only)
13254                               0100 ==> SMULH,                   (64 bit only)
13255                               1010 ==> UMADDL, 1011 ==> UMSUBL, (64 bit only)
13256                               1100 ==> UMULH                    (64 bit only)
13257                               ow ==> UNALLOC.  */
13258 
13259   uint32_t dispatch;
13260   uint32_t size = INSTR (31, 31);
13261   uint32_t op54 = INSTR (30, 29);
13262   uint32_t op31 = INSTR (23, 21);
13263   uint32_t o0 = INSTR (15, 15);
13264 
13265   if (op54 != 0)
13266     HALT_UNALLOC;
13267 
13268   if (size == 0)
13269     {
13270       if (op31 != 0)
13271 	HALT_UNALLOC;
13272 
13273       if (o0 == 0)
13274 	madd32 (cpu);
13275       else
13276 	msub32 (cpu);
13277       return;
13278     }
13279 
13280   dispatch = (op31 << 1) | o0;
13281 
13282   switch (dispatch)
13283     {
13284     case 0:  madd64 (cpu); return;
13285     case 1:  msub64 (cpu); return;
13286     case 2:  smaddl (cpu); return;
13287     case 3:  smsubl (cpu); return;
13288     case 4:  smulh (cpu); return;
13289     case 10: umaddl (cpu); return;
13290     case 11: umsubl (cpu); return;
13291     case 12: umulh (cpu); return;
13292     default: HALT_UNALLOC;
13293     }
13294 }
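
/* Worked example of the dispatch encoding above (editor's note): for
   UMADDL op31 = 101 and o0 = 0, so dispatch = (5 << 1) | 0 = 10; for
   SMSUBL op31 = 001 and o0 = 1, giving dispatch = 3.  */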
13295 
13296 static void
13297 dexDPReg (sim_cpu *cpu)
13298 {
13299   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
13300      assert  group == GROUP_DPREG_0101 || group == GROUP_DPREG_1101
13301      bits [28:24:21] of a DPReg are the secondary dispatch vector.  */
13302   uint32_t group2 = dispatchDPReg (aarch64_get_instr (cpu));
13303 
13304   switch (group2)
13305     {
13306     case DPREG_LOG_000:
13307     case DPREG_LOG_001:
13308       dexLogicalShiftedRegister (cpu); return;
13309 
13310     case DPREG_ADDSHF_010:
13311       dexAddSubtractShiftedRegister (cpu); return;
13312 
13313     case DPREG_ADDEXT_011:
13314       dexAddSubtractExtendedRegister (cpu); return;
13315 
13316     case DPREG_ADDCOND_100:
13317       {
13318 	/* This set bundles a variety of different operations.  */
13319 	/* Check for:  */
13320 	/* 1) add/sub with carry.  */
13321 	uint32_t mask1 = 0x1FE00000U;
13322 	uint32_t val1  = 0x1A000000U;
13323 	/* 2) cond compare register/immediate.  */
13324 	uint32_t mask2 = 0x1FE00000U;
13325 	uint32_t val2  = 0x1A400000U;
13326 	/* 3) cond select.  */
13327 	uint32_t mask3 = 0x1FE00000U;
13328 	uint32_t val3  = 0x1A800000U;
13329 	/* 4) data proc 1/2 source.  */
13330 	uint32_t mask4 = 0x1FE00000U;
13331 	uint32_t val4  = 0x1AC00000U;
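
	/* Editor's note: all four masks select bits [28,21]; the val
	   constants share instr[28,24] = 11010 and differ only in
	   instr[23,21], e.g. 0x1A400000 picks out the cond compare
	   group with instr[23,21] = 010.  */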
13332 
13333 	if ((aarch64_get_instr (cpu) & mask1) == val1)
13334 	  dexAddSubtractWithCarry (cpu);
13335 
13336 	else if ((aarch64_get_instr (cpu) & mask2) == val2)
13337 	  CondCompare (cpu);
13338 
13339 	else if ((aarch64_get_instr (cpu) & mask3) == val3)
13340 	  dexCondSelect (cpu);
13341 
13342 	else if ((aarch64_get_instr (cpu) & mask4) == val4)
13343 	  {
13344 	    /* Bit 30 is clear for data proc 2 source
13345 	       and set for data proc 1 source.  */
13346 	    if (aarch64_get_instr (cpu)  & (1U << 30))
13347 	      dexDataProc1Source (cpu);
13348 	    else
13349 	      dexDataProc2Source (cpu);
13350 	  }
13351 
13352 	else
13353 	  /* Should not reach here.  */
13354 	  HALT_NYI;
13355 
13356 	return;
13357       }
13358 
13359     case DPREG_3SRC_110:
13360       dexDataProc3Source (cpu); return;
13361 
13362     case DPREG_UNALLOC_101:
13363       HALT_UNALLOC;
13364 
13365     case DPREG_3SRC_111:
13366       dexDataProc3Source (cpu); return;
13367 
13368     default:
13369       /* Should never reach here.  */
13370       HALT_NYI;
13371     }
13372 }
13373 
13374 /* Unconditional Branch immediate.
13375    Offset is a PC-relative byte offset in the range +/- 128MiB.
13376    The offset arrives here as a byte offset; the decode routine is
13377    expected to scale the raw word offset from the instruction to bytes.  */
13378 
13379 /* Unconditional branch.  */
13380 static void
13381 buc (sim_cpu *cpu, int32_t offset)
13382 {
13383   aarch64_set_next_PC_by_offset (cpu, offset);
13384 }
13385 
13386 static unsigned stack_depth = 0;
13387 
13388 /* Unconditional branch and link -- writes return PC to LR.  */
13389 static void
13390 bl (sim_cpu *cpu, int32_t offset)
13391 {
13392   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13393   aarch64_save_LR (cpu);
13394   aarch64_set_next_PC_by_offset (cpu, offset);
13395 
13396   if (TRACE_BRANCH_P (cpu))
13397     {
13398       ++ stack_depth;
13399       TRACE_BRANCH (cpu,
13400 		    " %*scall %" PRIx64 " [%s]"
13401 		    " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
13402 		    stack_depth, " ", aarch64_get_next_PC (cpu),
13403 		    aarch64_get_func (CPU_STATE (cpu),
13404 				      aarch64_get_next_PC (cpu)),
13405 		    aarch64_get_reg_u64 (cpu, 0, NO_SP),
13406 		    aarch64_get_reg_u64 (cpu, 1, NO_SP),
13407 		    aarch64_get_reg_u64 (cpu, 2, NO_SP)
13408 		    );
13409     }
13410 }
13411 
13412 /* Unconditional Branch register.
13413    Branch/return address is in source register.  */
13414 
13415 /* Unconditional branch.  */
13416 static void
13417 br (sim_cpu *cpu)
13418 {
13419   unsigned rn = INSTR (9, 5);
13420   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13421   aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13422 }
13423 
13424 /* Unconditional branch and link -- writes return PC to LR.  */
13425 static void
13426 blr (sim_cpu *cpu)
13427 {
13428   /* Ensure we read the destination before we write LR.  */
13429   uint64_t target = aarch64_get_reg_u64 (cpu, INSTR (9, 5), NO_SP);
13430 
13431   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13432   aarch64_save_LR (cpu);
13433   aarch64_set_next_PC (cpu, target);
13434 
13435   if (TRACE_BRANCH_P (cpu))
13436     {
13437       ++ stack_depth;
13438       TRACE_BRANCH (cpu,
13439 		    " %*scall %" PRIx64 " [%s]"
13440 		    " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
13441 		    stack_depth, " ", aarch64_get_next_PC (cpu),
13442 		    aarch64_get_func (CPU_STATE (cpu),
13443 				      aarch64_get_next_PC (cpu)),
13444 		    aarch64_get_reg_u64 (cpu, 0, NO_SP),
13445 		    aarch64_get_reg_u64 (cpu, 1, NO_SP),
13446 		    aarch64_get_reg_u64 (cpu, 2, NO_SP)
13447 		    );
13448     }
13449 }
13450 
13451 /* Return -- the assembler will default the source to LR.  This is
13452    functionally equivalent to br but, presumably, unlike br it side-effects the
13453    branch predictor.  */
13454 static void
13455 ret (sim_cpu *cpu)
13456 {
13457   unsigned rn = INSTR (9, 5);
13458   aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13459 
13460   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13461   if (TRACE_BRANCH_P (cpu))
13462     {
13463       TRACE_BRANCH (cpu,
13464 		    " %*sreturn [result: %" PRIx64 "]",
13465 		    stack_depth, " ", aarch64_get_reg_u64 (cpu, 0, NO_SP));
13466       -- stack_depth;
13467     }
13468 }
13469 
13470 /* NOP -- we implement this and call it from the decode in case we
13471    want to intercept it later.  */
13472 
13473 static void
13474 nop (sim_cpu *cpu)
13475 {
13476   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13477 }
13478 
13479 /* Data synchronization barrier.  */
13480 
13481 static void
13482 dsb (sim_cpu *cpu)
13483 {
13484   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13485 }
13486 
13487 /* Data memory barrier.  */
13488 
13489 static void
13490 dmb (sim_cpu *cpu)
13491 {
13492   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13493 }
13494 
13495 /* Instruction synchronization barrier.  */
13496 
13497 static void
13498 isb (sim_cpu *cpu)
13499 {
13500   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13501 }
13502 
13503 static void
13504 dexBranchImmediate (sim_cpu *cpu)
13505 {
13506   /* assert instr[30,26] == 00101
13507      instr[31] ==> 0 == B, 1 == BL
13508      instr[25,0] == imm26 branch offset counted in words.  */
13509 
13510   uint32_t top = INSTR (31, 31);
13511   /* We have a 26 bit signed word offset which we need to pass to the
13512      execute routine as a signed byte offset.  */
13513   int32_t offset = simm32 (aarch64_get_instr (cpu), 25, 0) << 2;
13514 
13515   if (top)
13516     bl (cpu, offset);
13517   else
13518     buc (cpu, offset);
13519 }
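
/* Editor's sketch of the equivalent offset computation done above via
   the simm32 helper (defined elsewhere in the simulator) plus the << 2
   scaling; it assumes the usual two's-complement conversion and
   arithmetic right shift, and is guarded out of the build.  */
#if 0
static int32_t
branch_byte_offset (uint32_t instr)
{
  /* Move bit 25 of the imm26 field up to bit 31, then shift back down
     arithmetically to sign-extend.  */
  int32_t imm26 = (int32_t) (instr << 6) >> 6;

  /* Scale the word offset to bytes: e.g. B .-4 encodes imm26 = -1,
     yielding a byte offset of -4.  */
  return imm26 * 4;
}
#endif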
13520 
13521 /* Control Flow.  */
13522 
13523 /* Conditional branch
13524 
13525    Offset is a PC-relative byte offset in the range +/- 1MiB.  pos is
13526    a bit position in the range 0 .. 63 (used by the test branches below).
13527 
13528    cc is a CondCode enum value as pulled out of the decode
13529 
13530    N.B. any offset register (source) can only be Xn or Wn.  */
13531 
13532 static void
13533 bcc (sim_cpu *cpu, int32_t offset, CondCode cc)
13534 {
13535   /* The test returns TRUE if CC is met.  */
13536   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13537   if (testConditionCode (cpu, cc))
13538     aarch64_set_next_PC_by_offset (cpu, offset);
13539 }
13540 
13541 /* 32 bit branch on register non-zero.  */
13542 static void
13543 cbnz32 (sim_cpu *cpu, int32_t offset)
13544 {
13545   unsigned rt = INSTR (4, 0);
13546 
13547   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13548   if (aarch64_get_reg_u32 (cpu, rt, NO_SP) != 0)
13549     aarch64_set_next_PC_by_offset (cpu, offset);
13550 }
13551 
13552 /* 64 bit branch on register non-zero.  */
13553 static void
13554 cbnz (sim_cpu *cpu, int32_t offset)
13555 {
13556   unsigned rt = INSTR (4, 0);
13557 
13558   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13559   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) != 0)
13560     aarch64_set_next_PC_by_offset (cpu, offset);
13561 }
13562 
13563 /* 32 bit branch on register zero.  */
13564 static void
13565 cbz32 (sim_cpu *cpu, int32_t offset)
13566 {
13567   unsigned rt = INSTR (4, 0);
13568 
13569   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13570   if (aarch64_get_reg_u32 (cpu, rt, NO_SP) == 0)
13571     aarch64_set_next_PC_by_offset (cpu, offset);
13572 }
13573 
13574 /* 64 bit branch on register zero.  */
13575 static void
13576 cbz (sim_cpu *cpu, int32_t offset)
13577 {
13578   unsigned rt = INSTR (4, 0);
13579 
13580   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13581   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) == 0)
13582     aarch64_set_next_PC_by_offset (cpu, offset);
13583 }
13584 
13585 /* Branch on register bit test non-zero -- one size fits all.  */
13586 static void
13587 tbnz (sim_cpu *cpu, uint32_t  pos, int32_t offset)
13588 {
13589   unsigned rt = INSTR (4, 0);
13590 
13591   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13592   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos))
13593     aarch64_set_next_PC_by_offset (cpu, offset);
13594 }
13595 
13596 /* Branch on register bit test zero -- one size fits all.  */
13597 static void
13598 tbz (sim_cpu *cpu, uint32_t  pos, int32_t offset)
13599 {
13600   unsigned rt = INSTR (4, 0);
13601 
13602   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13603   if (!(aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos)))
13604     aarch64_set_next_PC_by_offset (cpu, offset);
13605 }
13606 
13607 static void
13608 dexCompareBranchImmediate (sim_cpu *cpu)
13609 {
13610   /* instr[30,25] = 01 1010
13611      instr[31]    = size : 0 ==> 32, 1 ==> 64
13612      instr[24]    = op : 0 ==> CBZ, 1 ==> CBNZ
13613      instr[23,5]  = simm19 branch offset counted in words
13614      instr[4,0]   = rt  */
13615 
13616   uint32_t size = INSTR (31, 31);
13617   uint32_t op   = INSTR (24, 24);
13618   int32_t offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
13619 
13620   if (size == 0)
13621     {
13622       if (op == 0)
13623 	cbz32 (cpu, offset);
13624       else
13625 	cbnz32 (cpu, offset);
13626     }
13627   else
13628     {
13629       if (op == 0)
13630 	cbz (cpu, offset);
13631       else
13632 	cbnz (cpu, offset);
13633     }
13634 }
13635 
13636 static void
13637 dexTestBranchImmediate (sim_cpu *cpu)
13638 {
13639   /* instr[31]    = b5 : bit 5 of test bit idx
13640      instr[30,25] = 01 1011
13641      instr[24]    = op : 0 ==> TBZ, 1 ==> TBNZ
13642      instr[23,19] = b40 : bits 4 to 0 of test bit idx
13643      instr[18,5]  = simm14 : signed offset counted in words
13644      instr[4,0]   = uimm5  */
13645 
13646   uint32_t pos = ((INSTR (31, 31) << 5) | INSTR (23, 19));
13647   int32_t offset = simm32 (aarch64_get_instr (cpu), 18, 5) << 2;
13648 
13649   NYI_assert (30, 25, 0x1b);
13650 
13651   if (INSTR (24, 24) == 0)
13652     tbz (cpu, pos, offset);
13653   else
13654     tbnz (cpu, pos, offset);
13655 }
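
/* Worked example of the bit-index assembly above (editor's note):
   TBNZ X3, #37, <label> encodes b5 = 1 and b40 = 0b00101, so
   pos = (1 << 5) | 5 = 37.  */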
13656 
13657 static void
13658 dexCondBranchImmediate (sim_cpu *cpu)
13659 {
13660   /* instr[31,25] = 010 1010
13661      instr[24]    = op1 : op1:op0 == 00 ==> B.cond
13662      instr[23,5]  = simm19 : signed offset counted in words
13663      instr[4]     = op0
13664      instr[3,0]   = cond  */
13665 
13666   int32_t offset;
13667   uint32_t op = ((INSTR (24, 24) << 1) | INSTR (4, 4));
13668 
13669   NYI_assert (31, 25, 0x2a);
13670 
13671   if (op != 0)
13672     HALT_UNALLOC;
13673 
13674   offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
13675 
13676   bcc (cpu, offset, INSTR (3, 0));
13677 }
13678 
13679 static void
13680 dexBranchRegister (sim_cpu *cpu)
13681 {
13682   /* instr[31,25] = 110 1011
13683      instr[24,21] = op : 0 ==> BR, 1 ==> BLR, 2 ==> RET, 4 ==> ERET, 5 ==> DRPS
13684      instr[20,16] = op2 : must be 11111
13685      instr[15,10] = op3 : must be 000000
13686      instr[4,0]   = op4 : must be 11111.  */
13687 
13688   uint32_t op = INSTR (24, 21);
13689   uint32_t op2 = INSTR (20, 16);
13690   uint32_t op3 = INSTR (15, 10);
13691   uint32_t op4 = INSTR (4, 0);
13692 
13693   NYI_assert (31, 25, 0x6b);
13694 
13695   if (op2 != 0x1F || op3 != 0 || op4 != 0)
13696     HALT_UNALLOC;
13697 
13698   if (op == 0)
13699     br (cpu);
13700 
13701   else if (op == 1)
13702     blr (cpu);
13703 
13704   else if (op == 2)
13705     ret (cpu);
13706 
13707   else
13708     {
13709       /* ERET and DRPS accept 0b11111 for rn = instr [4,0];
13710          anything else is unallocated.  */
13711       uint32_t rn = INSTR (4, 0);
13712 
13713       if (rn != 0x1f)
13714 	HALT_UNALLOC;
13715 
13716       if (op == 4 || op == 5)
13717 	HALT_NYI;
13718 
13719       HALT_UNALLOC;
13720     }
13721 }
13722 
13723 /* FIXME: We should get the Angel SWI values from ../../libgloss/aarch64/svc.h
13724    but this may not be available.  So instead we define the values we need
13725    here.  */
13726 #define AngelSVC_Reason_Open		0x01
13727 #define AngelSVC_Reason_Close		0x02
13728 #define AngelSVC_Reason_Write		0x05
13729 #define AngelSVC_Reason_Read		0x06
13730 #define AngelSVC_Reason_IsTTY		0x09
13731 #define AngelSVC_Reason_Seek		0x0A
13732 #define AngelSVC_Reason_FLen		0x0C
13733 #define AngelSVC_Reason_Remove		0x0E
13734 #define AngelSVC_Reason_Rename		0x0F
13735 #define AngelSVC_Reason_Clock		0x10
13736 #define AngelSVC_Reason_Time		0x11
13737 #define AngelSVC_Reason_System		0x12
13738 #define AngelSVC_Reason_Errno		0x13
13739 #define AngelSVC_Reason_GetCmdLine	0x15
13740 #define AngelSVC_Reason_HeapInfo	0x16
13741 #define AngelSVC_Reason_ReportException 0x18
13742 #define AngelSVC_Reason_Elapsed         0x30
13743 
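/* For reference, a hypothetical guest-side helper (editor's sketch
   using GCC/Clang extended asm) showing how a program reaches
   handle_halt below: the reason code goes in W0 and a pointer to the
   parameter block in X1, the trap is HLT #0xF000, and the result
   comes back in X0.  Guarded out of the (host-side) build.  */
#if 0
static inline uint64_t
angel_svc (uint32_t reason, uint64_t parameter_block)
{
  register uint64_t x0 __asm__ ("x0") = reason;
  register uint64_t x1 __asm__ ("x1") = parameter_block;

  __asm__ volatile ("hlt #0xf000" : "+r" (x0) : "r" (x1) : "memory");
  return x0;
}
#endif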
13744 
13745 static void
13746 handle_halt (sim_cpu *cpu, uint32_t val)
13747 {
13748   uint64_t result = 0;
13749 
13750   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13751   if (val != 0xf000)
13752     {
13753       TRACE_SYSCALL (cpu, " HLT [0x%x]", val);
13754       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13755 		       sim_stopped, SIM_SIGTRAP);
13756     }
13757 
13758   /* We have encountered an Angel SVC call.  See if we can process it.  */
13759   switch (aarch64_get_reg_u32 (cpu, 0, NO_SP))
13760     {
13761     case AngelSVC_Reason_HeapInfo:
13762       {
13763 	/* Get the values.  */
13764 	uint64_t stack_top = aarch64_get_stack_start (cpu);
13765 	uint64_t heap_base = aarch64_get_heap_start (cpu);
13766 
13767 	/* Get the pointer  */
13768 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13769 	ptr = aarch64_get_mem_u64 (cpu, ptr);
13770 
13771 	/* Fill in the memory block.  */
13772 	/* Start addr of heap.  */
13773 	aarch64_set_mem_u64 (cpu, ptr +  0, heap_base);
13774 	/* End addr of heap.  */
13775 	aarch64_set_mem_u64 (cpu, ptr +  8, stack_top);
13776 	/* Lowest stack addr.  */
13777 	aarch64_set_mem_u64 (cpu, ptr + 16, heap_base);
13778 	/* Initial stack addr.  */
13779 	aarch64_set_mem_u64 (cpu, ptr + 24, stack_top);
13780 
13781 	TRACE_SYSCALL (cpu, " AngelSVC: Get Heap Info");
13782       }
13783       break;
13784 
13785     case AngelSVC_Reason_Open:
13786       {
13787 	/* Get the pointer  */
13788 	/* uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);  */
13789 	/* FIXME: For now we just assume that we will only be asked
13790 	   to open the standard file descriptors.  */
13791 	static int fd = 0;
13792 	result = fd ++;
13793 
13794 	TRACE_SYSCALL (cpu, " AngelSVC: Open file %d", fd - 1);
13795       }
13796       break;
13797 
13798     case AngelSVC_Reason_Close:
13799       {
13800 	uint64_t fh = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13801 	TRACE_SYSCALL (cpu, " AngelSVC: Close file %d", (int) fh);
13802 	result = 0;
13803       }
13804       break;
13805 
13806     case AngelSVC_Reason_Errno:
13807       result = 0;
13808       TRACE_SYSCALL (cpu, " AngelSVC: Get Errno");
13809       break;
13810 
13811     case AngelSVC_Reason_Clock:
13812       result =
13813 #ifdef CLOCKS_PER_SEC
13814 	(CLOCKS_PER_SEC >= 100)
13815 	? (clock () / (CLOCKS_PER_SEC / 100))
13816 	: ((clock () * 100) / CLOCKS_PER_SEC)
13817 #else
13818 	/* Presume unix... clock() returns microseconds.  */
13819 	(clock () / 10000)
13820 #endif
13821 	;
13822       TRACE_SYSCALL (cpu, " AngelSVC: Get Clock");
13823       break;
13824 
13825     case AngelSVC_Reason_GetCmdLine:
13826       {
13827 	/* Get the pointer  */
13828 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13829 	ptr = aarch64_get_mem_u64 (cpu, ptr);
13830 
13831 	/* FIXME: No command line for now.  */
13832 	aarch64_set_mem_u64 (cpu, ptr, 0);
13833 	TRACE_SYSCALL (cpu, " AngelSVC: Get Command Line");
13834       }
13835       break;
13836 
13837     case AngelSVC_Reason_IsTTY:
13838       result = 1;
13839       TRACE_SYSCALL (cpu, " AngelSVC: IsTTY ?");
13840       break;
13841 
13842     case AngelSVC_Reason_Write:
13843       {
13844 	/* Get the pointer  */
13845 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13846 	/* Get the write control block.  */
13847 	uint64_t fd  = aarch64_get_mem_u64 (cpu, ptr);
13848 	uint64_t buf = aarch64_get_mem_u64 (cpu, ptr + 8);
13849 	uint64_t len = aarch64_get_mem_u64 (cpu, ptr + 16);
13850 
13851 	TRACE_SYSCALL (cpu, "write of %" PRIx64 " bytes from %"
13852 		       PRIx64 " on descriptor %" PRIx64,
13853 		       len, buf, fd);
13854 
13855 	if (len > 1280)
13856 	  {
13857 	    TRACE_SYSCALL (cpu,
13858 			   " AngelSVC: Write: Suspiciously long write: %ld",
13859 			   (long) len);
13860 	    sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13861 			     sim_stopped, SIM_SIGBUS);
13862 	  }
13863 	else if (fd == 1)
13864 	  {
13865 	    printf ("%.*s", (int) len, aarch64_get_mem_ptr (cpu, buf));
13866 	  }
13867 	else if (fd == 2)
13868 	  {
13869 	    TRACE (cpu, 0, "\n");
13870 	    sim_io_eprintf (CPU_STATE (cpu), "%.*s",
13871 			    (int) len, aarch64_get_mem_ptr (cpu, buf));
13872 	    TRACE (cpu, 0, "\n");
13873 	  }
13874 	else
13875 	  {
13876 	    TRACE_SYSCALL (cpu,
13877 			   " AngelSVC: Write: Unexpected file handle: %d",
13878 			   (int) fd);
13879 	    sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13880 			     sim_stopped, SIM_SIGABRT);
13881 	  }
13882       }
13883       break;
13884 
13885     case AngelSVC_Reason_ReportException:
13886       {
13887 	/* Get the pointer  */
13888 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13889 	/* ptr = aarch64_get_mem_u64 (cpu, ptr);  */
13890 	uint64_t type = aarch64_get_mem_u64 (cpu, ptr);
13891 	uint64_t state = aarch64_get_mem_u64 (cpu, ptr + 8);
13892 
13893 	TRACE_SYSCALL (cpu,
13894 		       "Angel Exception: type 0x%" PRIx64 " state %" PRIx64,
13895 		       type, state);
13896 
13897 	if (type == 0x20026)
13898 	  sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13899 			   sim_exited, state);
13900 	else
13901 	  sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13902 			   sim_stopped, SIM_SIGINT);
13903       }
13904       break;
13905 
13906     case AngelSVC_Reason_Read:
13907     case AngelSVC_Reason_FLen:
13908     case AngelSVC_Reason_Seek:
13909     case AngelSVC_Reason_Remove:
13910     case AngelSVC_Reason_Time:
13911     case AngelSVC_Reason_System:
13912     case AngelSVC_Reason_Rename:
13913     case AngelSVC_Reason_Elapsed:
13914     default:
13915       TRACE_SYSCALL (cpu, " HLT [Unknown angel %x]",
13916 		     aarch64_get_reg_u32 (cpu, 0, NO_SP));
13917       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13918 		       sim_stopped, SIM_SIGTRAP);
13919     }
13920 
13921   aarch64_set_reg_u64 (cpu, 0, NO_SP, result);
13922 }
13923 
13924 static void
13925 dexExcpnGen (sim_cpu *cpu)
13926 {
13927   /* instr[31:24] = 11010100
13928      instr[23,21] = opc : 000 ==> GEN EXCPN, 001 ==> BRK
13929                           010 ==> HLT,       101 ==> DBG GEN EXCPN
13930      instr[20,5]  = imm16
13931      instr[4,2]   = opc2 000 ==> OK, ow ==> UNALLOC
13932      instr[1,0]   = LL : discriminates opc  */
13933 
13934   uint32_t opc = INSTR (23, 21);
13935   uint32_t imm16 = INSTR (20, 5);
13936   uint32_t opc2 = INSTR (4, 2);
13937   uint32_t LL;
13938 
13939   NYI_assert (31, 24, 0xd4);
13940 
13941   if (opc2 != 0)
13942     HALT_UNALLOC;
13943 
13944   LL = INSTR (1, 0);
13945 
13946   /* We only implement HLT and BRK for now.  */
13947   if (opc == 1 && LL == 0)
13948     {
13949       TRACE_EVENTS (cpu, " BRK [0x%x]", imm16);
13950       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13951 		       sim_exited, aarch64_get_reg_s32 (cpu, R0, SP_OK));
13952     }
13953 
13954   if (opc == 2 && LL == 0)
13955     handle_halt (cpu, imm16);
13956 
13957   else if (opc == 0 || opc == 5)
13958     HALT_NYI;
13959 
13960   else
13961     HALT_UNALLOC;
13962 }
13963 
13964 /* Stub for accessing system registers.  */
13965 
13966 static uint64_t
13967 system_get (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
13968 	    unsigned crm, unsigned op2)
13969 {
13970   if (crn == 0 && op1 == 3 && crm == 0 && op2 == 7)
13971     /* DCZID_EL0 - the Data Cache Zero ID register.
13972        We do not support DC ZVA at the moment, so
13973        we return a value with the disable bit set.
13974        We implement support for the DCZID register since
13975        it is used by the C library's memset function.  */
13976     return ((uint64_t) 1) << 4;
13977 
13978   if (crn == 0 && op1 == 3 && crm == 0 && op2 == 1)
13979     /* Cache Type Register.  */
13980     return 0x80008000UL;
13981 
13982   if (crn == 13 && op1 == 3 && crm == 0 && op2 == 2)
13983     /* TPIDR_EL0 - thread pointer id.  */
13984     return aarch64_get_thread_id (cpu);
13985 
13986   if (op1 == 3 && crm == 4 && op2 == 0)
13987     return aarch64_get_FPCR (cpu);
13988 
13989   if (op1 == 3 && crm == 4 && op2 == 1)
13990     return aarch64_get_FPSR (cpu);
13991 
13992   else if (op1 == 3 && crm == 2 && op2 == 0)
13993     return aarch64_get_CPSR (cpu);
13994 
13995   HALT_NYI;
13996 }
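
/* Editor's note on the DCZID_EL0 value above: architecturally,
   bits [3:0] give log2 of the DC ZVA block size in words and bit 4
   (DZP), when set, prohibits DC ZVA -- so returning 1 << 4 advertises
   "DC ZVA not supported" to the C library's memset.  */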
13997 
13998 static void
13999 system_set (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
14000 	    unsigned crm, unsigned op2, uint64_t val)
14001 {
14002   if (op1 == 3 && crm == 4 && op2 == 0)
14003     aarch64_set_FPCR (cpu, val);
14004 
14005   else if (op1 == 3 && crm == 4 && op2 == 1)
14006     aarch64_set_FPSR (cpu, val);
14007 
14008   else if (op1 == 3 && crm == 2 && op2 == 0)
14009     aarch64_set_CPSR (cpu, val);
14010 
14011   else
14012     HALT_NYI;
14013 }
14014 
14015 static void
14016 do_mrs (sim_cpu *cpu)
14017 {
14018   /* instr[31:20] = 1101 0101 0011
14019      instr[19]    = op0
14020      instr[18,16] = op1
14021      instr[15,12] = CRn
14022      instr[11,8]  = CRm
14023      instr[7,5]   = op2
14024      instr[4,0]   = Rt  */
14025   unsigned sys_op0 = INSTR (19, 19) + 2;
14026   unsigned sys_op1 = INSTR (18, 16);
14027   unsigned sys_crn = INSTR (15, 12);
14028   unsigned sys_crm = INSTR (11, 8);
14029   unsigned sys_op2 = INSTR (7, 5);
14030   unsigned rt = INSTR (4, 0);
14031 
14032   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
14033   aarch64_set_reg_u64 (cpu, rt, NO_SP,
14034 		       system_get (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2));
14035 }
14036 
14037 static void
14038 do_MSR_immediate (sim_cpu *cpu)
14039 {
14040   /* instr[31:19] = 1101 0101 0000 0
14041      instr[18,16] = op1
14042      instr[15,12] = 0100
14043      instr[11,8]  = CRm
14044      instr[7,5]   = op2
14045      instr[4,0]   = 1 1111  */
14046 
14047   unsigned op1 = INSTR (18, 16);
14048   /*unsigned crm = INSTR (11, 8);*/
14049   unsigned op2 = INSTR (7, 5);
14050 
14051   NYI_assert (31, 19, 0x1AA0);
14052   NYI_assert (15, 12, 0x4);
14053   NYI_assert (4,  0,  0x1F);
14054 
14055   if (op1 == 0)
14056     {
14057       if (op2 == 5)
14058 	HALT_NYI; /* set SPSel.  */
14059       else
14060 	HALT_UNALLOC;
14061     }
14062   else if (op1 == 3)
14063     {
14064       if (op2 == 6)
14065 	HALT_NYI; /* set DAIFset.  */
14066       else if (op2 == 7)
14067 	HALT_NYI; /* set DAIFclr.  */
14068       else
14069 	HALT_UNALLOC;
14070     }
14071   else
14072     HALT_UNALLOC;
14073 }
14074 
14075 static void
14076 do_MSR_reg (sim_cpu *cpu)
14077 {
14078   /* instr[31:20] = 1101 0101 0001
14079      instr[19]    = op0
14080      instr[18,16] = op1
14081      instr[15,12] = CRn
14082      instr[11,8]  = CRm
14083      instr[7,5]   = op2
14084      instr[4,0]   = Rt  */
14085 
14086   unsigned sys_op0 = INSTR (19, 19) + 2;
14087   unsigned sys_op1 = INSTR (18, 16);
14088   unsigned sys_crn = INSTR (15, 12);
14089   unsigned sys_crm = INSTR (11, 8);
14090   unsigned sys_op2 = INSTR (7, 5);
14091   unsigned rt = INSTR (4, 0);
14092 
14093   NYI_assert (31, 20, 0xD51);
14094 
14095   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
14096   system_set (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2,
14097 	      aarch64_get_reg_u64 (cpu, rt, NO_SP));
14098 }
14099 
14100 static void
14101 do_SYS (sim_cpu *cpu)
14102 {
14103   /* instr[31,19] = 1101 0101 0000 1
14104      instr[18,16] = op1
14105      instr[15,12] = CRn
14106      instr[11,8]  = CRm
14107      instr[7,5]   = op2
14108      instr[4,0]   = Rt  */
14109   NYI_assert (31, 19, 0x1AA1);
14110 
14111   /* FIXME: For now we just silently accept system ops.  */
14112 }
14113 
14114 static void
14115 dexSystem (sim_cpu *cpu)
14116 {
14117   /* instr[31:22] = 1101 01010 0
14118      instr[21]    = L
14119      instr[20,19] = op0
14120      instr[18,16] = op1
14121      instr[15,12] = CRn
14122      instr[11,8]  = CRm
14123      instr[7,5]   = op2
14124      instr[4,0]   = uimm5  */
14125 
14126   /* We are interested in HINT, DSB, DMB and ISB
14127 
14128      Hint #0 encodes NOOP (this is the only hint we care about)
14129      L == 0, op0 == 0, op1 = 011, CRn = 0010, Rt = 11111,
14130      we NOP when CRm != 0000 OR (CRm == 0000 AND (op2 == 000 OR op2 > 101))
14131 
14132      DSB, DMB, ISB are data synchronization barrier, data memory barrier and
14133      instruction synchronization barrier, respectively, where
14134 
14135      L == 0, op0 == 0, op1 = 011, CRn = 0011, Rt = 11111,
14136      op2 : DSB ==> 100, DMB ==> 101, ISB ==> 110
14137      CRm<3:2> ==> domain, CRm<1:0> ==> types,
14138      domain : 00 ==> OuterShareable, 01 ==> Nonshareable,
14139               10 ==> InnerShareable, 11 ==> FullSystem
14140      types :  01 ==> Reads, 10 ==> Writes,
14141               11 ==> All, 00 ==> All (domain == FullSystem).  */
14142 
14143   unsigned rt = INSTR (4, 0);
14144 
14145   NYI_assert (31, 22, 0x354);
14146 
14147   switch (INSTR (21, 12))
14148     {
14149     case 0x032:
14150       if (rt == 0x1F)
14151 	{
14152 	  /* NOP has CRm != 0000 OR.  */
14153 	  /*         (CRm == 0000 AND (op2 == 000 OR op2 > 101)).  */
14154 	  uint32_t crm = INSTR (11, 8);
14155 	  uint32_t op2 = INSTR (7, 5);
14156 
14157 	  if (crm != 0 || (op2 == 0 || op2 > 5))
14158 	    {
14159 	      /* Actually call nop method so we can reimplement it later.  */
14160 	      nop (cpu);
14161 	      return;
14162 	    }
14163 	}
14164       HALT_NYI;
14165 
14166     case 0x033:
14167       {
14168 	uint32_t op2 =  INSTR (7, 5);
14169 
14170 	switch (op2)
14171 	  {
14172 	  case 2: HALT_NYI;
14173 	  case 4: dsb (cpu); return;
14174 	  case 5: dmb (cpu); return;
14175 	  case 6: isb (cpu); return;
14176 	  default: HALT_UNALLOC;
14177 	  }
14178       }
14179 
14180     case 0x3B0:
14181     case 0x3B4:
14182     case 0x3BD:
14183       do_mrs (cpu);
14184       return;
14185 
14186     case 0x0B7:
14187       do_SYS (cpu); /* DC is an alias of SYS.  */
14188       return;
14189 
14190     default:
14191       if (INSTR (21, 20) == 0x1)
14192 	do_MSR_reg (cpu);
14193       else if (INSTR (21, 19) == 0 && INSTR (15, 12) == 0x4)
14194 	do_MSR_immediate (cpu);
14195       else
14196 	HALT_NYI;
14197       return;
14198     }
14199 }
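
/* Worked example for the dispatch above (editor's note): DSB SY
   assembles to 0xD5033F9F; INSTR (21, 12) is 0x033 and op2 is 100,
   so decode reaches dsb with CRm = 1111 (FullSystem, All).  */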
14200 
14201 static void
14202 dexBr (sim_cpu *cpu)
14203 {
14204   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
14205      assert  group == GROUP_BREXSYS_1010 || group == GROUP_BREXSYS_1011
14206      bits [31,29] of a BrExSys are the secondary dispatch vector.  */
14207   uint32_t group2 = dispatchBrExSys (aarch64_get_instr (cpu));
14208 
14209   switch (group2)
14210     {
14211     case BR_IMM_000:
14212       return dexBranchImmediate (cpu);
14213 
14214     case BR_IMMCMP_001:
14215       /* Compare has bit 25 clear while test has it set.  */
14216       if (!INSTR (25, 25))
14217 	dexCompareBranchImmediate (cpu);
14218       else
14219 	dexTestBranchImmediate (cpu);
14220       return;
14221 
14222     case BR_IMMCOND_010:
14223       /* This is a conditional branch if bit 25 is clear otherwise
14224          unallocated.  */
14225       if (!INSTR (25, 25))
14226 	dexCondBranchImmediate (cpu);
14227       else
14228 	HALT_UNALLOC;
14229       return;
14230 
14231     case BR_UNALLOC_011:
14232       HALT_UNALLOC;
14233 
14234     case BR_IMM_100:
14235       dexBranchImmediate (cpu);
14236       return;
14237 
14238     case BR_IMMCMP_101:
14239       /* Compare has bit 25 clear while test has it set.  */
14240       if (!INSTR (25, 25))
14241 	dexCompareBranchImmediate (cpu);
14242       else
14243 	dexTestBranchImmediate (cpu);
14244       return;
14245 
14246     case BR_REG_110:
14247       /* Unconditional branch reg has bit 25 set.  */
14248       if (INSTR (25, 25))
14249 	dexBranchRegister (cpu);
14250 
14251       /* This includes both Excpn Gen, System and unalloc operations.
14252          We need to decode the Excpn Gen operation BRK so we can plant
14253          debugger entry points.
14254          Excpn Gen operations have instr [24] = 0.
14255          we need to decode at least one of the System operations NOP
14256          which is an alias for HINT #0.
14257          System operations have instr [24,22] = 100.  */
14258       else if (INSTR (24, 24) == 0)
14259 	dexExcpnGen (cpu);
14260 
14261       else if (INSTR (24, 22) == 4)
14262 	dexSystem (cpu);
14263 
14264       else
14265 	HALT_UNALLOC;
14266 
14267       return;
14268 
14269     case BR_UNALLOC_111:
14270       HALT_UNALLOC;
14271 
14272     default:
14273       /* Should never reach here.  */
14274       HALT_NYI;
14275     }
14276 }
14277 
14278 static void
14279 aarch64_decode_and_execute (sim_cpu *cpu, uint64_t pc)
14280 {
14281   /* We need to check if gdb wants to break in here.  */
14282   /* checkBreak (cpu);  */
14283 
14284   uint64_t group = dispatchGroup (aarch64_get_instr (cpu));
14285 
14286   switch (group)
14287     {
14288     case GROUP_PSEUDO_0000:   dexPseudo (cpu); break;
14289     case GROUP_LDST_0100:     dexLdSt (cpu); break;
14290     case GROUP_DPREG_0101:    dexDPReg (cpu); break;
14291     case GROUP_LDST_0110:     dexLdSt (cpu); break;
14292     case GROUP_ADVSIMD_0111:  dexAdvSIMD0 (cpu); break;
14293     case GROUP_DPIMM_1000:    dexDPImm (cpu); break;
14294     case GROUP_DPIMM_1001:    dexDPImm (cpu); break;
14295     case GROUP_BREXSYS_1010:  dexBr (cpu); break;
14296     case GROUP_BREXSYS_1011:  dexBr (cpu); break;
14297     case GROUP_LDST_1100:     dexLdSt (cpu); break;
14298     case GROUP_DPREG_1101:    dexDPReg (cpu); break;
14299     case GROUP_LDST_1110:     dexLdSt (cpu); break;
14300     case GROUP_ADVSIMD_1111:  dexAdvSIMD1 (cpu); break;
14301 
14302     case GROUP_UNALLOC_0001:
14303     case GROUP_UNALLOC_0010:
14304     case GROUP_UNALLOC_0011:
14305       HALT_UNALLOC;
14306 
14307     default:
14308       /* Should never reach here.  */
14309       HALT_NYI;
14310     }
14311 }
14312 
14313 static bfd_boolean
14314 aarch64_step (sim_cpu *cpu)
14315 {
14316   uint64_t pc = aarch64_get_PC (cpu);
14317 
14318   if (pc == TOP_LEVEL_RETURN_PC)
14319     return FALSE;
14320 
14321   aarch64_set_next_PC (cpu, pc + 4);
14322 
14323   /* Code is always little-endian.  */
14324   sim_core_read_buffer (CPU_STATE (cpu), cpu, read_map,
14325 			& aarch64_get_instr (cpu), pc, 4);
14326   aarch64_get_instr (cpu) = endian_le2h_4 (aarch64_get_instr (cpu));
14327 
14328   TRACE_INSN (cpu, " pc = %" PRIx64 " instr = %08x", pc,
14329 	      aarch64_get_instr (cpu));
14330   TRACE_DISASM (cpu, pc);
14331 
14332   aarch64_decode_and_execute (cpu, pc);
14333 
14334   return TRUE;
14335 }
14336 
14337 void
14338 aarch64_run (SIM_DESC sd)
14339 {
14340   sim_cpu *cpu = STATE_CPU (sd, 0);
14341 
14342   while (aarch64_step (cpu))
14343     {
14344       aarch64_update_PC (cpu);
14345 
14346       if (sim_events_tick (sd))
14347 	sim_events_process (sd);
14348     }
14349 
14350   sim_engine_halt (sd, cpu, NULL, aarch64_get_PC (cpu),
14351 		   sim_exited, aarch64_get_reg_s32 (cpu, R0, NO_SP));
14352 }
14353 
14354 void
14355 aarch64_init (sim_cpu *cpu, uint64_t pc)
14356 {
14357   uint64_t sp = aarch64_get_stack_start (cpu);
14358 
14359   /* Install SP, FP and PC and set LR to -20
14360      so we can detect a top-level return.  */
14361   aarch64_set_reg_u64 (cpu, SP, SP_OK, sp);
14362   aarch64_set_reg_u64 (cpu, FP, SP_OK, sp);
14363   aarch64_set_reg_u64 (cpu, LR, SP_OK, TOP_LEVEL_RETURN_PC);
14364   aarch64_set_next_PC (cpu, pc);
14365   aarch64_update_PC (cpu);
14366   aarch64_init_LIT_table ();
14367 }
14368