/* simulator.c -- Interface for the AArch64 simulator.

   Copyright (C) 2015-2024 Free Software Foundation, Inc.

   Contributed by Red Hat.

   This file is part of GDB.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

/* This must come before any other includes.  */
#include "defs.h"

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <math.h>
#include <time.h>
#include <limits.h>

#include "aarch64-sim.h"
#include "simulator.h"
#include "cpustate.h"
#include "memory.h"

#include "sim-signal.h"

#define NO_SP 0
#define SP_OK 1

#define TST(_flag)   (aarch64_test_CPSR_bit (cpu, _flag))
#define IS_SET(_X)   (TST (( _X )) ? 1 : 0)
#define IS_CLEAR(_X) (TST (( _X )) ? 0 : 1)

/* Space saver macro.  */
#define INSTR(HIGH, LOW) uimm (aarch64_get_instr (cpu), (HIGH), (LOW))

#define HALT_UNALLOC							\
  do									\
    {									\
      TRACE_DISASM (cpu, aarch64_get_PC (cpu));				\
      TRACE_INSN (cpu,							\
		  "Unallocated instruction detected at sim line %d,"	\
		  " exe addr %" PRIx64,					\
		  __LINE__, aarch64_get_PC (cpu));			\
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
		       sim_stopped, SIM_SIGILL);			\
    }									\
  while (0)

#define HALT_NYI							\
  do									\
    {									\
      TRACE_DISASM (cpu, aarch64_get_PC (cpu));				\
      TRACE_INSN (cpu,							\
		  "Unimplemented instruction detected at sim line %d,"	\
		  " exe addr %" PRIx64,					\
		  __LINE__, aarch64_get_PC (cpu));			\
      if (! TRACE_ANY_P (cpu))						\
        sim_io_eprintf (CPU_STATE (cpu), "SIM Error: Unimplemented instruction: %#08x\n", \
                        aarch64_get_instr (cpu));			\
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
		       sim_stopped, SIM_SIGABRT);			\
    }									\
  while (0)

#define NYI_assert(HI, LO, EXPECTED)					\
  do									\
    {									\
      if (INSTR ((HI), (LO)) != (EXPECTED))				\
	HALT_NYI;							\
    }									\
  while (0)

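/* Usage sketch for NYI_assert (editorial; the bit range and expected
   value below are hypothetical, not taken from a real decode routine):
   a decoder that has already dispatched on some fields can assert that
   the remaining fixed opcode bits hold the pattern it assumed, e.g.

     NYI_assert (29, 24, 0x08);   (require instr[29,24] == 0b001000)

   so that an unexpected encoding halts via HALT_NYI instead of being
   silently emulated as something else.  */
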
static uint64_t
expand_logical_immediate (uint32_t s, uint32_t r, uint32_t n)
{
  uint64_t mask;
  uint64_t imm;
  unsigned simd_size;

  /* The immediate value is S+1 bits to 1, left rotated by SIMDsize - R
     (in other words, right rotated by R), then replicated. */
  if (n != 0)
    {
      simd_size = 64;
      mask = 0xffffffffffffffffull;
    }
  else
    {
      switch (s)
	{
	case 0x00 ... 0x1f: /* 0xxxxx */ simd_size = 32;           break;
	case 0x20 ... 0x2f: /* 10xxxx */ simd_size = 16; s &= 0xf; break;
	case 0x30 ... 0x37: /* 110xxx */ simd_size =  8; s &= 0x7; break;
	case 0x38 ... 0x3b: /* 1110xx */ simd_size =  4; s &= 0x3; break;
	case 0x3c ... 0x3d: /* 11110x */ simd_size =  2; s &= 0x1; break;
	default: return 0;
	}
      mask = (1ull << simd_size) - 1;
      /* Top bits are IGNORED.  */
      r &= simd_size - 1;
    }

  /* NOTE: if S = simd_size - 1 we get 0xf..f which is rejected.  */
  if (s == simd_size - 1)
    return 0;

  /* S+1 consecutive bits to 1.  */
  /* NOTE: S can't be 63 due to detection above.  */
  imm = (1ull << (s + 1)) - 1;

  /* Rotate to the left by simd_size - R.  */
  if (r != 0)
    imm = ((imm << (simd_size - r)) & mask) | (imm >> r);

  /* Replicate the value according to SIMD size.  */
  switch (simd_size)
    {
    case  2: imm = (imm <<  2) | imm; ATTRIBUTE_FALLTHROUGH;
    case  4: imm = (imm <<  4) | imm; ATTRIBUTE_FALLTHROUGH;
    case  8: imm = (imm <<  8) | imm; ATTRIBUTE_FALLTHROUGH;
    case 16: imm = (imm << 16) | imm; ATTRIBUTE_FALLTHROUGH;
    case 32: imm = (imm << 32) | imm; ATTRIBUTE_FALLTHROUGH;
    case 64: break;
    default: return 0;
    }

  return imm;
}

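/* Worked example (editorial, not used by the code): the encoding
   N=0, immr=0, imms=0b111100 selects the 2-bit element case, s is
   masked down to 0, so imm starts as the single bit 0b1, no rotation
   is applied, and the replication switch doubles it up:

     0x1 -> 0x5 -> 0x55 -> 0x5555 -> 0x55555555 -> 0x5555555555555555

   i.e. expand_logical_immediate (0x3c, 0, 0) == 0x5555555555555555.  */
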
/* Instr[22,10] encodes N, immr and imms.  We want a lookup table
   for each possible combination, i.e. 13 bits worth of entries.  */
#define  LI_TABLE_SIZE  (1 << 13)
static uint64_t LITable[LI_TABLE_SIZE];

void
aarch64_init_LIT_table (void)
{
  unsigned index;

  for (index = 0; index < LI_TABLE_SIZE; index++)
    {
      uint32_t n    = uimm (index, 12, 12);
      uint32_t immr = uimm (index, 11, 6);
      uint32_t imms = uimm (index, 5, 0);

      LITable [index] = expand_logical_immediate (imms, immr, n);
    }
}

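/* Sketch of the intended lookup (editorial): for a logical immediate
   instruction the N:immr:imms fields occupy instr[22,10], so a decode
   routine can fetch the expanded bitmask directly with

     uint64_t bimm = LITable [INSTR (22, 10)];

   where a zero entry marks an invalid encoding that should be treated
   as unallocated.  */
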
static void
dexNotify (sim_cpu *cpu)
{
  /* instr[14,0] == type : 0 ==> method entry, 1 ==> method reentry
                           2 ==> exit Java, 3 ==> start next bytecode.  */
  uint32_t type = INSTR (14, 0);

  TRACE_EVENTS (cpu, "Notify Insn encountered, type = 0x%x", type);

  switch (type)
    {
    case 0:
      /* aarch64_notifyMethodEntry (aarch64_get_reg_u64 (cpu, R23, 0),
	 aarch64_get_reg_u64 (cpu, R22, 0));  */
      break;
    case 1:
      /* aarch64_notifyMethodReentry (aarch64_get_reg_u64 (cpu, R23, 0),
	 aarch64_get_reg_u64 (cpu, R22, 0));  */
      break;
    case 2:
      /* aarch64_notifyMethodExit ();  */
      break;
    case 3:
      /* aarch64_notifyBCStart (aarch64_get_reg_u64 (cpu, R23, 0),
	 aarch64_get_reg_u64 (cpu, R22, 0));  */
      break;
    }
}

/* Secondary decode within top level groups.  */

static void
dexPseudo (sim_cpu *cpu)
{
  /* assert instr[28,27] = 00

     We provide 2 pseudo instructions:

     HALT stops execution of the simulator causing an immediate
     return to the x86 code which entered it.

     CALLOUT initiates recursive entry into x86 code.  A register
     argument holds the address of the x86 routine.  Immediate
     values in the instruction identify the number of general
     purpose and floating point register arguments to be passed
     and the type of any value to be returned.  */

  uint32_t PSEUDO_HALT      =  0xE0000000U;
  uint32_t PSEUDO_CALLOUT   =  0x00018000U;
  uint32_t PSEUDO_CALLOUTR  =  0x00018001U;
  uint32_t PSEUDO_NOTIFY    =  0x00014000U;
  uint32_t dispatch;

  if (aarch64_get_instr (cpu) == PSEUDO_HALT)
    {
      TRACE_EVENTS (cpu, " Pseudo Halt Instruction");
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
		       sim_stopped, SIM_SIGTRAP);
    }

  dispatch = INSTR (31, 15);

  /* We do not handle callouts at the moment.  */
  if (dispatch == PSEUDO_CALLOUT || dispatch == PSEUDO_CALLOUTR)
    {
      TRACE_EVENTS (cpu, " Callout");
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
		       sim_stopped, SIM_SIGABRT);
    }

  else if (dispatch == PSEUDO_NOTIFY)
    dexNotify (cpu);

  else
    HALT_UNALLOC;
}

/* Load-store single register (unscaled offset)
   These instructions employ a base register plus an unscaled signed
   9 bit offset.

   N.B. the base register (source) can be Xn or SP.  All other
   registers may not be SP.  */

/* 32 bit load 32 bit unscaled signed 9 bit.  */
static void
ldur32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 64 bit load 64 bit unscaled signed 9 bit.  */
static void
ldur64 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 32 bit load zero-extended byte unscaled signed 9 bit.  */
static void
ldurb32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 32 bit load sign-extended byte unscaled signed 9 bit.  */
static void
ldursb32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, (uint32_t) aarch64_get_mem_s8
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 64 bit load sign-extended byte unscaled signed 9 bit.  */
static void
ldursb64 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s8
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 32 bit load zero-extended short unscaled signed 9 bit  */
static void
ldurh32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_mem_u16
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 32 bit load sign-extended short unscaled signed 9 bit  */
static void
ldursh32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) aarch64_get_mem_s16
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 64 bit load sign-extended short unscaled signed 9 bit  */
static void
ldursh64 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s16
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 64 bit load sign-extended word unscaled signed 9 bit  */
static void
ldursw (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) aarch64_get_mem_s32
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* N.B. with stores the value in source is written to the address
   identified by source2 modified by offset.  */

/* 32 bit store 32 bit unscaled signed 9 bit.  */
static void
stur32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u32 (cpu,
		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
		       aarch64_get_reg_u32 (cpu, rd, NO_SP));
}

/* 64 bit store 64 bit unscaled signed 9 bit  */
static void
stur64 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu,
		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
		       aarch64_get_reg_u64 (cpu, rd, NO_SP));
}

/* 32 bit store byte unscaled signed 9 bit  */
static void
sturb (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u8 (cpu,
		      aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
		      aarch64_get_reg_u8 (cpu, rd, NO_SP));
}

/* 32 bit store short unscaled signed 9 bit  */
static void
sturh (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u16 (cpu,
		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
		       aarch64_get_reg_u16 (cpu, rd, NO_SP));
}

/* Load single register pc-relative label.
   Offset is a signed 19 bit immediate count in words.
   rt may not be SP.  */

/* 32 bit pc-relative load  */
static void
ldr32_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP,
		       aarch64_get_mem_u32
		       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* 64 bit pc-relative load  */
static void
ldr_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP,
		       aarch64_get_mem_u64
		       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* sign extended 32 bit pc-relative load  */
static void
ldrsw_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP,
		       aarch64_get_mem_s32
		       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* float pc-relative load  */
static void
fldrs_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned int rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, rd, 0,
		       aarch64_get_mem_u32
		       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* double pc-relative load  */
static void
fldrd_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned int st = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, st, 0,
		       aarch64_get_mem_u64
		       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* long double pc-relative load.  */
static void
fldrq_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned int st = INSTR (4, 0);
  uint64_t addr = aarch64_get_PC (cpu) + offset * 4;
  FRegister a;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_get_mem_long_double (cpu, addr, & a);
  aarch64_set_FP_long_double (cpu, st, a);
}

/* This can be used to scale an offset by applying
   the requisite shift.  The second argument is either
   16, 32, 64 or 128.  */

#define SCALE(_offset, _elementSize) \
    ((_offset) << ScaleShift ## _elementSize)

/* This can be used to optionally scale a register derived offset
   by applying the requisite shift as indicated by the Scaling
   argument.  The second argument is either 16, 32, 64 or 128.
   The third argument is either Scaled or Unscaled.
   N.B. when _Scaling is Scaled the requisite shift is applied,
   while when it is Unscaled the shift amount is zero.  */

#define OPT_SCALE(_offset, _elementType, _Scaling) \
  ((_offset) << (_Scaling ? ScaleShift ## _elementType : 0))

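/* Worked example (editorial; assumes the ScaleShift constants defined
   elsewhere in the simulator follow the element size, e.g.
   ScaleShift32 == 2, and that Scaled/Unscaled are non-zero/zero):
   for a 32 bit access the immediate field holds an element count, so

     SCALE (3, 32)               == 3 << 2 == 12 bytes
     OPT_SCALE (3, 32, Scaled)   == 12
     OPT_SCALE (3, 32, Unscaled) == 3

   which is why the unsigned 12 bit forms wrap their offset in SCALE
   while the register offset forms use OPT_SCALE.  */
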
515    value to a 64 bit value.  the first argument must be the value as
516    a uint32_t and the second must be either UXTW or SXTW. The result
517    is returned as an int64_t.  */
518 
static inline int64_t
extend (uint32_t value, Extension extension)
{
  union
  {
    uint32_t u;
    int32_t   n;
  } x;

  /* A branchless variant of this ought to be possible.  */
  if (extension == UXTW || extension == NoExtension)
    return value;

  x.u = value;
  return x.n;
}

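/* For instance (editorial): extend (0xffffffff, SXTW) reinterprets the
   argument as -1 and returns (int64_t) -1, while
   extend (0xffffffff, UXTW) returns 0x00000000ffffffff.  The union is
   used so the uint32_t bits can be read back as int32_t without the
   implementation-defined behaviour of a direct signed conversion.  */
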
/* Scalar Floating Point

   FP load/store single register (4 addressing modes)

   N.B. the base register (source) can be the stack pointer.
   The secondary source register (source2) can only be an Xn register.  */

/* Load 32 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
fldrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32 (cpu, address));
  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* Load 8 bit with unsigned 12 bit offset.  */
static void
fldrb_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u8 (cpu, rd, 0, aarch64_get_mem_u8 (cpu, addr));
}

/* Load 16 bit scaled unsigned 12 bit.  */
static void
fldrh_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u16 (cpu, rd, 0, aarch64_get_mem_u16 (cpu, addr));
}

/* Load 32 bit scaled unsigned 12 bit.  */
static void
fldrs_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, rd, 0, aarch64_get_mem_u32 (cpu, addr));
}

/* Load 64 bit scaled unsigned 12 bit.  */
static void
fldrd_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr));
}

/* Load 128 bit scaled unsigned 12 bit.  */
static void
fldrq_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr));
  aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_mem_u64 (cpu, addr + 8));
}

/* Load 32 bit scaled or unscaled zero- or sign-extended
   32-bit register offset.  */
static void
fldrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 32, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
		       (cpu, address + displacement));
}

/* Load 64 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
fldrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* Load 64 bit scaled or unscaled zero- or sign-extended 32-bit register offset.  */
static void
fldrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 64, scaling);

  fldrd_wb (cpu, displacement, NoWriteBack);
}

/* Load 128 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
fldrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  FRegister a;
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_get_mem_long_double (cpu, address, & a);
  aarch64_set_FP_long_double (cpu, st, a);

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* Load 128 bit scaled or unscaled zero- or sign-extended 32-bit register offset  */
static void
fldrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 128, scaling);

  fldrq_wb (cpu, displacement, NoWriteBack);
}

/* Memory Access

   load-store single register
   There are four addressing modes available here which all employ a
   64 bit source (base) register.

   N.B. the base register (source) can be the stack pointer.
   The secondary source register (source2) can only be an Xn register.

   Scaled, 12-bit, unsigned immediate offset, without pre- and
   post-index options.
   Unscaled, 9-bit, signed immediate offset with pre- or post-index
   writeback.
   Scaled or unscaled 64-bit register offset.
   Scaled or unscaled 32-bit extended register offset.

   All offsets are assumed to be raw from the decode, i.e. the
   simulator is expected to adjust scaled offsets based on the
   accessed data size.  The same applies to the register and extended
   register offset versions, except that in the latter case the
   operation may also require a sign extend.

   A separate method is provided for each possible addressing mode.  */

/* 32 bit load 32 bit scaled unsigned 12 bit  */
static void
ldr32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ SCALE (offset, 32)));
}

/* 32 bit load 32 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
ldr32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

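/* To illustrate the writeback variants (editorial, in standard AArch64
   assembler syntax rather than anything defined here): with x2 == 0x1000,

     ldr w1, [x2, #4]     loads from 0x1004, x2 unchanged
     ldr w1, [x2, #4]!    (Pre)  loads from 0x1004, sets x2 = 0x1004
     ldr w1, [x2], #4     (Post) loads from 0x1000, sets x2 = 0x1004

   hence the pattern used throughout: add the offset before the access
   unless wb == Post, add it after the access when wb == Post, and
   write the address back unless wb == NoWriteBack.  */
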
/* 32 bit load 32 bit scaled or unscaled
   zero- or sign-extended 32-bit register offset  */
static void
ldr32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement =  OPT_SCALE (extended, 32, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
		       aarch64_get_mem_u32 (cpu, address + displacement));
}

/* 64 bit load 64 bit scaled unsigned 12 bit  */
static void
ldr_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ SCALE (offset, 64)));
}

/* 64 bit load 64 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
ldr_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit load 64 bit scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldr_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement =  OPT_SCALE (extended, 64, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
		       aarch64_get_mem_u64 (cpu, address + displacement));
}

/* 32 bit load zero-extended byte scaled unsigned 12 bit.  */
static void
ldrb32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.
     There is no scaling required for a byte load.  */
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
		       aarch64_get_mem_u8
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
}

/* 32 bit load zero-extended byte unscaled signed 9 bit with pre- or post-writeback.  */
static void
ldrb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit load zero-extended byte scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
				 extension);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* There is no scaling required for a byte load.  */
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
		       aarch64_get_mem_u8 (cpu, address + displacement));
}

/* 64 bit load sign-extended byte unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrsb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;
  int64_t val;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  val = aarch64_get_mem_s8 (cpu, address);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit load sign-extended byte scaled unsigned 12 bit.  */
static void
ldrsb_abs (sim_cpu *cpu, uint32_t offset)
{
  ldrsb_wb (cpu, offset, NoWriteBack);
}

/* 64 bit load sign-extended byte scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrsb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
				 extension);
  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* There is no scaling required for a byte load.  */
  aarch64_set_reg_s64 (cpu, rt, NO_SP,
		       aarch64_get_mem_s8 (cpu, address + displacement));
}

/* 32 bit load zero-extended short scaled unsigned 12 bit.  */
static void
ldrh32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint32_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  val = aarch64_get_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			     + SCALE (offset, 16));
  aarch64_set_reg_u32 (cpu, rt, NO_SP, val);
}

/* 32 bit load zero-extended short unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u32 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit load zero-extended short scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement =  OPT_SCALE (extended, 16, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u32 (cpu, rt, NO_SP,
		       aarch64_get_mem_u16 (cpu, address + displacement));
}

/* 32 bit load sign-extended short scaled unsigned 12 bit.  */
static void
ldrsh32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  int32_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  val = aarch64_get_mem_s16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			     + SCALE (offset, 16));
  aarch64_set_reg_s32 (cpu, rt, NO_SP, val);
}

/* 32 bit load sign-extended short unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrsh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s32 (cpu, rt, NO_SP,
		       (int32_t) aarch64_get_mem_s16 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit load sign-extended short scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrsh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement =  OPT_SCALE (extended, 16, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s32 (cpu, rt, NO_SP,
		       (int32_t) aarch64_get_mem_s16
		       (cpu, address + displacement));
}

/* 64 bit load sign-extended short scaled unsigned 12 bit.  */
static void
ldrsh_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  int64_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  val = aarch64_get_mem_s16  (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			      + SCALE (offset, 16));
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
}

/* 64 bit load sign-extended short unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrsh64_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;
  int64_t val;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  val = aarch64_get_mem_s16 (cpu, address);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit load sign-extended short scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrsh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 16, scaling);
  int64_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  val = aarch64_get_mem_s16 (cpu, address + displacement);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
}

/* 64 bit load sign-extended 32 bit scaled unsigned 12 bit.  */
static void
ldrsw_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  int64_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  val = aarch64_get_mem_s32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			     + SCALE (offset, 32));
  /* The target register may not be SP but the source may be.  */
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
}

/* 64 bit load sign-extended 32 bit unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrsw_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s32 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit load sign-extended 32 bit scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrsw_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement =  OPT_SCALE (extended, 32, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rt, NO_SP,
		       aarch64_get_mem_s32 (cpu, address + displacement));
}

/* N.B. with stores the value in source is written to the
   address identified by source2 modified by source3/offset.  */

/* 32 bit store scaled unsigned 12 bit.  */
static void
str32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  aarch64_set_mem_u32 (cpu, (aarch64_get_reg_u64 (cpu, rn, SP_OK)
			     + SCALE (offset, 32)),
		       aarch64_get_reg_u32 (cpu, rt, NO_SP));
}

/* 32 bit store unscaled signed 9 bit with pre- or post-writeback.  */
static void
str32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u32 (cpu, address, aarch64_get_reg_u32 (cpu, rt, NO_SP));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit store scaled or unscaled zero- or
   sign-extended 32-bit register offset.  */
static void
str32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t  extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 32, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u32 (cpu, address + displacement,
		       aarch64_get_reg_u32 (cpu, rt, NO_SP));
}

/* 64 bit store scaled unsigned 12 bit.  */
static void
str_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu,
		       aarch64_get_reg_u64 (cpu, rn, SP_OK)
		       + SCALE (offset, 64),
		       aarch64_get_reg_u64 (cpu, rt, NO_SP));
}

/* 64 bit store unscaled signed 9 bit with pre- or post-writeback.  */
static void
str_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu, address, aarch64_get_reg_u64 (cpu, rt, NO_SP));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit store scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
str_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
			       extension);
  uint64_t displacement = OPT_SCALE (extended, 64, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu, address + displacement,
		       aarch64_get_reg_u64 (cpu, rt, NO_SP));
}

/* 32 bit store byte scaled unsigned 12 bit.  */
static void
strb_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.
     There is no scaling required for a byte load.  */
  aarch64_set_mem_u8 (cpu,
		      aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
		      aarch64_get_reg_u8 (cpu, rt, NO_SP));
}

/* 32 bit store byte unscaled signed 9 bit with pre- or post-writeback.  */
static void
strb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u8 (cpu, address, aarch64_get_reg_u8 (cpu, rt, NO_SP));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit store byte scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
strb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
				 extension);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* There is no scaling required for a byte load.  */
  aarch64_set_mem_u8 (cpu, address + displacement,
		      aarch64_get_reg_u8 (cpu, rt, NO_SP));
}

/* 32 bit store short scaled unsigned 12 bit.  */
static void
strh_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  aarch64_set_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
		       + SCALE (offset, 16),
		       aarch64_get_reg_u16 (cpu, rt, NO_SP));
}

/* 32 bit store short unscaled signed 9 bit with pre- or post-writeback.  */
static void
strh_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u16 (cpu, address, aarch64_get_reg_u16 (cpu, rt, NO_SP));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit store short scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
strh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement =  OPT_SCALE (extended, 16, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u16 (cpu, address + displacement,
		       aarch64_get_reg_u16 (cpu, rt, NO_SP));
}

/* Prefetch unsigned 12 bit.  */
static void
prfm_abs (sim_cpu *cpu, uint32_t offset)
{
  /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
                          00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
                          00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
                          10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
                          10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
                          10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
                          ow ==> UNALLOC
     PrfOp prfop = prfop (instr, 4, 0);
     uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK)
     + SCALE (offset, 64).  */

  /* TODO : implement prefetch of address.  */
}

/* Prefetch scaled or unscaled zero- or sign-extended 32-bit register offset.  */
static void
prfm_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
                          00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
                          00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
                          10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
                          10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
                          10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
                          ow ==> UNALLOC
     rn may reference SP, rm may only reference ZR
     PrfOp prfop = prfop (instr, 4, 0);
     uint64_t base = aarch64_get_reg_u64 (cpu, rn, SP_OK);
     int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
                                extension);
     uint64_t displacement =  OPT_SCALE (extended, 64, scaling);
     uint64_t address = base + displacement.  */

  /* TODO : implement prefetch of address  */
}

/* 64 bit pc-relative prefetch.  */
static void
prfm_pcrel (sim_cpu *cpu, int32_t offset)
{
  /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
                          00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
                          00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
                          10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
                          10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
                          10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
                          ow ==> UNALLOC
     PrfOp prfop = prfop (instr, 4, 0);
     uint64_t address = aarch64_get_PC (cpu) + offset.  */

  /* TODO : implement this  */
}

/* Load-store exclusive.  */

static void
ldxr (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int size = INSTR (31, 30);
  /* int ordered = INSTR (15, 15);  */
  /* int exclusive = ! INSTR (23, 23);  */

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (size)
    {
    case 0:
      aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address));
      break;
    case 1:
      aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address));
      break;
    case 2:
      aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address));
      break;
    case 3:
      aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address));
      break;
    }
}

static void
stxr (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  unsigned rs = INSTR (20, 16);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int      size = INSTR (31, 30);
  uint64_t data = aarch64_get_reg_u64 (cpu, rt, NO_SP);

  switch (size)
    {
    case 0: aarch64_set_mem_u8 (cpu, address, data); break;
    case 1: aarch64_set_mem_u16 (cpu, address, data); break;
    case 2: aarch64_set_mem_u32 (cpu, address, data); break;
    case 3: aarch64_set_mem_u64 (cpu, address, data); break;
    }

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rs, NO_SP, 0); /* Always exclusive...  */
}

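/* Note (editorial): no exclusive monitor is modelled, so stxr always
   reports success by writing 0 to the status register rs.  A typical
   guest retry loop such as

     retry:
       ldxr  w1, [x0]
       add   w1, w1, #1
       stxr  w2, w1, [x0]
       cbnz  w2, retry

   therefore never iterates, which is adequate for the single-threaded
   programs the simulator runs.  */
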
static void
dexLoadLiteral (sim_cpu *cpu)
{
  /* instr[29,27] == 011
     instr[25,24] == 00
     instr[31,30:26] = opc: 000 ==> LDRW,  001 ==> FLDRS
                            010 ==> LDRX,  011 ==> FLDRD
                            100 ==> LDRSW, 101 ==> FLDRQ
                            110 ==> PRFM, 111 ==> UNALLOC
     instr[26] ==> V : 0 ==> GReg, 1 ==> FReg
     instr[23, 5] == simm19  */

  /* unsigned rt = INSTR (4, 0);  */
  uint32_t dispatch = (INSTR (31, 30) << 1) | INSTR (26, 26);
  int32_t imm = simm32 (aarch64_get_instr (cpu), 23, 5);

  switch (dispatch)
    {
    case 0: ldr32_pcrel (cpu, imm); break;
    case 1: fldrs_pcrel (cpu, imm); break;
    case 2: ldr_pcrel   (cpu, imm); break;
    case 3: fldrd_pcrel (cpu, imm); break;
    case 4: ldrsw_pcrel (cpu, imm); break;
    case 5: fldrq_pcrel (cpu, imm); break;
    case 6: prfm_pcrel  (cpu, imm); break;
    case 7:
    default:
      HALT_UNALLOC;
    }
}

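/* Worked decode (editorial sketch): the word 0x18000041 is
   LDR w1, <label> with imm19 == 2; bits [31,30] are 00 and bit 26 (V)
   is 0, so dispatch == 0 and ldr32_pcrel (cpu, 2) loads a word from
   PC + 8.  The same word with bit 26 set (0x1c000041) dispatches to
   fldrs_pcrel instead.  */
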
/* Immediate arithmetic
   The aimm argument is a 12 bit unsigned value or a 12 bit unsigned
   value left shifted by 12 bits (done at decode).

   N.B. the register args (dest, source) can normally be Xn or SP.
   The exception occurs for flag setting instructions which may
   only use Xn for the output (dest).  */

/* 32 bit add immediate.  */
static void
add32 (sim_cpu *cpu, uint32_t aimm)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, SP_OK,
		       aarch64_get_reg_u32 (cpu, rn, SP_OK) + aimm);
}

/* 64 bit add immediate.  */
static void
add64 (sim_cpu *cpu, uint32_t aimm)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, SP_OK,
		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + aimm);
}

static void
set_flags_for_add32 (sim_cpu *cpu, int32_t value1, int32_t value2)
{
  int32_t   result = value1 + value2;
  int64_t   sresult = (int64_t) value1 + (int64_t) value2;
  uint64_t  uresult = (uint64_t)(uint32_t) value1
    + (uint64_t)(uint32_t) value2;
  uint32_t  flags = 0;

  if (result == 0)
    flags |= Z;

  if (result & (1 << 31))
    flags |= N;

  if (uresult != (uint32_t)uresult)
    flags |= C;

  if (sresult != (int32_t)sresult)
    flags |= V;

  aarch64_set_CPSR (cpu, flags);
}

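/* Example of the flag computation above (editorial): adding 0x7fffffff
   and 1 gives 0x80000000, so N is set (bit 31 of the 32 bit result)
   and V is set (the widened signed sum does not fit in int32_t), while
   C stays clear (the unsigned sum fits in 32 bits).  Adding 0xffffffff
   and 1 instead sets Z and C only.  */
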
1651 #define NEG(a) (((a) & signbit) == signbit)
1652 #define POS(a) (((a) & signbit) == 0)
1653 
1654 static void
1655 set_flags_for_add64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
1656 {
1657   uint64_t result = value1 + value2;
1658   uint32_t flags = 0;
1659   uint64_t signbit = 1ULL << 63;
1660 
1661   if (result == 0)
1662     flags |= Z;
1663 
1664   if (NEG (result))
1665     flags |= N;
1666 
1667   if (   (NEG (value1) && NEG (value2))
1668       || (NEG (value1) && POS (result))
1669       || (NEG (value2) && POS (result)))
1670     flags |= C;
1671 
1672   if (   (NEG (value1) && NEG (value2) && POS (result))
1673       || (POS (value1) && POS (value2) && NEG (result)))
1674     flags |= V;
1675 
1676   aarch64_set_CPSR (cpu, flags);
1677 }
1678 
1679 static void
1680 set_flags_for_sub32 (sim_cpu *cpu, uint32_t value1, uint32_t value2)
1681 {
1682   uint32_t result = value1 - value2;
1683   uint32_t flags = 0;
1684   uint32_t signbit = 1U << 31;
1685 
1686   if (result == 0)
1687     flags |= Z;
1688 
1689   if (NEG (result))
1690     flags |= N;
1691 
1692   if (   (NEG (value1) && POS (value2))
1693       || (NEG (value1) && POS (result))
1694       || (POS (value2) && POS (result)))
1695     flags |= C;
1696 
1697   if (   (NEG (value1) && POS (value2) && POS (result))
1698       || (POS (value1) && NEG (value2) && NEG (result)))
1699     flags |= V;
1700 
1701   aarch64_set_CPSR (cpu, flags);
1702 }
1703 
1704 static void
1705 set_flags_for_sub64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
1706 {
1707   uint64_t result = value1 - value2;
1708   uint32_t flags = 0;
1709   uint64_t signbit = 1ULL << 63;
1710 
1711   if (result == 0)
1712     flags |= Z;
1713 
1714   if (NEG (result))
1715     flags |= N;
1716 
1717   if (   (NEG (value1) && POS (value2))
1718       || (NEG (value1) && POS (result))
1719       || (POS (value2) && POS (result)))
1720     flags |= C;
1721 
1722   if (   (NEG (value1) && POS (value2) && POS (result))
1723       || (POS (value1) && NEG (value2) && NEG (result)))
1724     flags |= V;
1725 
1726   aarch64_set_CPSR (cpu, flags);
1727 }
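
/* Worked example: set_flags_for_sub64 (cpu, 0, 1) produces result
   0xffffffffffffffff, so N is set, while none of the three C clauses
   hold (value1 is positive, the result negative), leaving C clear.
   As on real AArch64, C == 1 after SUBS/CMP means "no borrow".  */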
1728 
1729 static void
1730 set_flags_for_binop32 (sim_cpu *cpu, uint32_t result)
1731 {
1732   uint32_t flags = 0;
1733 
1734   if (result == 0)
1735     flags |= Z;
1736   else
1737     flags &= ~ Z;
1738 
1739   if (result & (1u << 31))
1740     flags |= N;
1741   else
1742     flags &= ~ N;
1743 
1744   aarch64_set_CPSR (cpu, flags);
1745 }
1746 
1747 static void
1748 set_flags_for_binop64 (sim_cpu *cpu, uint64_t result)
1749 {
1750   uint32_t flags = 0;
1751 
1752   if (result == 0)
1753     flags |= Z;
1754   else
1755     flags &= ~ Z;
1756 
1757   if (result & (1ULL << 63))
1758     flags |= N;
1759   else
1760     flags &= ~ N;
1761 
1762   aarch64_set_CPSR (cpu, flags);
1763 }
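
/* N.B. the two binop helpers above derive only N and Z from the
   result; C and V are always left clear, matching the AArch64
   flag-setting logical instructions.  A zero result, for example,
   writes exactly the Z bit into the CPSR.  */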
1764 
1765 /* 32 bit add immediate set flags.  */
1766 static void
1767 adds32 (sim_cpu *cpu, uint32_t aimm)
1768 {
1769   unsigned rn = INSTR (9, 5);
1770   unsigned rd = INSTR (4, 0);
1771   /* TODO : do we need to worry about signs here?  */
1772   int32_t value1 = aarch64_get_reg_s32 (cpu, rn, SP_OK);
1773 
1774   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1775   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + aimm);
1776   set_flags_for_add32 (cpu, value1, aimm);
1777 }
1778 
1779 /* 64 bit add immediate set flags.  */
1780 static void
1781 adds64 (sim_cpu *cpu, uint32_t aimm)
1782 {
1783   unsigned rn = INSTR (9, 5);
1784   unsigned rd = INSTR (4, 0);
1785   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1786   uint64_t value2 = aimm;
1787 
1788   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1789   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1790   set_flags_for_add64 (cpu, value1, value2);
1791 }
1792 
1793 /* 32 bit sub immediate.  */
1794 static void
1795 sub32 (sim_cpu *cpu, uint32_t aimm)
1796 {
1797   unsigned rn = INSTR (9, 5);
1798   unsigned rd = INSTR (4, 0);
1799 
1800   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1801   aarch64_set_reg_u64 (cpu, rd, SP_OK,
1802 		       aarch64_get_reg_u32 (cpu, rn, SP_OK) - aimm);
1803 }
1804 
1805 /* 64 bit sub immediate.  */
1806 static void
1807 sub64 (sim_cpu *cpu, uint32_t aimm)
1808 {
1809   unsigned rn = INSTR (9, 5);
1810   unsigned rd = INSTR (4, 0);
1811 
1812   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1813   aarch64_set_reg_u64 (cpu, rd, SP_OK,
1814 		       aarch64_get_reg_u64 (cpu, rn, SP_OK) - aimm);
1815 }
1816 
1817 /* 32 bit sub immediate set flags.  */
1818 static void
1819 subs32 (sim_cpu *cpu, uint32_t aimm)
1820 {
1821   unsigned rn = INSTR (9, 5);
1822   unsigned rd = INSTR (4, 0);
1823   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
1824   uint32_t value2 = aimm;
1825 
1826   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1827   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
1828   set_flags_for_sub32 (cpu, value1, value2);
1829 }
1830 
1831 /* 64 bit sub immediate set flags.  */
1832 static void
1833 subs64 (sim_cpu *cpu, uint32_t aimm)
1834 {
1835   unsigned rn = INSTR (9, 5);
1836   unsigned rd = INSTR (4, 0);
1837   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1838   uint32_t value2 = aimm;
1839 
1840   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1841   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
1842   set_flags_for_sub64 (cpu, value1, value2);
1843 }
1844 
1845 /* Data Processing Register.  */
1846 
1847 /* First two helpers to perform the shift operations.  */
1848 
1849 static inline uint32_t
1850 shifted32 (uint32_t value, Shift shift, uint32_t count)
1851 {
1852   switch (shift)
1853     {
1854     default:
1855     case LSL:
1856       return (value << count);
1857     case LSR:
1858       return (value >> count);
1859     case ASR:
1860       {
1861 	int32_t svalue = value;
1862 	return (svalue >> count);
1863       }
1864     case ROR:
1865       {
1866 	if (count == 0)
1867 	  return value;	/* Avoid an undefined shift by 32 below.  */
1868 	return ((value << (32 - count)) | (value >> count));
1869       }
1870     }
1871 }
1872 
1873 static inline uint64_t
1874 shifted64 (uint64_t value, Shift shift, uint32_t count)
1875 {
1876   switch (shift)
1877     {
1878     default:
1879     case LSL:
1880       return (value << count);
1881     case LSR:
1882       return (value >> count);
1883     case ASR:
1884       {
1885 	int64_t svalue = value;
1886 	return (svalue >> count);
1887       }
1888     case ROR:
1889       {
1890 	if (count == 0)
1891 	  return value;	/* Avoid an undefined shift by 64 below.  */
1892 	return ((value << (64 - count)) | (value >> count));
1893       }
1894     }
1895 }
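
/* Rotate sketch, with illustrative operands: shifted32 (0x80000001,
   ROR, 1) yields (0x80000001 << 31) | (0x80000001 >> 1), i.e.
   0x80000000 | 0x40000000 == 0xc0000000.  */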
1896 
1897 /* Arithmetic shifted register.
1898    These allow an optional LSL, ASR or LSR to the second source
1899    register with a count up to the register bit count.
1900 
1901    N.B register args may not be SP.  */
1902 
1903 /* 32 bit ADD shifted register.  */
1904 static void
1905 add32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1906 {
1907   unsigned rm = INSTR (20, 16);
1908   unsigned rn = INSTR (9, 5);
1909   unsigned rd = INSTR (4, 0);
1910 
1911   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1912   aarch64_set_reg_u64 (cpu, rd, NO_SP,
1913 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
1914 		       + shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1915 				    shift, count));
1916 }
1917 
1918 /* 64 bit ADD shifted register.  */
1919 static void
1920 add64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1921 {
1922   unsigned rm = INSTR (20, 16);
1923   unsigned rn = INSTR (9, 5);
1924   unsigned rd = INSTR (4, 0);
1925 
1926   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1927   aarch64_set_reg_u64 (cpu, rd, NO_SP,
1928 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
1929 		       + shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
1930 				    shift, count));
1931 }
1932 
1933 /* 32 bit ADD shifted register setting flags.  */
1934 static void
1935 adds32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1936 {
1937   unsigned rm = INSTR (20, 16);
1938   unsigned rn = INSTR (9, 5);
1939   unsigned rd = INSTR (4, 0);
1940 
1941   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
1942   uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1943 			       shift, count);
1944 
1945   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1946   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1947   set_flags_for_add32 (cpu, value1, value2);
1948 }
1949 
1950 /* 64 bit ADD shifted register setting flags.  */
1951 static void
1952 adds64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1953 {
1954   unsigned rm = INSTR (20, 16);
1955   unsigned rn = INSTR (9, 5);
1956   unsigned rd = INSTR (4, 0);
1957 
1958   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
1959   uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
1960 			       shift, count);
1961 
1962   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1963   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1964   set_flags_for_add64 (cpu, value1, value2);
1965 }
1966 
1967 /* 32 bit SUB shifted register.  */
1968 static void
1969 sub32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1970 {
1971   unsigned rm = INSTR (20, 16);
1972   unsigned rn = INSTR (9, 5);
1973   unsigned rd = INSTR (4, 0);
1974 
1975   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1976   aarch64_set_reg_u64 (cpu, rd, NO_SP,
1977 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
1978 		       - shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1979 				    shift, count));
1980 }
1981 
1982 /* 64 bit SUB shifted register.  */
1983 static void
1984 sub64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1985 {
1986   unsigned rm = INSTR (20, 16);
1987   unsigned rn = INSTR (9, 5);
1988   unsigned rd = INSTR (4, 0);
1989 
1990   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1991   aarch64_set_reg_u64 (cpu, rd, NO_SP,
1992 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
1993 		       - shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
1994 				    shift, count));
1995 }
1996 
1997 /* 32 bit SUB shifted register setting flags.  */
1998 static void
1999 subs32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
2000 {
2001   unsigned rm = INSTR (20, 16);
2002   unsigned rn = INSTR (9, 5);
2003   unsigned rd = INSTR (4, 0);
2004 
2005   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2006   uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
2007 			      shift, count);
2008 
2009   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2010   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2011   set_flags_for_sub32 (cpu, value1, value2);
2012 }
2013 
2014 /* 64 bit SUB shifted register setting flags.  */
2015 static void
2016 subs64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
2017 {
2018   unsigned rm = INSTR (20, 16);
2019   unsigned rn = INSTR (9, 5);
2020   unsigned rd = INSTR (4, 0);
2021 
2022   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2023   uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
2024 			       shift, count);
2025 
2026   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2027   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2028   set_flags_for_sub64 (cpu, value1, value2);
2029 }
2030 
2031 /* First a couple more helpers to fetch the
2032    relevant source register element either
2033    sign or zero extended as required by the
2034    extension value.  */
2035 
2036 static uint32_t
2037 extreg32 (sim_cpu *cpu, unsigned int lo, Extension extension)
2038 {
2039   switch (extension)
2040     {
2041     case UXTB: return aarch64_get_reg_u8  (cpu, lo, NO_SP);
2042     case UXTH: return aarch64_get_reg_u16 (cpu, lo, NO_SP);
2043     case UXTW: ATTRIBUTE_FALLTHROUGH;
2044     case UXTX: return aarch64_get_reg_u32 (cpu, lo, NO_SP);
2045     case SXTB: return aarch64_get_reg_s8  (cpu, lo, NO_SP);
2046     case SXTH: return aarch64_get_reg_s16 (cpu, lo, NO_SP);
2047     case SXTW: ATTRIBUTE_FALLTHROUGH;
2048     case SXTX: ATTRIBUTE_FALLTHROUGH;
2049     default:   return aarch64_get_reg_s32 (cpu, lo, NO_SP);
2050   }
2051 }
2052 
2053 static uint64_t
2054 extreg64 (sim_cpu *cpu, unsigned int lo, Extension extension)
2055 {
2056   switch (extension)
2057     {
2058     case UXTB: return aarch64_get_reg_u8  (cpu, lo, NO_SP);
2059     case UXTH: return aarch64_get_reg_u16 (cpu, lo, NO_SP);
2060     case UXTW: return aarch64_get_reg_u32 (cpu, lo, NO_SP);
2061     case UXTX: return aarch64_get_reg_u64 (cpu, lo, NO_SP);
2062     case SXTB: return aarch64_get_reg_s8  (cpu, lo, NO_SP);
2063     case SXTH: return aarch64_get_reg_s16 (cpu, lo, NO_SP);
2064     case SXTW: return aarch64_get_reg_s32 (cpu, lo, NO_SP);
2065     case SXTX:
2066     default:   return aarch64_get_reg_s64 (cpu, lo, NO_SP);
2067     }
2068 }
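
/* For instance, if w1 holds 0x80, extreg32 (cpu, 1, UXTB) yields
   0x00000080 while extreg32 (cpu, 1, SXTB) yields 0xffffff80: the
   low byte is zero- or sign-extended before any shift the caller
   applies.  */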
2069 
2070 /* Arithmetic extending register
2071    These allow an optional sign extension of some portion of the
2072    second source register followed by an optional left shift of
2073    between 1 and 4 bits (i.e. a shift of 0-4 bits???)
2074 
2075    N.B output (dest) and first input arg (source) may normally be Xn
2076    or SP. However, for flag setting operations dest can only be
2077    Xn. Second input registers are always Xn.  */
2078 
2079 /* 32 bit ADD extending register.  */
2080 static void
2081 add32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2082 {
2083   unsigned rm = INSTR (20, 16);
2084   unsigned rn = INSTR (9, 5);
2085   unsigned rd = INSTR (4, 0);
2086 
2087   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2088   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2089 		       aarch64_get_reg_u32 (cpu, rn, SP_OK)
2090 		       + (extreg32 (cpu, rm, extension) << shift));
2091 }
2092 
2093 /* 64 bit ADD extending register.
2094    N.B. This subsumes the case with 64 bit source2 and UXTX #n or LSL #0.  */
2095 static void
2096 add64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2097 {
2098   unsigned rm = INSTR (20, 16);
2099   unsigned rn = INSTR (9, 5);
2100   unsigned rd = INSTR (4, 0);
2101 
2102   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2103   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2104 		       aarch64_get_reg_u64 (cpu, rn, SP_OK)
2105 		       + (extreg64 (cpu, rm, extension) << shift));
2106 }
2107 
2108 /* 32 bit ADD extending register setting flags.  */
2109 static void
2110 adds32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2111 {
2112   unsigned rm = INSTR (20, 16);
2113   unsigned rn = INSTR (9, 5);
2114   unsigned rd = INSTR (4, 0);
2115 
2116   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
2117   uint32_t value2 = extreg32 (cpu, rm, extension) << shift;
2118 
2119   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2120   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
2121   set_flags_for_add32 (cpu, value1, value2);
2122 }
2123 
2124 /* 64 bit ADD extending register setting flags  */
2125 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0  */
2126 static void
2127 adds64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2128 {
2129   unsigned rm = INSTR (20, 16);
2130   unsigned rn = INSTR (9, 5);
2131   unsigned rd = INSTR (4, 0);
2132 
2133   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
2134   uint64_t value2 = extreg64 (cpu, rm, extension) << shift;
2135 
2136   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2137   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
2138   set_flags_for_add64 (cpu, value1, value2);
2139 }
2140 
2141 /* 32 bit SUB extending register.  */
2142 static void
2143 sub32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2144 {
2145   unsigned rm = INSTR (20, 16);
2146   unsigned rn = INSTR (9, 5);
2147   unsigned rd = INSTR (4, 0);
2148 
2149   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2150   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2151 		       aarch64_get_reg_u32 (cpu, rn, SP_OK)
2152 		       - (extreg32 (cpu, rm, extension) << shift));
2153 }
2154 
2155 /* 64 bit SUB extending register.  */
2156 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0.  */
2157 static void
2158 sub64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2159 {
2160   unsigned rm = INSTR (20, 16);
2161   unsigned rn = INSTR (9, 5);
2162   unsigned rd = INSTR (4, 0);
2163 
2164   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2165   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2166 		       aarch64_get_reg_u64 (cpu, rn, SP_OK)
2167 		       - (extreg64 (cpu, rm, extension) << shift));
2168 }
2169 
2170 /* 32 bit SUB extending register setting flags.  */
2171 static void
2172 subs32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2173 {
2174   unsigned rm = INSTR (20, 16);
2175   unsigned rn = INSTR (9, 5);
2176   unsigned rd = INSTR (4, 0);
2177 
2178   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
2179   uint32_t value2 = extreg32 (cpu, rm, extension) << shift;
2180 
2181   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2182   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2183   set_flags_for_sub32 (cpu, value1, value2);
2184 }
2185 
2186 /* 64 bit SUB extending register setting flags  */
2187 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0  */
2188 static void
2189 subs64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2190 {
2191   unsigned rm = INSTR (20, 16);
2192   unsigned rn = INSTR (9, 5);
2193   unsigned rd = INSTR (4, 0);
2194 
2195   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
2196   uint64_t value2 = extreg64 (cpu, rm, extension) << shift;
2197 
2198   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2199   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2200   set_flags_for_sub64 (cpu, value1, value2);
2201 }
2202 
2203 static void
2204 dexAddSubtractImmediate (sim_cpu *cpu)
2205 {
2206   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2207      instr[30]    = op : 0 ==> ADD, 1 ==> SUB
2208      instr[29]    = set : 0 ==> no flags, 1 ==> set flags
2209      instr[28,24] = 10001
2210      instr[23,22] = shift : 00 ==> LSL#0, 01 ==> LSL#12, 1x ==> UNALLOC
2211      instr[21,10] = uimm12
2212      instr[9,5]   = Rn
2213      instr[4,0]   = Rd  */
2214 
2215   /* N.B. the shift is applied at decode before calling the add/sub routine.  */
2216   uint32_t shift = INSTR (23, 22);
2217   uint32_t imm = INSTR (21, 10);
2218   uint32_t dispatch = INSTR (31, 29);
2219 
2220   NYI_assert (28, 24, 0x11);
2221 
2222   if (shift > 1)
2223     HALT_UNALLOC;
2224 
2225   if (shift)
2226     imm <<= 12;
2227 
2228   switch (dispatch)
2229     {
2230     case 0: add32 (cpu, imm); break;
2231     case 1: adds32 (cpu, imm); break;
2232     case 2: sub32 (cpu, imm); break;
2233     case 3: subs32 (cpu, imm); break;
2234     case 4: add64 (cpu, imm); break;
2235     case 5: adds64 (cpu, imm); break;
2236     case 6: sub64 (cpu, imm); break;
2237     case 7: subs64 (cpu, imm); break;
2238     }
2239 }
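
/* Decode sketch (illustrative instruction): ADD x1, x2, #4096 is
   encoded with instr[31,29] = 100, shift = 01 and uimm12 = 1, so the
   code above rewrites imm as 1 << 12 == 4096 and dispatches through
   case 4 to add64.  */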
2240 
2241 static void
2242 dexAddSubtractShiftedRegister (sim_cpu *cpu)
2243 {
2244   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2245      instr[30,29] = op : 00 ==> ADD, 01 ==> ADDS, 10 ==> SUB, 11 ==> SUBS
2246      instr[28,24] = 01011
2247      instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> UNALLOC
2248      instr[21]    = 0
2249      instr[20,16] = Rm
2250      instr[15,10] = count : must be 0xxxxx for 32 bit
2251      instr[9,5]   = Rn
2252      instr[4,0]   = Rd  */
2253 
2254   uint32_t size = INSTR (31, 31);
2255   uint32_t count = INSTR (15, 10);
2256   Shift shiftType = INSTR (23, 22);
2257 
2258   NYI_assert (28, 24, 0x0B);
2259   NYI_assert (21, 21, 0);
2260 
2261   /* Shift encoded as ROR is unallocated.  */
2262   if (shiftType == ROR)
2263     HALT_UNALLOC;
2264 
2265   /* 32 bit operations must have count[5] = 0
2266      or else we have an UNALLOC.  */
2267   if (size == 0 && uimm (count, 5, 5))
2268     HALT_UNALLOC;
2269 
2270   /* Dispatch on size:op, i.e. instr[31,29].  */
2271   switch (INSTR (31, 29))
2272     {
2273     case 0: add32_shift  (cpu, shiftType, count); break;
2274     case 1: adds32_shift (cpu, shiftType, count); break;
2275     case 2: sub32_shift  (cpu, shiftType, count); break;
2276     case 3: subs32_shift (cpu, shiftType, count); break;
2277     case 4: add64_shift  (cpu, shiftType, count); break;
2278     case 5: adds64_shift (cpu, shiftType, count); break;
2279     case 6: sub64_shift  (cpu, shiftType, count); break;
2280     case 7: subs64_shift (cpu, shiftType, count); break;
2281     }
2282 }
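
/* E.g. SUBS w0, w1, w2, LSR #3 has instr[31,29] = 011 and so reaches
   subs32_shift with shiftType LSR and count 3.  Had count[5] been set
   on this 32-bit operation, the check above would have halted with
   HALT_UNALLOC instead.  */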
2283 
2284 static void
2285 dexAddSubtractExtendedRegister (sim_cpu *cpu)
2286 {
2287   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2288      instr[30]    = op : 0 ==> ADD, 1 ==> SUB
2289      instr[29]    = set? : 0 ==> no flags, 1 ==> set flags
2290      instr[28,24] = 01011
2291      instr[23,22] = opt : 0 ==> ok, 1,2,3 ==> UNALLOC
2292      instr[21]    = 1
2293      instr[20,16] = Rm
2294      instr[15,13] = option : 000 ==> UXTB, 001 ==> UXTH,
2295                              010 ==> LSL|UXTW, 011 ==> UXTX,
2296                              100 ==> SXTB, 101 ==> SXTH,
2297                              110 ==> SXTW, 111 ==> SXTX,
2298      instr[12,10] = shift : 0,1,2,3,4 ==> ok, 5,6,7 ==> UNALLOC
2299      instr[9,5]   = Rn
2300      instr[4,0]   = Rd  */
2301 
2302   Extension extensionType = INSTR (15, 13);
2303   uint32_t shift = INSTR (12, 10);
2304 
2305   NYI_assert (28, 24, 0x0B);
2306   NYI_assert (21, 21, 1);
2307 
2308   /* Shift may not exceed 4.  */
2309   if (shift > 4)
2310     HALT_UNALLOC;
2311 
2312   /* Dispatch on size:op:set?.  */
2313   switch (INSTR (31, 29))
2314     {
2315     case 0: add32_ext  (cpu, extensionType, shift); break;
2316     case 1: adds32_ext (cpu, extensionType, shift); break;
2317     case 2: sub32_ext  (cpu, extensionType, shift); break;
2318     case 3: subs32_ext (cpu, extensionType, shift); break;
2319     case 4: add64_ext  (cpu, extensionType, shift); break;
2320     case 5: adds64_ext (cpu, extensionType, shift); break;
2321     case 6: sub64_ext  (cpu, extensionType, shift); break;
2322     case 7: subs64_ext (cpu, extensionType, shift); break;
2323     }
2324 }
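
/* E.g. ADD x0, sp, w1, UXTW #2 dispatches through case 4 to
   add64_ext, which zero-extends w1 to 64 bits, shifts it left by
   two bits and adds it to the stack pointer.  */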
2325 
2326 /* Conditional data processing
2327    Condition register is implicit 3rd source.  */
2328 
2329 /* 32 bit add with carry.  */
2330 /* N.B register args may not be SP.  */
2331 
2332 static void
2333 adc32 (sim_cpu *cpu)
2334 {
2335   unsigned rm = INSTR (20, 16);
2336   unsigned rn = INSTR (9, 5);
2337   unsigned rd = INSTR (4, 0);
2338 
2339   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2340   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2341 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
2342 		       + aarch64_get_reg_u32 (cpu, rm, NO_SP)
2343 		       + IS_SET (C));
2344 }
2345 
2346 /* 64 bit add with carry  */
2347 static void
2348 adc64 (sim_cpu *cpu)
2349 {
2350   unsigned rm = INSTR (20, 16);
2351   unsigned rn = INSTR (9, 5);
2352   unsigned rd = INSTR (4, 0);
2353 
2354   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2355   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2356 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
2357 		       + aarch64_get_reg_u64 (cpu, rm, NO_SP)
2358 		       + IS_SET (C));
2359 }
2360 
2361 /* 32 bit add with carry setting flags.  */
2362 static void
2363 adcs32 (sim_cpu *cpu)
2364 {
2365   unsigned rm = INSTR (20, 16);
2366   unsigned rn = INSTR (9, 5);
2367   unsigned rd = INSTR (4, 0);
2368 
2369   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2370   uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
2371   uint32_t carry = IS_SET (C);
2372 
2373   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2374   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry);
2375   set_flags_for_add32 (cpu, value1, value2 + carry);
2376 }
2377 
2378 /* 64 bit add with carry setting flags.  */
2379 static void
2380 adcs64 (sim_cpu *cpu)
2381 {
2382   unsigned rm = INSTR (20, 16);
2383   unsigned rn = INSTR (9, 5);
2384   unsigned rd = INSTR (4, 0);
2385 
2386   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2387   uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
2388   uint64_t carry = IS_SET (C);
2389 
2390   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2391   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry);
2392   set_flags_for_add64 (cpu, value1, value2 + carry);
2393 }
2394 
2395 /* 32 bit sub with carry.  */
2396 static void
2397 sbc32 (sim_cpu *cpu)
2398 {
2399   unsigned rm = INSTR (20, 16);
2400   unsigned rn = INSTR (9, 5); /* ngc iff rn == 31.  */
2401   unsigned rd = INSTR (4, 0);
2402 
2403   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2404   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2405 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
2406 		       - aarch64_get_reg_u32 (cpu, rm, NO_SP)
2407 		       - 1 + IS_SET (C));
2408 }
2409 
2410 /* 64 bit sub with carry  */
2411 static void
2412 sbc64 (sim_cpu *cpu)
2413 {
2414   unsigned rm = INSTR (20, 16);
2415   unsigned rn = INSTR (9, 5);
2416   unsigned rd = INSTR (4, 0);
2417 
2418   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2419   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2420 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
2421 		       - aarch64_get_reg_u64 (cpu, rm, NO_SP)
2422 		       - 1 + IS_SET (C));
2423 }
2424 
2425 /* 32 bit sub with carry setting flags  */
2426 static void
2427 sbcs32 (sim_cpu *cpu)
2428 {
2429   unsigned rm = INSTR (20, 16);
2430   unsigned rn = INSTR (9, 5);
2431   unsigned rd = INSTR (4, 0);
2432 
2433   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2434   uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
2435   uint32_t carry  = IS_SET (C);
2436   uint32_t result = value1 - value2 - 1 + carry;
2437 
2438   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2439   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
2440   set_flags_for_sub32 (cpu, value1, value2 + 1 - carry);
2441 }
2442 
2443 /* 64 bit sub with carry setting flags  */
2444 static void
2445 sbcs64 (sim_cpu *cpu)
2446 {
2447   unsigned rm = INSTR (20, 16);
2448   unsigned rn = INSTR (9, 5);
2449   unsigned rd = INSTR (4, 0);
2450 
2451   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2452   uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
2453   uint64_t carry  = IS_SET (C);
2454   uint64_t result = value1 - value2 - 1 + carry;
2455 
2456   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2457   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
2458   set_flags_for_sub64 (cpu, value1, value2 + 1 - carry);
2459 }
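
/* The identity used by the four SBC routines above: SBC computes
   Rn - Rm - NOT(C), i.e. value1 - value2 - 1 + carry, so the flag
   helpers are handed value2 + 1 - carry as the effective subtrahend.
   With C set, sbcs32 on value1 = 5 and value2 = 3 simply yields 2.  */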
2460 
2461 static void
2462 dexAddSubtractWithCarry (sim_cpu *cpu)
2463 {
2464   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2465      instr[30]    = op : 0 ==> ADC, 1 ==> SBC
2466      instr[29]    = set? : 0 ==> no flags, 1 ==> set flags
2467      instr[28,21] = 1 1010 000
2468      instr[20,16] = Rm
2469      instr[15,10] = op2 : 00000 ==> ok, ow ==> UNALLOC
2470      instr[9,5]   = Rn
2471      instr[4,0]   = Rd  */
2472 
2473   uint32_t op2 = INSTR (15, 10);
2474 
2475   NYI_assert (28, 21, 0xD0);
2476 
2477   if (op2 != 0)
2478     HALT_UNALLOC;
2479 
2480   /* Dispatch on size:op:set?.  */
2481   switch (INSTR (31, 29))
2482     {
2483     case 0: adc32 (cpu); break;
2484     case 1: adcs32 (cpu); break;
2485     case 2: sbc32 (cpu); break;
2486     case 3: sbcs32 (cpu); break;
2487     case 4: adc64 (cpu); break;
2488     case 5: adcs64 (cpu); break;
2489     case 6: sbc64 (cpu); break;
2490     case 7: sbcs64 (cpu); break;
2491     }
2492 }
2493 
2494 static uint32_t
2495 testConditionCode (sim_cpu *cpu, CondCode cc)
2496 {
2497   /* This should be reducible to branchless logic
2498      by some careful testing of bits in CC followed
2499      by the requisite masking and combining of bits
2500      from the flag register.
2501 
2502      For now we do it with a switch.  */
2503   int res;
2504 
2505   switch (cc)
2506     {
2507     case EQ:  res = IS_SET (Z);    break;
2508     case NE:  res = IS_CLEAR (Z);  break;
2509     case CS:  res = IS_SET (C);    break;
2510     case CC:  res = IS_CLEAR (C);  break;
2511     case MI:  res = IS_SET (N);    break;
2512     case PL:  res = IS_CLEAR (N);  break;
2513     case VS:  res = IS_SET (V);    break;
2514     case VC:  res = IS_CLEAR (V);  break;
2515     case HI:  res = IS_SET (C) && IS_CLEAR (Z);  break;
2516     case LS:  res = IS_CLEAR (C) || IS_SET (Z);  break;
2517     case GE:  res = IS_SET (N) == IS_SET (V);    break;
2518     case LT:  res = IS_SET (N) != IS_SET (V);    break;
2519     case GT:  res = IS_CLEAR (Z) && (IS_SET (N) == IS_SET (V));  break;
2520     case LE:  res = IS_SET (Z) || (IS_SET (N) != IS_SET (V));    break;
2521     case AL:
2522     case NV:
2523     default:
2524       res = 1;
2525       break;
2526     }
2527   return res;
2528 }
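
/* For example, after comparing equal operands Z is set, so EQ yields
   1 and NE yields 0; the signed orderings rely on the N == V identity
   visible in the GE, LT, GT and LE cases above.  */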
2529 
2530 static void
2531 CondCompare (sim_cpu *cpu) /* aka: ccmp and ccmn  */
2532 {
2533   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2534      instr[30]    = compare with positive (1) or negative value (0)
2535      instr[29,21] = 1 1101 0010
2536      instr[20,16] = Rm or const
2537      instr[15,12] = cond
2538      instr[11]    = compare reg (0) or const (1)
2539      instr[10]    = 0
2540      instr[9,5]   = Rn
2541      instr[4]     = 0
2542      instr[3,0]   = value for CPSR bits if the comparison does not take place.  */
2543   signed int negate;
2544   unsigned rm;
2545   unsigned rn;
2546 
2547   NYI_assert (29, 21, 0x1d2);
2548   NYI_assert (10, 10, 0);
2549   NYI_assert (4, 4, 0);
2550 
2551   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2552   if (! testConditionCode (cpu, INSTR (15, 12)))
2553     {
2554       aarch64_set_CPSR (cpu, INSTR (3, 0));
2555       return;
2556     }
2557 
2558   negate = INSTR (30, 30) ? 1 : -1;
2559   rm = INSTR (20, 16);
2560   rn = INSTR ( 9,  5);
2561 
2562   if (INSTR (31, 31))
2563     {
2564       if (INSTR (11, 11))
2565 	set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK),
2566 			     negate * (uint64_t) rm);
2567       else
2568 	set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK),
2569 			     negate * aarch64_get_reg_u64 (cpu, rm, SP_OK));
2570     }
2571   else
2572     {
2573       if (INSTR (11, 11))
2574 	set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK),
2575 			     negate * rm);
2576       else
2577 	set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK),
2578 			     negate * aarch64_get_reg_u32 (cpu, rm, SP_OK));
2579     }
2580 }
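
/* Sketch with illustrative operands: CCMP x1, #3, #0xb, EQ performs
   the compare (setting NZCV from x1 - 3) only when EQ holds;
   otherwise the code above just writes the literal NZCV nibble 0xb
   from instr[3,0] into the CPSR.  */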
2581 
2582 static void
2583 do_vec_MOV_whole_vector (sim_cpu *cpu)
2584 {
2585   /* MOV Vd.T, Vs.T  (alias for ORR Vd.T, Vn.T, Vm.T where Vn == Vm)
2586 
2587      instr[31]    = 0
2588      instr[30]    = half(0)/full(1)
2589      instr[29,21] = 001110101
2590      instr[20,16] = Vs
2591      instr[15,10] = 000111
2592      instr[9,5]   = Vs
2593      instr[4,0]   = Vd  */
2594 
2595   unsigned vs = INSTR (9, 5);
2596   unsigned vd = INSTR (4, 0);
2597 
2598   NYI_assert (29, 21, 0x075);
2599   NYI_assert (15, 10, 0x07);
2600 
2601   if (INSTR (20, 16) != vs)
2602     HALT_NYI;
2603 
2604   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2605   if (INSTR (30, 30))
2606     aarch64_set_vec_u64 (cpu, vd, 1, aarch64_get_vec_u64 (cpu, vs, 1));
2607 
2608   aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vs, 0));
2609 }
2610 
2611 static void
2612 do_vec_SMOV_into_scalar (sim_cpu *cpu)
2613 {
2614   /* instr[31]    = 0
2615      instr[30]    = word(0)/long(1)
2616      instr[29,21] = 00 1110 000
2617      instr[20,16] = element size and index
2618      instr[15,10] = 00 0010 11
2619      instr[9,5]   = V source
2620      instr[4,0]   = R dest  */
2621 
2622   unsigned vs = INSTR (9, 5);
2623   unsigned rd = INSTR (4, 0);
2624   unsigned imm5 = INSTR (20, 16);
2625   unsigned full = INSTR (30, 30);
2626   int size, index;
2627 
2628   NYI_assert (29, 21, 0x070);
2629   NYI_assert (15, 10, 0x0B);
2630 
2631   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2632 
2633   if (imm5 & 0x1)
2634     {
2635       size = 0;
2636       index = (imm5 >> 1) & 0xF;
2637     }
2638   else if (imm5 & 0x2)
2639     {
2640       size = 1;
2641       index = (imm5 >> 2) & 0x7;
2642     }
2643   else if (full && (imm5 & 0x4))
2644     {
2645       size = 2;
2646       index = (imm5 >> 3) & 0x3;
2647     }
2648   else
2649     HALT_UNALLOC;
2650 
2651   switch (size)
2652     {
2653     case 0:
2654       if (full)
2655 	aarch64_set_reg_s64 (cpu, rd, NO_SP,
2656 			     aarch64_get_vec_s8 (cpu, vs, index));
2657       else
2658 	aarch64_set_reg_s32 (cpu, rd, NO_SP,
2659 			     aarch64_get_vec_s8 (cpu, vs, index));
2660       break;
2661 
2662     case 1:
2663       if (full)
2664 	aarch64_set_reg_s64 (cpu, rd, NO_SP,
2665 			     aarch64_get_vec_s16 (cpu, vs, index));
2666       else
2667 	aarch64_set_reg_s32 (cpu, rd, NO_SP,
2668 			     aarch64_get_vec_s16 (cpu, vs, index));
2669       break;
2670 
2671     case 2:
2672       aarch64_set_reg_s64 (cpu, rd, NO_SP,
2673 			   aarch64_get_vec_s32 (cpu, vs, index));
2674       break;
2675 
2676     default:
2677       HALT_UNALLOC;
2678     }
2679 }
2680 
2681 static void
2682 do_vec_UMOV_into_scalar (sim_cpu *cpu)
2683 {
2684   /* instr[31]    = 0
2685      instr[30]    = word(0)/long(1)
2686      instr[29,21] = 00 1110 000
2687      instr[20,16] = element size and index
2688      instr[15,10] = 00 0011 11
2689      instr[9,5]   = V source
2690      instr[4,0]   = R dest  */
2691 
2692   unsigned vs = INSTR (9, 5);
2693   unsigned rd = INSTR (4, 0);
2694   unsigned imm5 = INSTR (20, 16);
2695   unsigned full = INSTR (30, 30);
2696   int size, index;
2697 
2698   NYI_assert (29, 21, 0x070);
2699   NYI_assert (15, 10, 0x0F);
2700 
2701   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2702 
2703   if (!full)
2704     {
2705       if (imm5 & 0x1)
2706 	{
2707 	  size = 0;
2708 	  index = (imm5 >> 1) & 0xF;
2709 	}
2710       else if (imm5 & 0x2)
2711 	{
2712 	  size = 1;
2713 	  index = (imm5 >> 2) & 0x7;
2714 	}
2715       else if (imm5 & 0x4)
2716 	{
2717 	  size = 2;
2718 	  index = (imm5 >> 3) & 0x3;
2719 	}
2720       else
2721 	HALT_UNALLOC;
2722     }
2723   else if (imm5 & 0x8)
2724     {
2725       size = 3;
2726       index = (imm5 >> 4) & 0x1;
2727     }
2728   else
2729     HALT_UNALLOC;
2730 
2731   switch (size)
2732     {
2733     case 0:
2734       aarch64_set_reg_u32 (cpu, rd, NO_SP,
2735 			   aarch64_get_vec_u8 (cpu, vs, index));
2736       break;
2737 
2738     case 1:
2739       aarch64_set_reg_u32 (cpu, rd, NO_SP,
2740 			   aarch64_get_vec_u16 (cpu, vs, index));
2741       break;
2742 
2743     case 2:
2744       aarch64_set_reg_u32 (cpu, rd, NO_SP,
2745 			   aarch64_get_vec_u32 (cpu, vs, index));
2746       break;
2747 
2748     case 3:
2749       aarch64_set_reg_u64 (cpu, rd, NO_SP,
2750 			   aarch64_get_vec_u64 (cpu, vs, index));
2751       break;
2752 
2753     default:
2754       HALT_UNALLOC;
2755     }
2756 }
2757 
2758 static void
2759 do_vec_INS (sim_cpu *cpu)
2760 {
2761   /* instr[31,21] = 01001110000
2762      instr[20,16] = element size and index
2763      instr[15,10] = 000111
2764      instr[9,5]   = W source
2765      instr[4,0]   = V dest  */
2766 
2767   int index;
2768   unsigned rs = INSTR (9, 5);
2769   unsigned vd = INSTR (4, 0);
2770 
2771   NYI_assert (31, 21, 0x270);
2772   NYI_assert (15, 10, 0x07);
2773 
2774   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2775   if (INSTR (16, 16))
2776     {
2777       index = INSTR (20, 17);
2778       aarch64_set_vec_u8 (cpu, vd, index,
2779 			  aarch64_get_reg_u8 (cpu, rs, NO_SP));
2780     }
2781   else if (INSTR (17, 17))
2782     {
2783       index = INSTR (20, 18);
2784       aarch64_set_vec_u16 (cpu, vd, index,
2785 			   aarch64_get_reg_u16 (cpu, rs, NO_SP));
2786     }
2787   else if (INSTR (18, 18))
2788     {
2789       index = INSTR (20, 19);
2790       aarch64_set_vec_u32 (cpu, vd, index,
2791 			   aarch64_get_reg_u32 (cpu, rs, NO_SP));
2792     }
2793   else if (INSTR (19, 19))
2794     {
2795       index = INSTR (20, 20);
2796       aarch64_set_vec_u64 (cpu, vd, index,
2797 			   aarch64_get_reg_u64 (cpu, rs, NO_SP));
2798     }
2799   else
2800     HALT_NYI;
2801 }
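
/* The element size is given by the lowest set bit of instr[20,16]:
   e.g. imm5 = 0b00110 has bit 1 as its lowest set bit, selecting
   16-bit elements with index instr[20,18] = 1, so INS v0.h[1], w1
   writes half-word 1 of the destination vector.  */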
2802 
2803 static void
2804 do_vec_DUP_vector_into_vector (sim_cpu *cpu)
2805 {
2806   /* instr[31]    = 0
2807      instr[30]    = half(0)/full(1)
2808      instr[29,21] = 00 1110 000
2809      instr[20,16] = element size and index
2810      instr[15,10] = 0000 01
2811      instr[9,5]   = V source
2812      instr[4,0]   = V dest.  */
2813 
2814   unsigned full = INSTR (30, 30);
2815   unsigned vs = INSTR (9, 5);
2816   unsigned vd = INSTR (4, 0);
2817   int i, index;
2818 
2819   NYI_assert (29, 21, 0x070);
2820   NYI_assert (15, 10, 0x01);
2821 
2822   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2823   if (INSTR (16, 16))
2824     {
2825       index = INSTR (20, 17);
2826 
2827       for (i = 0; i < (full ? 16 : 8); i++)
2828 	aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vs, index));
2829     }
2830   else if (INSTR (17, 17))
2831     {
2832       index = INSTR (20, 18);
2833 
2834       for (i = 0; i < (full ? 8 : 4); i++)
2835 	aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vs, index));
2836     }
2837   else if (INSTR (18, 18))
2838     {
2839       index = INSTR (20, 19);
2840 
2841       for (i = 0; i < (full ? 4 : 2); i++)
2842 	aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vs, index));
2843     }
2844   else
2845     {
2846       if (INSTR (19, 19) == 0)
2847 	HALT_UNALLOC;
2848 
2849       if (! full)
2850 	HALT_UNALLOC;
2851 
2852       index = INSTR (20, 20);
2853 
2854       for (i = 0; i < 2; i++)
2855 	aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vs, index));
2856     }
2857 }
2858 
2859 static void
2860 do_vec_TBL (sim_cpu *cpu)
2861 {
2862   /* instr[31]    = 0
2863      instr[30]    = half(0)/full(1)
2864      instr[29,21] = 00 1110 000
2865      instr[20,16] = Vm
2866      instr[15]    = 0
2867      instr[14,13] = vec length
2868      instr[12,10] = 000
2869      instr[9,5]   = V start
2870      instr[4,0]   = V dest  */
2871 
2872   int full    = INSTR (30, 30);
2873   int len     = INSTR (14, 13) + 1;
2874   unsigned vm = INSTR (20, 16);
2875   unsigned vn = INSTR (9, 5);
2876   unsigned vd = INSTR (4, 0);
2877   unsigned i;
2878 
2879   NYI_assert (29, 21, 0x070);
2880   NYI_assert (12, 10, 0);
2881 
2882   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2883   for (i = 0; i < (full ? 16 : 8); i++)
2884     {
2885       unsigned int selector = aarch64_get_vec_u8 (cpu, vm, i);
2886       uint8_t val;
2887 
2888       if (selector < 16)
2889 	val = aarch64_get_vec_u8 (cpu, vn, selector);
2890       else if (selector < 32)
2891 	val = len < 2 ? 0 : aarch64_get_vec_u8 (cpu, vn + 1, selector - 16);
2892       else if (selector < 48)
2893 	val = len < 3 ? 0 : aarch64_get_vec_u8 (cpu, vn + 2, selector - 32);
2894       else if (selector < 64)
2895 	val = len < 4 ? 0 : aarch64_get_vec_u8 (cpu, vn + 3, selector - 48);
2896       else
2897 	val = 0;
2898 
2899       aarch64_set_vec_u8 (cpu, vd, i, val);
2900     }
2901 }
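
/* Illustrative lookup: with len == 1 (a single-register table) a
   selector byte of 5 fetches byte 5 of Vn, while any selector of 16
   or more falls outside the table and stores 0 in that lane.  */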
2902 
2903 static void
2904 do_vec_TRN (sim_cpu *cpu)
2905 {
2906   /* instr[31]    = 0
2907      instr[30]    = half(0)/full(1)
2908      instr[29,24] = 00 1110
2909      instr[23,22] = size
2910      instr[21]    = 0
2911      instr[20,16] = Vm
2912      instr[15]    = 0
2913      instr[14]    = TRN1 (0) / TRN2 (1)
2914      instr[13,10] = 1010
2915      instr[9,5]   = V source
2916      instr[4,0]   = V dest.  */
2917 
2918   int full    = INSTR (30, 30);
2919   int second  = INSTR (14, 14);
2920   unsigned vm = INSTR (20, 16);
2921   unsigned vn = INSTR (9, 5);
2922   unsigned vd = INSTR (4, 0);
2923   unsigned i;
2924 
2925   NYI_assert (29, 24, 0x0E);
2926   NYI_assert (13, 10, 0xA);
2927 
2928   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2929   switch (INSTR (23, 22))
2930     {
2931     case 0:
2932       for (i = 0; i < (full ? 8 : 4); i++)
2933 	{
2934 	  aarch64_set_vec_u8
2935 	    (cpu, vd, i * 2,
2936 	     aarch64_get_vec_u8 (cpu, second ? vm : vn, i * 2));
2937 	  aarch64_set_vec_u8
2938 	    (cpu, vd, i * 2 + 1,
2939 	     aarch64_get_vec_u8 (cpu, second ? vn : vm, i * 2 + 1));
2940 	}
2941       break;
2942 
2943     case 1:
2944       for (i = 0; i < (full ? 4 : 2); i++)
2945 	{
2946 	  aarch64_set_vec_u16
2947 	    (cpu, vd, i * 2,
2948 	     aarch64_get_vec_u16 (cpu, second ? vm : vn, i * 2));
2949 	  aarch64_set_vec_u16
2950 	    (cpu, vd, i * 2 + 1,
2951 	     aarch64_get_vec_u16 (cpu, second ? vn : vm, i * 2 + 1));
2952 	}
2953       break;
2954 
2955     case 2:
2956       aarch64_set_vec_u32
2957 	(cpu, vd, 0, aarch64_get_vec_u32 (cpu, second ? vm : vn, 0));
2958       aarch64_set_vec_u32
2959 	(cpu, vd, 1, aarch64_get_vec_u32 (cpu, second ? vn : vm, 1));
2960       aarch64_set_vec_u32
2961 	(cpu, vd, 2, aarch64_get_vec_u32 (cpu, second ? vm : vn, 2));
2962       aarch64_set_vec_u32
2963 	(cpu, vd, 3, aarch64_get_vec_u32 (cpu, second ? vn : vm, 3));
2964       break;
2965 
2966     case 3:
2967       if (! full)
2968 	HALT_UNALLOC;
2969 
2970       aarch64_set_vec_u64 (cpu, vd, 0,
2971 			   aarch64_get_vec_u64 (cpu, second ? vm : vn, 0));
2972       aarch64_set_vec_u64 (cpu, vd, 1,
2973 			   aarch64_get_vec_u64 (cpu, second ? vn : vm, 1));
2974       break;
2975     }
2976 }
2977 
2978 static void
2979 do_vec_DUP_scalar_into_vector (sim_cpu *cpu)
2980 {
2981   /* instr[31]    = 0
2982      instr[30]    = 0=> zero top 64-bits, 1=> duplicate into top 64-bits
2983                     [must be 1 for 64-bit xfer]
2984      instr[29,20] = 00 1110 0000
2985      instr[19,16] = element size: 0001=> 8-bits, 0010=> 16-bits,
2986                                   0100=> 32-bits, 1000=> 64-bits
2987      instr[15,10] = 0000 11
2988      instr[9,5]   = W source
2989      instr[4,0]   = V dest.  */
2990 
2991   unsigned i;
2992   unsigned Vd = INSTR (4, 0);
2993   unsigned Rs = INSTR (9, 5);
2994   int both    = INSTR (30, 30);
2995 
2996   NYI_assert (29, 20, 0x0E0);
2997   NYI_assert (15, 10, 0x03);
2998 
2999   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3000   switch (INSTR (19, 16))
3001     {
3002     case 1:
3003       for (i = 0; i < (both ? 16 : 8); i++)
3004 	aarch64_set_vec_u8 (cpu, Vd, i, aarch64_get_reg_u8 (cpu, Rs, NO_SP));
3005       break;
3006 
3007     case 2:
3008       for (i = 0; i < (both ? 8 : 4); i++)
3009 	aarch64_set_vec_u16 (cpu, Vd, i, aarch64_get_reg_u16 (cpu, Rs, NO_SP));
3010       break;
3011 
3012     case 4:
3013       for (i = 0; i < (both ? 4 : 2); i++)
3014 	aarch64_set_vec_u32 (cpu, Vd, i, aarch64_get_reg_u32 (cpu, Rs, NO_SP));
3015       break;
3016 
3017     case 8:
3018       if (!both)
3019 	HALT_NYI;
3020       aarch64_set_vec_u64 (cpu, Vd, 0, aarch64_get_reg_u64 (cpu, Rs, NO_SP));
3021       aarch64_set_vec_u64 (cpu, Vd, 1, aarch64_get_reg_u64 (cpu, Rs, NO_SP));
3022       break;
3023 
3024     default:
3025       HALT_NYI;
3026     }
3027 }
3028 
3029 static void
3030 do_vec_UZP (sim_cpu *cpu)
3031 {
3032   /* instr[31]    = 0
3033      instr[30]    = half(0)/full(1)
3034      instr[29,24] = 00 1110
3035      instr[23,22] = size: byte(00), half(01), word (10), long (11)
3036      instr[21]    = 0
3037      instr[20,16] = Vm
3038      instr[15]    = 0
3039      instr[14]    = lower (0) / upper (1)
3040      instr[13,10] = 0110
3041      instr[9,5]   = Vn
3042      instr[4,0]   = Vd.  */
3043 
3044   int full = INSTR (30, 30);
3045   int upper = INSTR (14, 14);
3046 
3047   unsigned vm = INSTR (20, 16);
3048   unsigned vn = INSTR (9, 5);
3049   unsigned vd = INSTR (4, 0);
3050 
3051   uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
3052   uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
3053   uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
3054   uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
3055 
3056   uint64_t val1;
3057   uint64_t val2;
3058 
3059   uint64_t input2 = full ? val_n2 : val_m1;
3060 
3061   NYI_assert (29, 24, 0x0E);
3062   NYI_assert (21, 21, 0);
3063   NYI_assert (15, 15, 0);
3064   NYI_assert (13, 10, 6);
3065 
3066   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3067   switch (INSTR (23, 22))
3068     {
3069     case 0:
3070       val1 = (val_n1 >> (upper * 8)) & 0xFFULL;
3071       val1 |= (val_n1 >> ((upper * 8) + 8)) & 0xFF00ULL;
3072       val1 |= (val_n1 >> ((upper * 8) + 16)) & 0xFF0000ULL;
3073       val1 |= (val_n1 >> ((upper * 8) + 24)) & 0xFF000000ULL;
3074 
3075       val1 |= (input2 << (32 - (upper * 8))) & 0xFF00000000ULL;
3076       val1 |= (input2 << (24 - (upper * 8))) & 0xFF0000000000ULL;
3077       val1 |= (input2 << (16 - (upper * 8))) & 0xFF000000000000ULL;
3078       val1 |= (input2 << (8 - (upper * 8))) & 0xFF00000000000000ULL;
3079 
3080       if (full)
3081 	{
3082 	  val2 = (val_m1 >> (upper * 8)) & 0xFFULL;
3083 	  val2 |= (val_m1 >> ((upper * 8) + 8)) & 0xFF00ULL;
3084 	  val2 |= (val_m1 >> ((upper * 8) + 16)) & 0xFF0000ULL;
3085 	  val2 |= (val_m1 >> ((upper * 8) + 24)) & 0xFF000000ULL;
3086 
3087 	  val2 |= (val_m2 << (32 - (upper * 8))) & 0xFF00000000ULL;
3088 	  val2 |= (val_m2 << (24 - (upper * 8))) & 0xFF0000000000ULL;
3089 	  val2 |= (val_m2 << (16 - (upper * 8))) & 0xFF000000000000ULL;
3090 	  val2 |= (val_m2 << (8 - (upper * 8))) & 0xFF00000000000000ULL;
3091 	}
3092       break;
3093 
3094     case 1:
3095       val1 = (val_n1 >> (upper * 16)) & 0xFFFFULL;
3096       val1 |= (val_n1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL;
3097 
3098       val1 |= (input2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;
3099       val1 |= (input2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL;
3100 
3101       if (full)
3102 	{
3103 	  val2 = (val_m1 >> (upper * 16)) & 0xFFFFULL;
3104 	  val2 |= (val_m1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL;
3105 
3106 	  val2 |= (val_m2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;
3107 	  val2 |= (val_m2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL;
3108 	}
3109       break;
3110 
3111     case 2:
3112       val1 = (val_n1 >> (upper * 32)) & 0xFFFFFFFF;
3113       val1 |= (input2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL;
3114 
3115       if (full)
3116 	{
3117 	  val2 = (val_m1 >> (upper * 32)) & 0xFFFFFFFF;
3118 	  val2 |= (val_m2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL;
3119 	}
3120       break;
3121 
3122     case 3:
3123       if (! full)
3124 	HALT_UNALLOC;
3125 
3126       val1 = upper ? val_n2 : val_n1;
3127       val2 = upper ? val_m2 : val_m1;
3128       break;
3129     }
3130 
3131   aarch64_set_vec_u64 (cpu, vd, 0, val1);
3132   if (full)
3133     aarch64_set_vec_u64 (cpu, vd, 1, val2);
3134 }
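
/* E.g. UZP1 v0.8b, v1.8b, v2.8b (upper == 0) keeps the even-numbered
   bytes of the v2:v1 concatenation; UZP2 (upper == 1) keeps the
   odd-numbered ones.  */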
3135 
3136 static void
3137 do_vec_ZIP (sim_cpu *cpu)
3138 {
3139   /* instr[31]    = 0
3140      instr[30]    = half(0)/full(1)
3141      instr[29,24] = 00 1110
3142      instr[23,22] = size: byte(00), half(01), word (10), long (11)
3143      instr[21]    = 0
3144      instr[20,16] = Vm
3145      instr[15]    = 0
3146      instr[14]    = lower (0) / upper (1)
3147      instr[13,10] = 1110
3148      instr[9,5]   = Vn
3149      instr[4,0]   = Vd.  */
3150 
3151   int full = INSTR (30, 30);
3152   int upper = INSTR (14, 14);
3153 
3154   unsigned vm = INSTR (20, 16);
3155   unsigned vn = INSTR (9, 5);
3156   unsigned vd = INSTR (4, 0);
3157 
3158   uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
3159   uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
3160   uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
3161   uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
3162 
3163   uint64_t val1 = 0;
3164   uint64_t val2 = 0;
3165 
3166   uint64_t input1 = upper ? val_n2 : val_n1;
3167   uint64_t input2 = upper ? val_m2 : val_m1;
3168 
3169   NYI_assert (29, 24, 0x0E);
3170   NYI_assert (21, 21, 0);
3171   NYI_assert (15, 15, 0);
3172   NYI_assert (13, 10, 0xE);
3173 
3174   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3175   switch (INSTR (23, 22))
3176     {
3177     case 0:
3178       val1 =
3179 	  ((input1 <<  0) & (0xFF    <<  0))
3180 	| ((input2 <<  8) & (0xFF    <<  8))
3181 	| ((input1 <<  8) & (0xFF    << 16))
3182 	| ((input2 << 16) & (0xFF    << 24))
3183 	| ((input1 << 16) & (0xFFULL << 32))
3184 	| ((input2 << 24) & (0xFFULL << 40))
3185 	| ((input1 << 24) & (0xFFULL << 48))
3186 	| ((input2 << 32) & (0xFFULL << 56));
3187 
3188       val2 =
3189 	  ((input1 >> 32) & (0xFF    <<  0))
3190 	| ((input2 >> 24) & (0xFF    <<  8))
3191 	| ((input1 >> 24) & (0xFF    << 16))
3192 	| ((input2 >> 16) & (0xFF    << 24))
3193 	| ((input1 >> 16) & (0xFFULL << 32))
3194 	| ((input2 >>  8) & (0xFFULL << 40))
3195 	| ((input1 >>  8) & (0xFFULL << 48))
3196 	| ((input2 >>  0) & (0xFFULL << 56));
3197       break;
3198 
3199     case 1:
3200       val1 =
3201 	  ((input1 <<  0) & (0xFFFF    <<  0))
3202 	| ((input2 << 16) & (0xFFFF    << 16))
3203 	| ((input1 << 16) & (0xFFFFULL << 32))
3204 	| ((input2 << 32) & (0xFFFFULL << 48));
3205 
3206       val2 =
3207 	  ((input1 >> 32) & (0xFFFF    <<  0))
3208 	| ((input2 >> 16) & (0xFFFF    << 16))
3209 	| ((input1 >> 16) & (0xFFFFULL << 32))
3210 	| ((input2 >>  0) & (0xFFFFULL << 48));
3211       break;
3212 
3213     case 2:
3214       val1 = (input1 & 0xFFFFFFFFULL) | (input2 << 32);
3215       val2 = ((input1 >> 32) & 0xFFFFFFFFULL) | (input2 & 0xFFFFFFFF00000000ULL);
3216       break;
3217 
3218     case 3:
3219       val1 = input1;
3220       val2 = input2;
3221       break;
3222     }
3223 
3224   aarch64_set_vec_u64 (cpu, vd, 0, val1);
3225   if (full)
3226     aarch64_set_vec_u64 (cpu, vd, 1, val2);
3227 }
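
/* E.g. ZIP1 v0.2d, v1.2d, v2.2d (case 3 above, upper == 0) places
   lane 0 of Vn in lane 0 of the destination and lane 0 of Vm in
   lane 1; ZIP2 interleaves lane 1 of each source instead.  */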
3228 
3229 /* Floating point immediates are encoded in 8 bits.
3230    fpimm[7] = sign bit.
3231    fpimm[6:4] = signed exponent.
3232    fpimm[3:0] = fraction (assuming leading 1).
3233    i.e. F = s * 1.f * 2^(e - b).  */
3234 
3235 static float
3236 fp_immediate_for_encoding_32 (uint32_t imm8)
3237 {
3238   float u;
3239   uint32_t s, e, f, i;
3240 
3241   s = (imm8 >> 7) & 0x1;
3242   e = (imm8 >> 4) & 0x7;
3243   f = imm8 & 0xf;
3244 
3245   /* The fp value is +/- n/16 * 2^r where n is 16+f.  */
3246   u = (16.0 + f) / 16.0;
3247 
3248   /* N.B. exponent is signed.  */
3249   if (e < 4)
3250     {
3251       int epos = e;
3252 
3253       for (i = 0; i <= epos; i++)
3254 	u *= 2.0;
3255     }
3256   else
3257     {
3258       int eneg = 7 - e;
3259 
3260       for (i = 0; i < eneg; i++)
3261 	u /= 2.0;
3262     }
3263 
3264   if (s)
3265     u = - u;
3266 
3267   return u;
3268 }
3269 
3270 static double
3271 fp_immediate_for_encoding_64 (uint32_t imm8)
3272 {
3273   double u;
3274   uint32_t s, e, f, i;
3275 
3276   s = (imm8 >> 7) & 0x1;
3277   e = (imm8 >> 4) & 0x7;
3278   f = imm8 & 0xf;
3279 
3280   /* The fp value is +/- n/16 * 2^r where n is 16+f.  */
3281   u = (16.0 + f) / 16.0;
3282 
3283   /* N.B. exponent is signed.  */
3284   if (e < 4)
3285     {
3286       int epos = e;
3287 
3288       for (i = 0; i <= epos; i++)
3289 	u *= 2.0;
3290     }
3291   else
3292     {
3293       int eneg = 7 - e;
3294 
3295       for (i = 0; i < eneg; i++)
3296 	u /= 2.0;
3297     }
3298 
3299   if (s)
3300     u = - u;
3301 
3302   return u;
3303 }
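
/* Example decodes: imm8 = 0x70 has s = 0, e = 7, f = 0, so u starts
   at 16/16 == 1.0 and the exponent loop leaves it unchanged, giving
   #1.0; imm8 = 0xb0 (s = 1, e = 3, f = 0) is scaled by 2^4 and
   negated, giving #-16.0.  */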
3304 
3305 static void
3306 do_vec_MOV_immediate (sim_cpu *cpu)
3307 {
3308   /* instr[31]    = 0
3309      instr[30]    = full/half selector
3310      instr[29,19] = 00111100000
3311      instr[18,16] = high 3 bits of uimm8
3312      instr[15,12] = size & shift:
3313                                   0000 => 32-bit
3314                                   0010 => 32-bit + LSL#8
3315                                   0100 => 32-bit + LSL#16
3316                                   0110 => 32-bit + LSL#24
3317                                   1010 => 16-bit + LSL#8
3318                                   1000 => 16-bit
3319                                   1101 => 32-bit + MSL#16
3320                                   1100 => 32-bit + MSL#8
3321                                   1110 => 8-bit
3322                                   1111 => double
3323      instr[11,10] = 01
3324      instr[9,5]   = low 5-bits of uimm8
3325      instr[4,0]   = Vd.  */
3326 
3327   int full     = INSTR (30, 30);
3328   unsigned vd  = INSTR (4, 0);
3329   unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
3330   unsigned i;
3331 
3332   NYI_assert (29, 19, 0x1E0);
3333   NYI_assert (11, 10, 1);
3334 
3335   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3336   switch (INSTR (15, 12))
3337     {
3338     case 0x0: /* 32-bit, no shift.  */
3339     case 0x2: /* 32-bit, shift by 8.  */
3340     case 0x4: /* 32-bit, shift by 16.  */
3341     case 0x6: /* 32-bit, shift by 24.  */
3342       val <<= (8 * INSTR (14, 13));
3343       for (i = 0; i < (full ? 4 : 2); i++)
3344 	aarch64_set_vec_u32 (cpu, vd, i, val);
3345       break;
3346 
3347     case 0xa: /* 16-bit, shift by 8.  */
3348       val <<= 8;
3349       ATTRIBUTE_FALLTHROUGH;
3350     case 0x8: /* 16-bit, no shift.  */
3351       for (i = 0; i < (full ? 8 : 4); i++)
3352 	aarch64_set_vec_u16 (cpu, vd, i, val);
3353       break;
3354 
3355     case 0xd: /* 32-bit, mask shift by 16.  */
3356       val <<= 8;
3357       val |= 0xFF;
3358       ATTRIBUTE_FALLTHROUGH;
3359     case 0xc: /* 32-bit, mask shift by 8. */
3360       val <<= 8;
3361       val |= 0xFF;
3362       for (i = 0; i < (full ? 4 : 2); i++)
3363 	aarch64_set_vec_u32 (cpu, vd, i, val);
3364       break;
3365 
3366     case 0xe: /* 8-bit, no shift.  */
3367       for (i = 0; i < (full ? 16 : 8); i++)
3368 	aarch64_set_vec_u8 (cpu, vd, i, val);
3369       break;
3370 
3371     case 0xf: /* FMOV Vd.{2|4}S, #fpimm.  */
3372       {
3373 	float u = fp_immediate_for_encoding_32 (val);
3374 	for (i = 0; i < (full ? 4 : 2); i++)
3375 	  aarch64_set_vec_float (cpu, vd, i, u);
3376 	break;
3377       }
3378 
3379     default:
3380       HALT_NYI;
3381     }
3382 }
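
/* MSL ("shift ones in") sketch: selector 0xd shifts uimm8 left by
   16 and fills the vacated low bits with ones, so uimm8 = 0x12 is
   replicated as 0x0012ffff in every 32-bit lane.  */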
3383 
3384 static void
3385 do_vec_MVNI (sim_cpu *cpu)
3386 {
3387   /* instr[31]    = 0
3388      instr[30]    = full/half selector
3389      instr[29,19] = 10111100000
3390      instr[18,16] = high 3 bits of uimm8
3391      instr[15,12] = selector
3392      instr[11,10] = 01
3393      instr[9,5]   = low 5-bits of uimm8
3394      instr[4,0]   = Vd.  */
3395 
3396   int full     = INSTR (30, 30);
3397   unsigned vd  = INSTR (4, 0);
3398   unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
3399   unsigned i;
3400 
3401   NYI_assert (29, 19, 0x5E0);
3402   NYI_assert (11, 10, 1);
3403 
3404   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3405   switch (INSTR (15, 12))
3406     {
3407     case 0x0: /* 32-bit, no shift.  */
3408     case 0x2: /* 32-bit, shift by 8.  */
3409     case 0x4: /* 32-bit, shift by 16.  */
3410     case 0x6: /* 32-bit, shift by 24.  */
3411       val <<= (8 * INSTR (14, 13));
3412       val = ~ val;
3413       for (i = 0; i < (full ? 4 : 2); i++)
3414 	aarch64_set_vec_u32 (cpu, vd, i, val);
3415       return;
3416 
3417     case 0xa: /* 16-bit, 8 bit shift. */
3418       val <<= 8;
3419       ATTRIBUTE_FALLTHROUGH;
3420     case 0x8: /* 16-bit, no shift. */
3421       val = ~ val;
3422       for (i = 0; i < (full ? 8 : 4); i++)
3423 	aarch64_set_vec_u16 (cpu, vd, i, val);
3424       return;
3425 
3426     case 0xd: /* 32-bit, mask shift by 16.  */
3427       val <<= 8;
3428       val |= 0xFF;
3429       ATTRIBUTE_FALLTHROUGH;
3430     case 0xc: /* 32-bit, mask shift by 8. */
3431       val <<= 8;
3432       val |= 0xFF;
3433       val = ~ val;
3434       for (i = 0; i < (full ? 4 : 2); i++)
3435 	aarch64_set_vec_u32 (cpu, vd, i, val);
3436       return;
3437 
3438     case 0xE: /* MOVI Dn, #mask64 */
3439       {
3440 	uint64_t mask = 0;
3441 
3442 	for (i = 0; i < 8; i++)
3443 	  if (val & (1 << i))
3444 	    mask |= (0xFFULL << (i * 8));
3445 	aarch64_set_vec_u64 (cpu, vd, 0, mask);
3446 	aarch64_set_vec_u64 (cpu, vd, 1, mask);
3447 	return;
3448       }
3449 
3450     case 0xf: /* FMOV Vd.2D, #fpimm.  */
3451       {
3452 	double u = fp_immediate_for_encoding_64 (val);
3453 
3454 	if (! full)
3455 	  HALT_UNALLOC;
3456 
3457 	aarch64_set_vec_double (cpu, vd, 0, u);
3458 	aarch64_set_vec_double (cpu, vd, 1, u);
3459 	return;
3460       }
3461 
3462     default:
3463       HALT_NYI;
3464     }
3465 }
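
/* The case 0xE path expands each bit of uimm8 to a byte of 0x00 or
   0xff: e.g. val = 0x81 (bits 0 and 7 set) produces the 64-bit mask
   0xff000000000000ff in both halves of Vd.  */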
3466 
3467 #define ABS(A) ((A) < 0 ? - (A) : (A))
3468 
3469 static void
3470 do_vec_ABS (sim_cpu *cpu)
3471 {
3472   /* instr[31]    = 0
3473      instr[30]    = half(0)/full(1)
3474      instr[29,24] = 00 1110
3475      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3476      instr[21,10] = 10 0000 1011 10
3477      instr[9,5]   = Vn
3478      instr[4,0]   = Vd.  */
3479 
3480   unsigned vn = INSTR (9, 5);
3481   unsigned vd = INSTR (4, 0);
3482   unsigned full = INSTR (30, 30);
3483   unsigned i;
3484 
3485   NYI_assert (29, 24, 0x0E);
3486   NYI_assert (21, 10, 0x82E);
3487 
3488   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3489   switch (INSTR (23, 22))
3490     {
3491     case 0:
3492       for (i = 0; i < (full ? 16 : 8); i++)
3493 	aarch64_set_vec_s8 (cpu, vd, i,
3494 			    ABS (aarch64_get_vec_s8 (cpu, vn, i)));
3495       break;
3496 
3497     case 1:
3498       for (i = 0; i < (full ? 8 : 4); i++)
3499 	aarch64_set_vec_s16 (cpu, vd, i,
3500 			     ABS (aarch64_get_vec_s16 (cpu, vn, i)));
3501       break;
3502 
3503     case 2:
3504       for (i = 0; i < (full ? 4 : 2); i++)
3505 	aarch64_set_vec_s32 (cpu, vd, i,
3506 			     ABS (aarch64_get_vec_s32 (cpu, vn, i)));
3507       break;
3508 
3509     case 3:
3510       if (! full)
3511 	HALT_NYI;
3512       for (i = 0; i < 2; i++)
3513 	aarch64_set_vec_s64 (cpu, vd, i,
3514 			     ABS (aarch64_get_vec_s64 (cpu, vn, i)));
3515       break;
3516     }
3517 }
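
/* Note: for the most negative representable value the ABS macro above
   wraps when the result is truncated back to the element width (e.g.
   ABS of an int8_t -128 is stored as -128 again), which matches the
   two's-complement behaviour of the AArch64 ABS instruction itself.  */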
3518 
3519 static void
3520 do_vec_ADDV (sim_cpu *cpu)
3521 {
3522   /* instr[31]    = 0
3523      instr[30]    = full/half selector
3524      instr[29,24] = 00 1110
3525      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3526      instr[21,10] = 11 0001 1011 10
3527      instr[9,5]   = Vm
3528      instr[4,0]   = Rd.  */
3529 
3530   unsigned vm = INSTR (9, 5);
3531   unsigned rd = INSTR (4, 0);
3532   unsigned i;
3533   int      full = INSTR (30, 30);
3534 
3535   NYI_assert (29, 24, 0x0E);
3536   NYI_assert (21, 10, 0xC6E);
3537 
3538   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3539   switch (INSTR (23, 22))
3540     {
3541     case 0:
3542       {
3543 	uint8_t val = 0;
3544 	for (i = 0; i < (full ? 16 : 8); i++)
3545 	  val += aarch64_get_vec_u8 (cpu, vm, i);
3546 	aarch64_set_vec_u64 (cpu, rd, 0, val);
3547 	return;
3548       }
3549 
3550     case 1:
3551       {
3552 	uint16_t val = 0;
3553 	for (i = 0; i < (full ? 8 : 4); i++)
3554 	  val += aarch64_get_vec_u16 (cpu, vm, i);
3555 	aarch64_set_vec_u64 (cpu, rd, 0, val);
3556 	return;
3557       }
3558 
3559     case 2:
3560       {
3561 	uint32_t val = 0;
3562 	if (! full)
3563 	  HALT_UNALLOC;
3564 	for (i = 0; i < 4; i++)
3565 	  val += aarch64_get_vec_u32 (cpu, vm, i);
3566 	aarch64_set_vec_u64 (cpu, rd, 0, val);
3567 	return;
3568       }
3569 
3570     case 3:
3571       HALT_UNALLOC;
3572     }
3573 }
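
/* The accumulators above deliberately use the element width (uint8_t,
   uint16_t, ...) so that the sum wraps just as it does in the real
   ADDV destination register: e.g. sixteen 0xFF bytes reduce to
   (16 * 255) & 0xFF == 0xF0.  */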
3574 
3575 static void
3576 do_vec_ins_2 (sim_cpu *cpu)
3577 {
3578   /* instr[31,21] = 01001110000
3579      instr[20,18] = size & element selector
3580      instr[17,14] = 0000
3581      instr[13]    = direction: to vec(0), from vec (1)
3582      instr[12,10] = 111
3583      instr[9,5]   = Vm
3584      instr[4,0]   = Vd.  */
3585 
3586   unsigned elem;
3587   unsigned vm = INSTR (9, 5);
3588   unsigned vd = INSTR (4, 0);
3589 
3590   NYI_assert (31, 21, 0x270);
3591   NYI_assert (17, 14, 0);
3592   NYI_assert (12, 10, 7);
3593 
3594   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3595   if (INSTR (13, 13) == 1)
3596     {
3597       if (INSTR (18, 18) == 1)
3598 	{
3599 	  /* 32-bit moves.  */
3600 	  elem = INSTR (20, 19);
3601 	  aarch64_set_reg_u64 (cpu, vd, NO_SP,
3602 			       aarch64_get_vec_u32 (cpu, vm, elem));
3603 	}
3604       else
3605 	{
3606 	  /* 64-bit moves.  */
3607 	  if (INSTR (19, 19) != 1)
3608 	    HALT_NYI;
3609 
3610 	  elem = INSTR (20, 20);
3611 	  aarch64_set_reg_u64 (cpu, vd, NO_SP,
3612 			       aarch64_get_vec_u64 (cpu, vm, elem));
3613 	}
3614     }
3615   else
3616     {
3617       if (INSTR (18, 18) == 1)
3618 	{
3619 	  /* 32-bit moves.  */
3620 	  elem = INSTR (20, 19);
3621 	  aarch64_set_vec_u32 (cpu, vd, elem,
3622 			       aarch64_get_reg_u32 (cpu, vm, NO_SP));
3623 	}
3624       else
3625 	{
3626 	  /* 64-bit moves.  */
3627 	  if (INSTR (19, 19) != 1)
3628 	    HALT_NYI;
3629 
3630 	  elem = INSTR (20, 20);
3631 	  aarch64_set_vec_u64 (cpu, vd, elem,
3632 			       aarch64_get_reg_u64 (cpu, vm, NO_SP));
3633 	}
3634     }
3635 }
3636 
3637 #define DO_VEC_WIDENING_MUL(N, DST_TYPE, READ_TYPE, WRITE_TYPE)	  \
3638   do								  \
3639     {								  \
3640       DST_TYPE a[N], b[N];					  \
3641 								  \
3642       for (i = 0; i < (N); i++)					  \
3643 	{							  \
3644 	  a[i] = aarch64_get_vec_##READ_TYPE (cpu, vn, i + bias); \
3645 	  b[i] = aarch64_get_vec_##READ_TYPE (cpu, vm, i + bias); \
3646 	}							  \
3647       for (i = 0; i < (N); i++)					  \
3648 	aarch64_set_vec_##WRITE_TYPE (cpu, vd, i, a[i] * b[i]);	  \
3649     }								  \
3650   while (0)
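
/* DO_VEC_WIDENING_MUL reads both source vectors into local arrays
   before writing any results, so an instruction whose destination
   overlaps a source (e.g. SMULL v0.8h, v0.8b, v1.8b) still sees the
   original narrow elements.  The BIAS variable selects the upper
   source halves for the "2" (second part) instruction forms.  */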
3651 
3652 static void
3653 do_vec_mull (sim_cpu *cpu)
3654 {
3655   /* instr[31]    = 0
3656      instr[30]    = lower(0)/upper(1) selector
3657      instr[29]    = signed(0)/unsigned(1)
3658      instr[28,24] = 0 1110
3659      instr[23,22] = size: 8-bit (00), 16-bit (01), 32-bit (10)
3660      instr[21]    = 1
3661      instr[20,16] = Vm
3662      instr[15,10] = 11 0000
3663      instr[9,5]   = Vn
3664      instr[4,0]   = Vd.  */
3665 
3666   int    unsign = INSTR (29, 29);
3667   int    bias = INSTR (30, 30);
3668   unsigned vm = INSTR (20, 16);
3669   unsigned vn = INSTR ( 9,  5);
3670   unsigned vd = INSTR ( 4,  0);
3671   unsigned i;
3672 
3673   NYI_assert (28, 24, 0x0E);
3674   NYI_assert (15, 10, 0x30);
3675 
3676   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3677   /* NB: Read source values before writing results, in case
3678      the source and destination vectors are the same.  */
3679   switch (INSTR (23, 22))
3680     {
3681     case 0:
3682       if (bias)
3683 	bias = 8;
3684       if (unsign)
3685 	DO_VEC_WIDENING_MUL (8, uint16_t, u8, u16);
3686       else
3687 	DO_VEC_WIDENING_MUL (8, int16_t, s8, s16);
3688       return;
3689 
3690     case 1:
3691       if (bias)
3692 	bias = 4;
3693       if (unsign)
3694 	DO_VEC_WIDENING_MUL (4, uint32_t, u16, u32);
3695       else
3696 	DO_VEC_WIDENING_MUL (4, int32_t, s16, s32);
3697       return;
3698 
3699     case 2:
3700       if (bias)
3701 	bias = 2;
3702       if (unsign)
3703 	DO_VEC_WIDENING_MUL (2, uint64_t, u32, u64);
3704       else
3705 	DO_VEC_WIDENING_MUL (2, int64_t, s32, s64);
3706       return;
3707 
3708     case 3:
3709       HALT_NYI;
3710     }
3711 }
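
/* Example: SMULL2 Vd.8H, Vn.16B, Vm.16B sets bias to 8 above, so the
   widening multiply consumes source bytes 8..15 and produces eight
   16-bit products.  */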
3712 
3713 static void
3714 do_vec_fadd (sim_cpu *cpu)
3715 {
3716   /* instr[31]    = 0
3717      instr[30]    = half(0)/full(1)
3718      instr[29,24] = 001110
3719      instr[23]    = FADD(0)/FSUB(1)
3720      instr[22]    = float (0)/double(1)
3721      instr[21]    = 1
3722      instr[20,16] = Vm
3723      instr[15,10] = 110101
3724      instr[9,5]   = Vn
3725      instr[4,0]   = Vd.  */
3726 
3727   unsigned vm = INSTR (20, 16);
3728   unsigned vn = INSTR (9, 5);
3729   unsigned vd = INSTR (4, 0);
3730   unsigned i;
3731   int      full = INSTR (30, 30);
3732 
3733   NYI_assert (29, 24, 0x0E);
3734   NYI_assert (21, 21, 1);
3735   NYI_assert (15, 10, 0x35);
3736 
3737   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3738   if (INSTR (23, 23))
3739     {
3740       if (INSTR (22, 22))
3741 	{
3742 	  if (! full)
3743 	    HALT_NYI;
3744 
3745 	  for (i = 0; i < 2; i++)
3746 	    aarch64_set_vec_double (cpu, vd, i,
3747 				    aarch64_get_vec_double (cpu, vn, i)
3748 				    - aarch64_get_vec_double (cpu, vm, i));
3749 	}
3750       else
3751 	{
3752 	  for (i = 0; i < (full ? 4 : 2); i++)
3753 	    aarch64_set_vec_float (cpu, vd, i,
3754 				   aarch64_get_vec_float (cpu, vn, i)
3755 				   - aarch64_get_vec_float (cpu, vm, i));
3756 	}
3757     }
3758   else
3759     {
3760       if (INSTR (22, 22))
3761 	{
3762 	  if (! full)
3763 	    HALT_NYI;
3764 
3765 	  for (i = 0; i < 2; i++)
3766 	    aarch64_set_vec_double (cpu, vd, i,
3767 				    aarch64_get_vec_double (cpu, vm, i)
3768 				    + aarch64_get_vec_double (cpu, vn, i));
3769 	}
3770       else
3771 	{
3772 	  for (i = 0; i < (full ? 4 : 2); i++)
3773 	    aarch64_set_vec_float (cpu, vd, i,
3774 				   aarch64_get_vec_float (cpu, vm, i)
3775 				   + aarch64_get_vec_float (cpu, vn, i));
3776 	}
3777     }
3778 }
3779 
3780 static void
3781 do_vec_add (sim_cpu *cpu)
3782 {
3783   /* instr[31]    = 0
3784      instr[30]    = full/half selector
3785      instr[29,24] = 001110
3786      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3787      instr[21]    = 1
3788      instr[20,16] = Vm
3789      instr[15,10] = 100001
3790      instr[9,5]   = Vn
3791      instr[4,0]   = Vd.  */
3792 
3793   unsigned vm = INSTR (20, 16);
3794   unsigned vn = INSTR (9, 5);
3795   unsigned vd = INSTR (4, 0);
3796   unsigned i;
3797   int      full = INSTR (30, 30);
3798 
3799   NYI_assert (29, 24, 0x0E);
3800   NYI_assert (21, 21, 1);
3801   NYI_assert (15, 10, 0x21);
3802 
3803   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3804   switch (INSTR (23, 22))
3805     {
3806     case 0:
3807       for (i = 0; i < (full ? 16 : 8); i++)
3808 	aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
3809 			    + aarch64_get_vec_u8 (cpu, vm, i));
3810       return;
3811 
3812     case 1:
3813       for (i = 0; i < (full ? 8 : 4); i++)
3814 	aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
3815 			     + aarch64_get_vec_u16 (cpu, vm, i));
3816       return;
3817 
3818     case 2:
3819       for (i = 0; i < (full ? 4 : 2); i++)
3820 	aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
3821 			     + aarch64_get_vec_u32 (cpu, vm, i));
3822       return;
3823 
3824     case 3:
3825       if (! full)
3826 	HALT_UNALLOC;
3827       aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vn, 0)
3828 			   + aarch64_get_vec_u64 (cpu, vm, 0));
3829       aarch64_set_vec_u64 (cpu, vd, 1,
3830 			   aarch64_get_vec_u64 (cpu, vn, 1)
3831 			   + aarch64_get_vec_u64 (cpu, vm, 1));
3832       return;
3833     }
3834 }
3835 
3836 static void
3837 do_vec_mul (sim_cpu *cpu)
3838 {
3839   /* instr[31]    = 0
3840      instr[30]    = full/half selector
3841      instr[29,24] = 00 1110
3842      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
3843      instr[21]    = 1
3844      instr[20,16] = Vm
3845      instr[15,10] = 10 0111
3846      instr[9,5]   = Vn
3847      instr[4,0]   = Vd.  */
3848 
3849   unsigned vm = INSTR (20, 16);
3850   unsigned vn = INSTR (9, 5);
3851   unsigned vd = INSTR (4, 0);
3852   unsigned i;
3853   int      full = INSTR (30, 30);
3854   int      bias = 0;
3855 
3856   NYI_assert (29, 24, 0x0E);
3857   NYI_assert (21, 21, 1);
3858   NYI_assert (15, 10, 0x27);
3859 
3860   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3861   switch (INSTR (23, 22))
3862     {
3863     case 0:
3864       DO_VEC_WIDENING_MUL (full ? 16 : 8, uint8_t, u8, u8);
3865       return;
3866 
3867     case 1:
3868       DO_VEC_WIDENING_MUL (full ? 8 : 4, uint16_t, u16, u16);
3869       return;
3870 
3871     case 2:
3872       DO_VEC_WIDENING_MUL (full ? 4 : 2, uint32_t, u32, u32);
3873       return;
3874 
3875     case 3:
3876       HALT_UNALLOC;
3877     }
3878 }
3879 
3880 static void
3881 do_vec_MLA (sim_cpu *cpu)
3882 {
3883   /* instr[31]    = 0
3884      instr[30]    = full/half selector
3885      instr[29,24] = 00 1110
3886      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
3887      instr[21]    = 1
3888      instr[20,16] = Vm
3889      instr[15,10] = 1001 01
3890      instr[9,5]   = Vn
3891      instr[4,0]   = Vd.  */
3892 
3893   unsigned vm = INSTR (20, 16);
3894   unsigned vn = INSTR (9, 5);
3895   unsigned vd = INSTR (4, 0);
3896   unsigned i;
3897   int      full = INSTR (30, 30);
3898 
3899   NYI_assert (29, 24, 0x0E);
3900   NYI_assert (21, 21, 1);
3901   NYI_assert (15, 10, 0x25);
3902 
3903   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3904   switch (INSTR (23, 22))
3905     {
3906     case 0:
3907       for (i = 0; i < (full ? 16 : 8); i++)
3908 	aarch64_set_vec_u8 (cpu, vd, i,
3909 			    aarch64_get_vec_u8 (cpu, vd, i)
3910 			    + (aarch64_get_vec_u8 (cpu, vn, i)
3911 			       * aarch64_get_vec_u8 (cpu, vm, i)));
3912       return;
3913 
3914     case 1:
3915       for (i = 0; i < (full ? 8 : 4); i++)
3916 	aarch64_set_vec_u16 (cpu, vd, i,
3917 			     aarch64_get_vec_u16 (cpu, vd, i)
3918 			     + (aarch64_get_vec_u16 (cpu, vn, i)
3919 				* aarch64_get_vec_u16 (cpu, vm, i)));
3920       return;
3921 
3922     case 2:
3923       for (i = 0; i < (full ? 4 : 2); i++)
3924 	aarch64_set_vec_u32 (cpu, vd, i,
3925 			     aarch64_get_vec_u32 (cpu, vd, i)
3926 			     + (aarch64_get_vec_u32 (cpu, vn, i)
3927 				* aarch64_get_vec_u32 (cpu, vm, i)));
3928       return;
3929 
3930     default:
3931       HALT_UNALLOC;
3932     }
3933 }
3934 
3935 static float
3936 fmaxnm (float a, float b)
3937 {
3938   if (! isnan (a))
3939     {
3940       if (! isnan (b))
3941 	return a > b ? a : b;
3942       return a;
3943     }
3944   else if (! isnan (b))
3945     return b;
3946   return a;
3947 }
3948 
3949 static float
3950 fminnm (float a, float b)
3951 {
3952   if (! isnan (a))
3953     {
3954       if (! isnan (b))
3955 	return a < b ? a : b;
3956       return a;
3957     }
3958   else if (! isnan (b))
3959     return b;
3960   return a;
3961 }
3962 
3963 static double
3964 dmaxnm (double a, double b)
3965 {
3966   if (! isnan (a))
3967     {
3968       if (! isnan (b))
3969 	return a > b ? a : b;
3970       return a;
3971     }
3972   else if (! isnan (b))
3973     return b;
3974   return a;
3975 }
3976 
3977 static double
3978 dminnm (double a, double b)
3979 {
3980   if (! isnan (a))
3981     {
3982       if (! isnan (b))
3983 	return a < b ? a : b;
3984       return a;
3985     }
3986   else if (! isnan (b))
3987     return b;
3988   return a;
3989 }
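
/* These *nm helpers implement the FMAXNM/FMINNM rule that a single
   NaN operand is ignored in favour of the numeric operand; only when
   both inputs are NaN is a NaN returned (the first operand, here).  */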
3990 
3991 static void
3992 do_vec_FminmaxNMP (sim_cpu *cpu)
3993 {
3994   /* instr [31]    = 0
3995      instr [30]    = half (0)/full (1)
3996      instr [29,24] = 10 1110
3997      instr [23]    = max(0)/min(1)
3998      instr [22]    = float (0)/double (1)
3999      instr [21]    = 1
4000      instr [20,16] = Vn
4001      instr [15,10] = 1100 01
4002      instr [9,5]   = Vm
4003      instr [4,0]   = Vd.  */
4004 
4005   unsigned vm = INSTR (20, 16);
4006   unsigned vn = INSTR (9, 5);
4007   unsigned vd = INSTR (4, 0);
4008   int      full = INSTR (30, 30);
4009 
4010   NYI_assert (29, 24, 0x2E);
4011   NYI_assert (21, 21, 1);
4012   NYI_assert (15, 10, 0x31);
4013 
4014   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4015   if (INSTR (22, 22))
4016     {
4017       double (* fn)(double, double) = INSTR (23, 23)
4018 	? dminnm : dmaxnm;
4019 
4020       if (! full)
4021 	HALT_NYI;
4022       aarch64_set_vec_double (cpu, vd, 0,
4023 			      fn (aarch64_get_vec_double (cpu, vn, 0),
4024 				  aarch64_get_vec_double (cpu, vn, 1)));
4025       aarch64_set_vec_double (cpu, vd, 1,
4026 			      fn (aarch64_get_vec_double (cpu, vm, 0),
4027 				  aarch64_get_vec_double (cpu, vm, 1)));
4028     }
4029   else
4030     {
4031       float (* fn)(float, float) = INSTR (23, 23)
4032 	? fminnm : fmaxnm;
4033 
4034       aarch64_set_vec_float (cpu, vd, 0,
4035 			     fn (aarch64_get_vec_float (cpu, vn, 0),
4036 				 aarch64_get_vec_float (cpu, vn, 1)));
4037       if (full)
4038 	aarch64_set_vec_float (cpu, vd, 1,
4039 			       fn (aarch64_get_vec_float (cpu, vn, 2),
4040 				   aarch64_get_vec_float (cpu, vn, 3)));
4041 
4042       aarch64_set_vec_float (cpu, vd, (full ? 2 : 1),
4043 			     fn (aarch64_get_vec_float (cpu, vm, 0),
4044 				 aarch64_get_vec_float (cpu, vm, 1)));
4045       if (full)
4046 	aarch64_set_vec_float (cpu, vd, 3,
4047 			       fn (aarch64_get_vec_float (cpu, vm, 2),
4048 				   aarch64_get_vec_float (cpu, vm, 3)));
4049     }
4050 }
4051 
4052 static void
4053 do_vec_AND (sim_cpu *cpu)
4054 {
4055   /* instr[31]    = 0
4056      instr[30]    = half (0)/full (1)
4057      instr[29,21] = 001110001
4058      instr[20,16] = Vm
4059      instr[15,10] = 000111
4060      instr[9,5]   = Vn
4061      instr[4,0]   = Vd.  */
4062 
4063   unsigned vm = INSTR (20, 16);
4064   unsigned vn = INSTR (9, 5);
4065   unsigned vd = INSTR (4, 0);
4066   unsigned i;
4067   int      full = INSTR (30, 30);
4068 
4069   NYI_assert (29, 21, 0x071);
4070   NYI_assert (15, 10, 0x07);
4071 
4072   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4073   for (i = 0; i < (full ? 4 : 2); i++)
4074     aarch64_set_vec_u32 (cpu, vd, i,
4075 			 aarch64_get_vec_u32 (cpu, vn, i)
4076 			 & aarch64_get_vec_u32 (cpu, vm, i));
4077 }
4078 
4079 static void
4080 do_vec_BSL (sim_cpu *cpu)
4081 {
4082   /* instr[31]    = 0
4083      instr[30]    = half (0)/full (1)
4084      instr[29,21] = 101110011
4085      instr[20,16] = Vm
4086      instr[15,10] = 000111
4087      instr[9,5]   = Vn
4088      instr[4,0]   = Vd.  */
4089 
4090   unsigned vm = INSTR (20, 16);
4091   unsigned vn = INSTR (9, 5);
4092   unsigned vd = INSTR (4, 0);
4093   unsigned i;
4094   int      full = INSTR (30, 30);
4095 
4096   NYI_assert (29, 21, 0x173);
4097   NYI_assert (15, 10, 0x07);
4098 
4099   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4100   for (i = 0; i < (full ? 16 : 8); i++)
4101     aarch64_set_vec_u8 (cpu, vd, i,
4102 			(    aarch64_get_vec_u8 (cpu, vd, i)
4103 			   & aarch64_get_vec_u8 (cpu, vn, i))
4104 			| ((~ aarch64_get_vec_u8 (cpu, vd, i))
4105 			   & aarch64_get_vec_u8 (cpu, vm, i)));
4106 }
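
/* BSL computes, bit by bit, (vd & vn) | (~vd & vm): wherever a
   destination bit is 1 the result bit comes from Vn, otherwise from
   Vm.  Worked example on one byte (not simulator code):

     sel = 0xF0, n = 0xAB, m = 0xCD
     (sel & n) | (~sel & m)  ==  0xA0 | 0x0D  ==  0xAD.  */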
4107 
4108 static void
4109 do_vec_EOR (sim_cpu *cpu)
4110 {
4111   /* instr[31]    = 0
4112      instr[30]    = half (0)/full (1)
4113      instr[29,21] = 10 1110 001
4114      instr[20,16] = Vm
4115      instr[15,10] = 000111
4116      instr[9,5]   = Vn
4117      instr[4,0]   = Vd.  */
4118 
4119   unsigned vm = INSTR (20, 16);
4120   unsigned vn = INSTR (9, 5);
4121   unsigned vd = INSTR (4, 0);
4122   unsigned i;
4123   int      full = INSTR (30, 30);
4124 
4125   NYI_assert (29, 21, 0x171);
4126   NYI_assert (15, 10, 0x07);
4127 
4128   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4129   for (i = 0; i < (full ? 4 : 2); i++)
4130     aarch64_set_vec_u32 (cpu, vd, i,
4131 			 aarch64_get_vec_u32 (cpu, vn, i)
4132 			 ^ aarch64_get_vec_u32 (cpu, vm, i));
4133 }
4134 
4135 static void
4136 do_vec_bit (sim_cpu *cpu)
4137 {
4138   /* instr[31]    = 0
4139      instr[30]    = half (0)/full (1)
4140      instr[29,23] = 10 1110 1
4141      instr[22]    = BIT (0) / BIF (1)
4142      instr[21]    = 1
4143      instr[20,16] = Vm
4144      instr[15,10] = 0001 11
4145      instr[9,5]   = Vn
4146      instr[4,0]   = Vd.  */
4147 
4148   unsigned vm = INSTR (20, 16);
4149   unsigned vn = INSTR (9, 5);
4150   unsigned vd = INSTR (4, 0);
4151   unsigned full = INSTR (30, 30);
4152   unsigned test_false = INSTR (22, 22);
4153   unsigned i;
4154 
4155   NYI_assert (29, 23, 0x5D);
4156   NYI_assert (21, 21, 1);
4157   NYI_assert (15, 10, 0x07);
4158 
4159   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4160   for (i = 0; i < (full ? 4 : 2); i++)
4161     {
4162       uint32_t vd_val = aarch64_get_vec_u32 (cpu, vd, i);
4163       uint32_t vn_val = aarch64_get_vec_u32 (cpu, vn, i);
4164       uint32_t vm_val = aarch64_get_vec_u32 (cpu, vm, i);
4165       if (test_false)
4166 	aarch64_set_vec_u32 (cpu, vd, i,
4167 			     (vd_val & vm_val) | (vn_val & ~vm_val));
4168       else
4169 	aarch64_set_vec_u32 (cpu, vd, i,
4170 			     (vd_val & ~vm_val) | (vn_val & vm_val));
4171     }
4172 }
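
/* BIT/BIF are the same bitwise select as BSL with the roles reversed:
   BIT (test_false == 0) copies Vn bits where the Vm bit is 1, BIF
   copies Vn bits where the Vm bit is 0, and Vd supplies the remaining
   bits in both cases.  */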
4173 
4174 static void
4175 do_vec_ORN (sim_cpu *cpu)
4176 {
4177   /* instr[31]    = 0
4178      instr[30]    = half (0)/full (1)
4179      instr[29,21] = 00 1110 111
4180      instr[20,16] = Vm
4181      instr[15,10] = 00 0111
4182      instr[9,5]   = Vn
4183      instr[4,0]   = Vd.  */
4184 
4185   unsigned vm = INSTR (20, 16);
4186   unsigned vn = INSTR (9, 5);
4187   unsigned vd = INSTR (4, 0);
4188   unsigned i;
4189   int      full = INSTR (30, 30);
4190 
4191   NYI_assert (29, 21, 0x077);
4192   NYI_assert (15, 10, 0x07);
4193 
4194   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4195   for (i = 0; i < (full ? 16 : 8); i++)
4196     aarch64_set_vec_u8 (cpu, vd, i,
4197 			aarch64_get_vec_u8 (cpu, vn, i)
4198 			| ~ aarch64_get_vec_u8 (cpu, vm, i));
4199 }
4200 
4201 static void
4202 do_vec_ORR (sim_cpu *cpu)
4203 {
4204   /* instr[31]    = 0
4205      instr[30]    = half (0)/full (1)
4206      instr[29,21] = 00 1110 101
4207      instr[20,16] = Vm
4208      instr[15,10] = 0001 11
4209      instr[9,5]   = Vn
4210      instr[4,0]   = Vd.  */
4211 
4212   unsigned vm = INSTR (20, 16);
4213   unsigned vn = INSTR (9, 5);
4214   unsigned vd = INSTR (4, 0);
4215   unsigned i;
4216   int      full = INSTR (30, 30);
4217 
4218   NYI_assert (29, 21, 0x075);
4219   NYI_assert (15, 10, 0x07);
4220 
4221   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4222   for (i = 0; i < (full ? 16 : 8); i++)
4223     aarch64_set_vec_u8 (cpu, vd, i,
4224 			aarch64_get_vec_u8 (cpu, vn, i)
4225 			| aarch64_get_vec_u8 (cpu, vm, i));
4226 }
4227 
4228 static void
4229 do_vec_BIC (sim_cpu *cpu)
4230 {
4231   /* instr[31]    = 0
4232      instr[30]    = half (0)/full (1)
4233      instr[29,21] = 00 1110 011
4234      instr[20,16] = Vm
4235      instr[15,10] = 00 0111
4236      instr[9,5]   = Vn
4237      instr[4,0]   = Vd.  */
4238 
4239   unsigned vm = INSTR (20, 16);
4240   unsigned vn = INSTR (9, 5);
4241   unsigned vd = INSTR (4, 0);
4242   unsigned i;
4243   int      full = INSTR (30, 30);
4244 
4245   NYI_assert (29, 21, 0x073);
4246   NYI_assert (15, 10, 0x07);
4247 
4248   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4249   for (i = 0; i < (full ? 16 : 8); i++)
4250     aarch64_set_vec_u8 (cpu, vd, i,
4251 			aarch64_get_vec_u8 (cpu, vn, i)
4252 			& ~ aarch64_get_vec_u8 (cpu, vm, i));
4253 }
4254 
4255 static void
4256 do_vec_XTN (sim_cpu *cpu)
4257 {
4258   /* instr[31]    = 0
4259      instr[30]    = first part (0)/ second part (1)
4260      instr[29,24] = 00 1110
4261      instr[23,22] = size: byte(00), half(01), word (10)
4262      instr[21,10] = 1000 0100 1010
4263      instr[9,5]   = Vs
4264      instr[4,0]   = Vd.  */
4265 
4266   unsigned vs = INSTR (9, 5);
4267   unsigned vd = INSTR (4, 0);
4268   unsigned bias = INSTR (30, 30);
4269   unsigned i;
4270 
4271   NYI_assert (29, 24, 0x0E);
4272   NYI_assert (21, 10, 0x84A);
4273 
4274   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4275   switch (INSTR (23, 22))
4276     {
4277     case 0:
4278       for (i = 0; i < 8; i++)
4279 	aarch64_set_vec_u8 (cpu, vd, i + (bias * 8),
4280 			    aarch64_get_vec_u16 (cpu, vs, i));
4281       return;
4282 
4283     case 1:
4284       for (i = 0; i < 4; i++)
4285 	aarch64_set_vec_u16 (cpu, vd, i + (bias * 4),
4286 			     aarch64_get_vec_u32 (cpu, vs, i));
4287       return;
4288 
4289     case 2:
4290       for (i = 0; i < 2; i++)
4291 	aarch64_set_vec_u32 (cpu, vd, i + (bias * 2),
4292 			     aarch64_get_vec_u64 (cpu, vs, i));
4293       return;
4294     }
4295 }
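
/* XTN truncates each wide source element to its lower half.  The XTN2
   form (bias == 1) writes into the upper half of Vd only, leaving the
   lower half of the destination intact.  */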
4296 
4297 /* Return the number of bits set in the input value.  */
4298 #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
4299 # define popcount __builtin_popcount
4300 #else
4301 static int
4302 popcount (unsigned char x)
4303 {
4304   static const unsigned char popcnt[16] =
4305     {
4306       0, 1, 1, 2,
4307       1, 2, 2, 3,
4308       1, 2, 2, 3,
4309       2, 3, 3, 4
4310     };
4311 
4312   /* Only counts the low 8 bits of the input as that is all we need.  */
4313   return popcnt[x % 16] + popcnt[x / 16];
4314 }
4315 #endif
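
/* Sanity example for the table fallback: popcount (0xA5) == 4, via
   the two nibble lookups popcnt[0x5] + popcnt[0xA] == 2 + 2.  */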
4316 
4317 static void
4318 do_vec_CNT (sim_cpu *cpu)
4319 {
4320   /* instr[31]    = 0
4321      instr[30]    = half (0)/ full (1)
4322      instr[29,24] = 00 1110
4323      instr[23,22] = size: byte(00)
4324      instr[21,10] = 1000 0001 0110
4325      instr[9,5]   = Vs
4326      instr[4,0]   = Vd.  */
4327 
4328   unsigned vs = INSTR (9, 5);
4329   unsigned vd = INSTR (4, 0);
4330   int full = INSTR (30, 30);
4331   int size = INSTR (23, 22);
4332   int i;
4333 
4334   NYI_assert (29, 24, 0x0E);
4335   NYI_assert (21, 10, 0x816);
4336 
4337   if (size != 0)
4338     HALT_UNALLOC;
4339 
4340   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4341 
4342   for (i = 0; i < (full ? 16 : 8); i++)
4343     aarch64_set_vec_u8 (cpu, vd, i,
4344 			popcount (aarch64_get_vec_u8 (cpu, vs, i)));
4345 }
4346 
4347 static void
4348 do_vec_maxv (sim_cpu *cpu)
4349 {
4350   /* instr[31]    = 0
4351      instr[30]    = half(0)/full(1)
4352      instr[29]    = signed (0)/unsigned(1)
4353      instr[28,24] = 0 1110
4354      instr[23,22] = size: byte(00), half(01), word (10)
4355      instr[21]    = 1
4356      instr[20,17] = 1 000
4357      instr[16]    = max(0)/min(1)
4358      instr[15,10] = 1010 10
4359      instr[9,5]   = V source
4360      instr[4,0]   = R dest.  */
4361 
4362   unsigned vs = INSTR (9, 5);
4363   unsigned rd = INSTR (4, 0);
4364   unsigned full = INSTR (30, 30);
4365   unsigned i;
4366 
4367   NYI_assert (28, 24, 0x0E);
4368   NYI_assert (21, 21, 1);
4369   NYI_assert (20, 17, 8);
4370   NYI_assert (15, 10, 0x2A);
4371 
4372   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4373   switch ((INSTR (29, 29) << 1) | INSTR (16, 16))
4374     {
4375     case 0: /* SMAXV.  */
4376        {
4377 	int64_t smax;
4378 	switch (INSTR (23, 22))
4379 	  {
4380 	  case 0:
4381 	    smax = aarch64_get_vec_s8 (cpu, vs, 0);
4382 	    for (i = 1; i < (full ? 16 : 8); i++)
4383 	      smax = max (smax, aarch64_get_vec_s8 (cpu, vs, i));
4384 	    break;
4385 	  case 1:
4386 	    smax = aarch64_get_vec_s16 (cpu, vs, 0);
4387 	    for (i = 1; i < (full ? 8 : 4); i++)
4388 	      smax = max (smax, aarch64_get_vec_s16 (cpu, vs, i));
4389 	    break;
4390 	  case 2:
4391 	    smax = aarch64_get_vec_s32 (cpu, vs, 0);
4392 	    for (i = 1; i < (full ? 4 : 2); i++)
4393 	      smax = max (smax, aarch64_get_vec_s32 (cpu, vs, i));
4394 	    break;
4395 	  case 3:
4396 	    HALT_UNALLOC;
4397 	  }
4398 	aarch64_set_reg_s64 (cpu, rd, NO_SP, smax);
4399 	return;
4400       }
4401 
4402     case 1: /* SMINV.  */
4403       {
4404 	int64_t smin;
4405 	switch (INSTR (23, 22))
4406 	  {
4407 	  case 0:
4408 	    smin = aarch64_get_vec_s8 (cpu, vs, 0);
4409 	    for (i = 1; i < (full ? 16 : 8); i++)
4410 	      smin = min (smin, aarch64_get_vec_s8 (cpu, vs, i));
4411 	    break;
4412 	  case 1:
4413 	    smin = aarch64_get_vec_s16 (cpu, vs, 0);
4414 	    for (i = 1; i < (full ? 8 : 4); i++)
4415 	      smin = min (smin, aarch64_get_vec_s16 (cpu, vs, i));
4416 	    break;
4417 	  case 2:
4418 	    smin = aarch64_get_vec_s32 (cpu, vs, 0);
4419 	    for (i = 1; i < (full ? 4 : 2); i++)
4420 	      smin = min (smin, aarch64_get_vec_s32 (cpu, vs, i));
4421 	    break;
4422 
4423 	  case 3:
4424 	    HALT_UNALLOC;
4425 	  }
4426 	aarch64_set_reg_s64 (cpu, rd, NO_SP, smin);
4427 	return;
4428       }
4429 
4430     case 2: /* UMAXV.  */
4431       {
4432 	uint64_t umax;
4433 	switch (INSTR (23, 22))
4434 	  {
4435 	  case 0:
4436 	    umax = aarch64_get_vec_u8 (cpu, vs, 0);
4437 	    for (i = 1; i < (full ? 16 : 8); i++)
4438 	      umax = max (umax, aarch64_get_vec_u8 (cpu, vs, i));
4439 	    break;
4440 	  case 1:
4441 	    umax = aarch64_get_vec_u16 (cpu, vs, 0);
4442 	    for (i = 1; i < (full ? 8 : 4); i++)
4443 	      umax = max (umax, aarch64_get_vec_u16 (cpu, vs, i));
4444 	    break;
4445 	  case 2:
4446 	    umax = aarch64_get_vec_u32 (cpu, vs, 0);
4447 	    for (i = 1; i < (full ? 4 : 2); i++)
4448 	      umax = max (umax, aarch64_get_vec_u32 (cpu, vs, i));
4449 	    break;
4450 
4451 	  case 3:
4452 	    HALT_UNALLOC;
4453 	  }
4454 	aarch64_set_reg_u64 (cpu, rd, NO_SP, umax);
4455 	return;
4456       }
4457 
4458     case 3: /* UMINV.  */
4459       {
4460 	uint64_t umin;
4461 	switch (INSTR (23, 22))
4462 	  {
4463 	  case 0:
4464 	    umin = aarch64_get_vec_u8 (cpu, vs, 0);
4465 	    for (i = 1; i < (full ? 16 : 8); i++)
4466 	      umin = min (umin, aarch64_get_vec_u8 (cpu, vs, i));
4467 	    break;
4468 	  case 1:
4469 	    umin = aarch64_get_vec_u16 (cpu, vs, 0);
4470 	    for (i = 1; i < (full ? 8 : 4); i++)
4471 	      umin = min (umin, aarch64_get_vec_u16 (cpu, vs, i));
4472 	    break;
4473 	  case 2:
4474 	    umin = aarch64_get_vec_u32 (cpu, vs, 0);
4475 	    for (i = 1; i < (full ? 4 : 2); i++)
4476 	      umin = min (umin, aarch64_get_vec_u32 (cpu, vs, i));
4477 	    break;
4478 
4479 	  case 3:
4480 	    HALT_UNALLOC;
4481 	  }
4482 	aarch64_set_reg_u64 (cpu, rd, NO_SP, umin);
4483 	return;
4484       }
4485     }
4486 }
4487 
4488 static void
4489 do_vec_fminmaxV (sim_cpu *cpu)
4490 {
4491   /* instr[31,24] = 0110 1110
4492      instr[23]    = max(0)/min(1)
4493      instr[22,14] = 011 0000 11
4494      instr[13,12] = nm(00)/normal(11)
4495      instr[11,10] = 10
4496      instr[9,5]   = V source
4497      instr[4,0]   = R dest.  */
4498 
4499   unsigned vs = INSTR (9, 5);
4500   unsigned rd = INSTR (4, 0);
4501   unsigned i;
4502   float res   = aarch64_get_vec_float (cpu, vs, 0);
4503 
4504   NYI_assert (31, 24, 0x6E);
4505   NYI_assert (22, 14, 0x0C3);
4506   NYI_assert (11, 10, 2);
4507 
4508   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4509   if (INSTR (23, 23))
4510     {
4511       switch (INSTR (13, 12))
4512 	{
4513 	case 0: /* FMINNMV.  */
4514 	  for (i = 1; i < 4; i++)
4515 	    res = fminnm (res, aarch64_get_vec_float (cpu, vs, i));
4516 	  break;
4517 
4518 	case 3: /* FMINV.  */
4519 	  for (i = 1; i < 4; i++)
4520 	    res = min (res, aarch64_get_vec_float (cpu, vs, i));
4521 	  break;
4522 
4523 	default:
4524 	  HALT_NYI;
4525 	}
4526     }
4527   else
4528     {
4529       switch (INSTR (13, 12))
4530 	{
4531 	case 0: /* FMAXNMV.  */
4532 	  for (i = 1; i < 4; i++)
4533 	    res = fmaxnm (res, aarch64_get_vec_float (cpu, vs, i));
4534 	  break;
4535 
4536 	case 3: /* FMAXV.  */
4537 	  for (i = 1; i < 4; i++)
4538 	    res = max (res, aarch64_get_vec_float (cpu, vs, i));
4539 	  break;
4540 
4541 	default:
4542 	  HALT_NYI;
4543 	}
4544     }
4545 
4546   aarch64_set_FP_float (cpu, rd, res);
4547 }
4548 
4549 static void
4550 do_vec_Fminmax (sim_cpu *cpu)
4551 {
4552   /* instr[31]    = 0
4553      instr[30]    = half(0)/full(1)
4554      instr[29,24] = 00 1110
4555      instr[23]    = max(0)/min(1)
4556      instr[22]    = float(0)/double(1)
4557      instr[21]    = 1
4558      instr[20,16] = Vm
4559      instr[15,14] = 11
4560      instr[13,12] = nm(00)/normal(11)
4561      instr[11,10] = 01
4562      instr[9,5]   = Vn
4563      instr[4,0]   = Vd.  */
4564 
4565   unsigned vm = INSTR (20, 16);
4566   unsigned vn = INSTR (9, 5);
4567   unsigned vd = INSTR (4, 0);
4568   unsigned full = INSTR (30, 30);
4569   unsigned min = INSTR (23, 23);
4570   unsigned i;
4571 
4572   NYI_assert (29, 24, 0x0E);
4573   NYI_assert (21, 21, 1);
4574   NYI_assert (15, 14, 3);
4575   NYI_assert (11, 10, 1);
4576 
4577   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4578   if (INSTR (22, 22))
4579     {
4580       double (* func)(double, double);
4581 
4582       if (! full)
4583 	HALT_NYI;
4584 
4585       if (INSTR (13, 12) == 0)
4586 	func = min ? dminnm : dmaxnm;
4587       else if (INSTR (13, 12) == 3)
4588 	func = min ? fmin : fmax;
4589       else
4590 	HALT_NYI;
4591 
4592       for (i = 0; i < 2; i++)
4593 	aarch64_set_vec_double (cpu, vd, i,
4594 				func (aarch64_get_vec_double (cpu, vn, i),
4595 				      aarch64_get_vec_double (cpu, vm, i)));
4596     }
4597   else
4598     {
4599       float (* func)(float, float);
4600 
4601       if (INSTR (13, 12) == 0)
4602 	func = min ? fminnm : fmaxnm;
4603       else if (INSTR (13, 12) == 3)
4604 	func = min ? fminf : fmaxf;
4605       else
4606 	HALT_NYI;
4607 
4608       for (i = 0; i < (full ? 4 : 2); i++)
4609 	aarch64_set_vec_float (cpu, vd, i,
4610 			       func (aarch64_get_vec_float (cpu, vn, i),
4611 				     aarch64_get_vec_float (cpu, vm, i)));
4612     }
4613 }
4614 
4615 static void
4616 do_vec_SCVTF (sim_cpu *cpu)
4617 {
4618   /* instr[31]    = 0
4619      instr[30]    = Q
4620      instr[29,23] = 00 1110 0
4621      instr[22]    = float(0)/double(1)
4622      instr[21,10] = 10 0001 1101 10
4623      instr[9,5]   = Vn
4624      instr[4,0]   = Vd.  */
4625 
4626   unsigned vn = INSTR (9, 5);
4627   unsigned vd = INSTR (4, 0);
4628   unsigned full = INSTR (30, 30);
4629   unsigned size = INSTR (22, 22);
4630   unsigned i;
4631 
4632   NYI_assert (29, 23, 0x1C);
4633   NYI_assert (21, 10, 0x876);
4634 
4635   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4636   if (size)
4637     {
4638       if (! full)
4639 	HALT_UNALLOC;
4640 
4641       for (i = 0; i < 2; i++)
4642 	{
4643 	  double val = (double) aarch64_get_vec_s64 (cpu, vn, i);
4644 	  aarch64_set_vec_double (cpu, vd, i, val);
4645 	}
4646     }
4647   else
4648     {
4649       for (i = 0; i < (full ? 4 : 2); i++)
4650 	{
4651 	  float val = (float) aarch64_get_vec_s32 (cpu, vn, i);
4652 	  aarch64_set_vec_float (cpu, vd, i, val);
4653 	}
4654     }
4655 }
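
/* SCVTF converts signed integer elements, hence the signed getters
   above: a 64-bit lane holding -2 becomes -2.0, where an unsigned
   read would have produced roughly 1.8e19.  */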
4656 
4657 #define VEC_CMP(SOURCE, CMP)						\
4658   do									\
4659     {									\
4660       switch (size)							\
4661 	{								\
4662 	case 0:								\
4663 	  for (i = 0; i < (full ? 16 : 8); i++)				\
4664 	    aarch64_set_vec_u8 (cpu, vd, i,				\
4665 				aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
4666 				CMP					\
4667 				aarch64_get_vec_##SOURCE##8 (cpu, vm, i) \
4668 				? -1 : 0);				\
4669 	  return;							\
4670 	case 1:								\
4671 	  for (i = 0; i < (full ? 8 : 4); i++)				\
4672 	    aarch64_set_vec_u16 (cpu, vd, i,				\
4673 				 aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
4674 				 CMP					\
4675 				 aarch64_get_vec_##SOURCE##16 (cpu, vm, i) \
4676 				 ? -1 : 0);				\
4677 	  return;							\
4678 	case 2:								\
4679 	  for (i = 0; i < (full ? 4 : 2); i++)				\
4680 	    aarch64_set_vec_u32 (cpu, vd, i, \
4681 				 aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
4682 				 CMP					\
4683 				 aarch64_get_vec_##SOURCE##32 (cpu, vm, i) \
4684 				 ? -1 : 0);				\
4685 	  return;							\
4686 	case 3:								\
4687 	  if (! full)							\
4688 	    HALT_UNALLOC;						\
4689 	  for (i = 0; i < 2; i++)					\
4690 	    aarch64_set_vec_u64 (cpu, vd, i, \
4691 				 aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
4692 				 CMP					\
4693 				 aarch64_get_vec_##SOURCE##64 (cpu, vm, i) \
4694 				 ? -1ULL : 0);				\
4695 	  return;							\
4696 	default:							\
4697 	  HALT_UNALLOC;							\
4698 	}								\
4699     }									\
4700   while (0)
4701 
4702 #define VEC_CMP0(SOURCE, CMP)						\
4703   do									\
4704     {									\
4705       switch (size)							\
4706 	{								\
4707 	case 0:								\
4708 	  for (i = 0; i < (full ? 16 : 8); i++)				\
4709 	    aarch64_set_vec_u8 (cpu, vd, i,				\
4710 				aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
4711 				CMP 0 ? -1 : 0);			\
4712 	  return;							\
4713 	case 1:								\
4714 	  for (i = 0; i < (full ? 8 : 4); i++)				\
4715 	    aarch64_set_vec_u16 (cpu, vd, i,				\
4716 				 aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
4717 				 CMP 0 ? -1 : 0);			\
4718 	  return;							\
4719 	case 2:								\
4720 	  for (i = 0; i < (full ? 4 : 2); i++)				\
4721 	    aarch64_set_vec_u32 (cpu, vd, i,				\
4722 				 aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
4723 				 CMP 0 ? -1 : 0);			\
4724 	  return;							\
4725 	case 3:								\
4726 	  if (! full)							\
4727 	    HALT_UNALLOC;						\
4728 	  for (i = 0; i < 2; i++)					\
4729 	    aarch64_set_vec_u64 (cpu, vd, i,				\
4730 				 aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
4731 				 CMP 0 ? -1ULL : 0);			\
4732 	  return;							\
4733 	default:							\
4734 	  HALT_UNALLOC;							\
4735 	}								\
4736     }									\
4737   while (0)
4738 
4739 #define VEC_FCMP0(CMP)							\
4740   do									\
4741     {									\
4742       if (vm != 0)							\
4743 	HALT_NYI;							\
4744       if (INSTR (22, 22))						\
4745 	{								\
4746 	  if (! full)							\
4747 	    HALT_NYI;							\
4748 	  for (i = 0; i < 2; i++)					\
4749 	    aarch64_set_vec_u64 (cpu, vd, i,				\
4750 				 aarch64_get_vec_double (cpu, vn, i)	\
4751 				 CMP 0.0 ? -1 : 0);			\
4752 	}								\
4753       else								\
4754 	{								\
4755 	  for (i = 0; i < (full ? 4 : 2); i++)				\
4756 	    aarch64_set_vec_u32 (cpu, vd, i,				\
4757 				 aarch64_get_vec_float (cpu, vn, i)	\
4758 				 CMP 0.0 ? -1 : 0);			\
4759 	}								\
4760       return;								\
4761     }									\
4762   while (0)
4763 
4764 #define VEC_FCMP(CMP)							\
4765   do									\
4766     {									\
4767       if (INSTR (22, 22))						\
4768 	{								\
4769 	  if (! full)							\
4770 	    HALT_NYI;							\
4771 	  for (i = 0; i < 2; i++)					\
4772 	    aarch64_set_vec_u64 (cpu, vd, i,				\
4773 				 aarch64_get_vec_double (cpu, vn, i)	\
4774 				 CMP					\
4775 				 aarch64_get_vec_double (cpu, vm, i)	\
4776 				 ? -1 : 0);				\
4777 	}								\
4778       else								\
4779 	{								\
4780 	  for (i = 0; i < (full ? 4 : 2); i++)				\
4781 	    aarch64_set_vec_u32 (cpu, vd, i,				\
4782 				 aarch64_get_vec_float (cpu, vn, i)	\
4783 				 CMP					\
4784 				 aarch64_get_vec_float (cpu, vm, i)	\
4785 				 ? -1 : 0);				\
4786 	}								\
4787       return;								\
4788     }									\
4789   while (0)
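
/* All of the VEC_CMP*/VEC_FCMP* macros above write an all-ones
   element (-1 in the element's width) for a lane where the comparison
   holds and zero otherwise, which is how AArch64 vector compares
   encode per-lane true/false results.  */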
4790 
4791 static void
4792 do_vec_compare (sim_cpu *cpu)
4793 {
4794   /* instr[31]    = 0
4795      instr[30]    = half(0)/full(1)
4796      instr[29]    = part-of-comparison-type
4797      instr[28,24] = 0 1110
4798      instr[23,22] = size of integer compares: byte(00), half(01), word (10), long (11)
4799                     type of float compares: single (-0) / double (-1)
4800      instr[21]    = 1
4801      instr[20,16] = Vm or 00000 (compare vs 0)
4802      instr[15,10] = part-of-comparison-type
4803      instr[9,5]   = Vn
4804      instr[4,0]   = Vd.  */
4805 
4806   int full = INSTR (30, 30);
4807   int size = INSTR (23, 22);
4808   unsigned vm = INSTR (20, 16);
4809   unsigned vn = INSTR (9, 5);
4810   unsigned vd = INSTR (4, 0);
4811   unsigned i;
4812 
4813   NYI_assert (28, 24, 0x0E);
4814   NYI_assert (21, 21, 1);
4815 
4816   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4817   if ((INSTR (11, 11)
4818        && INSTR (14, 14))
4819       || ((INSTR (11, 11) == 0
4820 	   && INSTR (10, 10) == 0)))
4821     {
4822       /* A compare vs 0.  */
4823       if (vm != 0)
4824 	{
4825 	  if (INSTR (15, 10) == 0x2A)
4826 	    do_vec_maxv (cpu);
4827 	  else if (INSTR (15, 10) == 0x32
4828 		   || INSTR (15, 10) == 0x3E)
4829 	    do_vec_fminmaxV (cpu);
4830 	  else if (INSTR (29, 23) == 0x1C
4831 		   && INSTR (21, 10) == 0x876)
4832 	    do_vec_SCVTF (cpu);
4833 	  else
4834 	    HALT_NYI;
4835 	  return;
4836 	}
4837     }
4838 
4839   if (INSTR (14, 14))
4840     {
4841       /* A floating point compare.  */
4842       unsigned decode = (INSTR (29, 29) << 5) | (INSTR (23, 23) << 4)
4843 	| INSTR (13, 10);
4844 
4845       NYI_assert (15, 15, 1);
4846 
4847       switch (decode)
4848 	{
4849 	case /* 0b010010: GT#0 */ 0x12: VEC_FCMP0 (>);
4850 	case /* 0b110010: GE#0 */ 0x32: VEC_FCMP0 (>=);
4851 	case /* 0b010110: EQ#0 */ 0x16: VEC_FCMP0 (==);
4852 	case /* 0b110110: LE#0 */ 0x36: VEC_FCMP0 (<=);
4853 	case /* 0b011010: LT#0 */ 0x1A: VEC_FCMP0 (<);
4854 	case /* 0b111001: GT */   0x39: VEC_FCMP  (>);
4855 	case /* 0b101001: GE */   0x29: VEC_FCMP  (>=);
4856 	case /* 0b001001: EQ */   0x09: VEC_FCMP  (==);
4857 
4858 	default:
4859 	  HALT_NYI;
4860 	}
4861     }
4862   else
4863     {
4864       unsigned decode = (INSTR (29, 29) << 6) | INSTR (15, 10);
4865 
4866       switch (decode)
4867 	{
4868 	case 0x0D: /* 0001101 GT */     VEC_CMP  (s, > );
4869 	case 0x0F: /* 0001111 GE */     VEC_CMP  (s, >= );
4870 	case 0x22: /* 0100010 GT #0 */  VEC_CMP0 (s, > );
4871 	case 0x23: /* 0100011 TST */	VEC_CMP  (u, & );
4872 	case 0x26: /* 0100110 EQ #0 */  VEC_CMP0 (s, == );
4873 	case 0x2A: /* 0101010 LT #0 */  VEC_CMP0 (s, < );
4874 	case 0x4D: /* 1001101 HI */     VEC_CMP  (u, > );
4875 	case 0x4F: /* 1001111 HS */     VEC_CMP  (u, >= );
4876 	case 0x62: /* 1100010 GE #0 */  VEC_CMP0 (s, >= );
4877 	case 0x63: /* 1100011 EQ */     VEC_CMP  (u, == );
4878 	case 0x66: /* 1100110 LE #0 */  VEC_CMP0 (s, <= );
4879 	default:
4880 	  if (vm == 0)
4881 	    HALT_NYI;
4882 	  do_vec_maxv (cpu);
4883 	}
4884     }
4885 }
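
/* Decode example: for CMGT (register) the U bit is 0 and instr[15,10]
   is 0b001101, so the integer switch above sees (0 << 6) | 0x0D ==
   0x0D and dispatches to the signed '>' compare.  */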
4886 
4887 static void
4888 do_vec_SSHL (sim_cpu *cpu)
4889 {
4890   /* instr[31]    = 0
4891      instr[30]    = first part (0)/ second part (1)
4892      instr[29,24] = 00 1110
4893      instr[23,22] = size: byte(00), half(01), word (10), long (11)
4894      instr[21]    = 1
4895      instr[20,16] = Vm
4896      instr[15,10] = 0100 01
4897      instr[9,5]   = Vn
4898      instr[4,0]   = Vd.  */
4899 
4900   unsigned full = INSTR (30, 30);
4901   unsigned vm = INSTR (20, 16);
4902   unsigned vn = INSTR (9, 5);
4903   unsigned vd = INSTR (4, 0);
4904   unsigned i;
4905   signed int shift;
4906 
4907   NYI_assert (29, 24, 0x0E);
4908   NYI_assert (21, 21, 1);
4909   NYI_assert (15, 10, 0x11);
4910 
4911   /* FIXME: What is a signed shift left in this context?  */
4912 
4913   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4914   switch (INSTR (23, 22))
4915     {
4916     case 0:
4917       for (i = 0; i < (full ? 16 : 8); i++)
4918 	{
4919 	  shift = aarch64_get_vec_s8 (cpu, vm, i);
4920 	  if (shift >= 0)
4921 	    aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i)
4922 				<< shift);
4923 	  else
4924 	    aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i)
4925 				>> - shift);
4926 	}
4927       return;
4928 
4929     case 1:
4930       for (i = 0; i < (full ? 8 : 4); i++)
4931 	{
4932 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
4933 	  if (shift >= 0)
4934 	    aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i)
4935 				 << shift);
4936 	  else
4937 	    aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i)
4938 				 >> - shift);
4939 	}
4940       return;
4941 
4942     case 2:
4943       for (i = 0; i < (full ? 4 : 2); i++)
4944 	{
4945 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
4946 	  if (shift >= 0)
4947 	    aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i)
4948 				 << shift);
4949 	  else
4950 	    aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i)
4951 				 >> - shift);
4952 	}
4953       return;
4954 
4955     case 3:
4956       if (! full)
4957 	HALT_UNALLOC;
4958       for (i = 0; i < 2; i++)
4959 	{
4960 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
4961 	  if (shift >= 0)
4962 	    aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i)
4963 				 << shift);
4964 	  else
4965 	    aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i)
4966 				 >> - shift);
4967 	}
4968       return;
4969     }
4970 }
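
/* Note on the FIXME above: SSHL shifts each signed element of Vn left
   by the signed per-element amount in Vm, and a negative amount
   shifts right instead.  Because the operands are signed, that right
   shift should be arithmetic (sign-filling), which the >> on signed
   values above assumes the host compiler provides.  */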
4971 
4972 static void
4973 do_vec_USHL (sim_cpu *cpu)
4974 {
4975   /* instr[31]    = 0
4976      instr[30]    = first part (0)/ second part (1)
4977      instr[29,24] = 10 1110
4978      instr[23,22] = size: byte(00), half(01), word (10), long (11)
4979      instr[21]    = 1
4980      instr[20,16] = Vm
4981      instr[15,10] = 0100 01
4982      instr[9,5]   = Vn
4983      instr[4,0]   = Vd  */
4984 
4985   unsigned full = INSTR (30, 30);
4986   unsigned vm = INSTR (20, 16);
4987   unsigned vn = INSTR (9, 5);
4988   unsigned vd = INSTR (4, 0);
4989   unsigned i;
4990   signed int shift;
4991 
4992   NYI_assert (29, 24, 0x2E);
4993   NYI_assert (15, 10, 0x11);
4994 
4995   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4996   switch (INSTR (23, 22))
4997     {
4998     case 0:
4999       for (i = 0; i < (full ? 16 : 8); i++)
5000 	{
5001 	  shift = aarch64_get_vec_s8 (cpu, vm, i);
5002 	  if (shift >= 0)
5003 	    aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
5004 				<< shift);
5005 	  else
5006 	    aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
5007 				>> - shift);
5008 	}
5009       return;
5010 
5011     case 1:
5012       for (i = 0; i < (full ? 8 : 4); i++)
5013 	{
5014 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
5015 	  if (shift >= 0)
5016 	    aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
5017 				 << shift);
5018 	  else
5019 	    aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
5020 				 >> - shift);
5021 	}
5022       return;
5023 
5024     case 2:
5025       for (i = 0; i < (full ? 4 : 2); i++)
5026 	{
5027 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
5028 	  if (shift >= 0)
5029 	    aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
5030 				 << shift);
5031 	  else
5032 	    aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
5033 				 >> - shift);
5034 	}
5035       return;
5036 
5037     case 3:
5038       if (! full)
5039 	HALT_UNALLOC;
5040       for (i = 0; i < 2; i++)
5041 	{
5042 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
5043 	  if (shift >= 0)
5044 	    aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i)
5045 				 << shift);
5046 	  else
5047 	    aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i)
5048 				 >> - shift);
5049 	}
5050       return;
5051     }
5052 }
5053 
5054 static void
5055 do_vec_FMLA (sim_cpu *cpu)
5056 {
5057   /* instr[31]    = 0
5058      instr[30]    = full/half selector
5059      instr[29,23] = 0011100
5060      instr[22]    = size: 0=>float, 1=>double
5061      instr[21]    = 1
5062      instr[20,16] = Vm
5063      instr[15,10] = 1100 11
5064      instr[9,5]   = Vn
5065      instr[4,0]   = Vd.  */
5066 
5067   unsigned vm = INSTR (20, 16);
5068   unsigned vn = INSTR (9, 5);
5069   unsigned vd = INSTR (4, 0);
5070   unsigned i;
5071   int      full = INSTR (30, 30);
5072 
5073   NYI_assert (29, 23, 0x1C);
5074   NYI_assert (21, 21, 1);
5075   NYI_assert (15, 10, 0x33);
5076 
5077   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5078   if (INSTR (22, 22))
5079     {
5080       if (! full)
5081 	HALT_UNALLOC;
5082       for (i = 0; i < 2; i++)
5083 	aarch64_set_vec_double (cpu, vd, i,
5084 				aarch64_get_vec_double (cpu, vn, i) *
5085 				aarch64_get_vec_double (cpu, vm, i) +
5086 				aarch64_get_vec_double (cpu, vd, i));
5087     }
5088   else
5089     {
5090       for (i = 0; i < (full ? 4 : 2); i++)
5091 	aarch64_set_vec_float (cpu, vd, i,
5092 			       aarch64_get_vec_float (cpu, vn, i) *
5093 			       aarch64_get_vec_float (cpu, vm, i) +
5094 			       aarch64_get_vec_float (cpu, vd, i));
5095     }
5096 }
5097 
5098 static void
5099 do_vec_max (sim_cpu *cpu)
5100 {
5101   /* instr[31]    = 0
5102      instr[30]    = full/half selector
5103      instr[29]    = SMAX (0) / UMAX (1)
5104      instr[28,24] = 0 1110
5105      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
5106      instr[21]    = 1
5107      instr[20,16] = Vm
5108      instr[15,10] = 0110 01
5109      instr[9,5]   = Vn
5110      instr[4,0]   = Vd.  */
5111 
5112   unsigned vm = INSTR (20, 16);
5113   unsigned vn = INSTR (9, 5);
5114   unsigned vd = INSTR (4, 0);
5115   unsigned i;
5116   int      full = INSTR (30, 30);
5117 
5118   NYI_assert (28, 24, 0x0E);
5119   NYI_assert (21, 21, 1);
5120   NYI_assert (15, 10, 0x19);
5121 
5122   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5123   if (INSTR (29, 29))
5124     {
5125       switch (INSTR (23, 22))
5126 	{
5127 	case 0:
5128 	  for (i = 0; i < (full ? 16 : 8); i++)
5129 	    aarch64_set_vec_u8 (cpu, vd, i,
5130 				aarch64_get_vec_u8 (cpu, vn, i)
5131 				> aarch64_get_vec_u8 (cpu, vm, i)
5132 				? aarch64_get_vec_u8 (cpu, vn, i)
5133 				: aarch64_get_vec_u8 (cpu, vm, i));
5134 	  return;
5135 
5136 	case 1:
5137 	  for (i = 0; i < (full ? 8 : 4); i++)
5138 	    aarch64_set_vec_u16 (cpu, vd, i,
5139 				 aarch64_get_vec_u16 (cpu, vn, i)
5140 				 > aarch64_get_vec_u16 (cpu, vm, i)
5141 				 ? aarch64_get_vec_u16 (cpu, vn, i)
5142 				 : aarch64_get_vec_u16 (cpu, vm, i));
5143 	  return;
5144 
5145 	case 2:
5146 	  for (i = 0; i < (full ? 4 : 2); i++)
5147 	    aarch64_set_vec_u32 (cpu, vd, i,
5148 				 aarch64_get_vec_u32 (cpu, vn, i)
5149 				 > aarch64_get_vec_u32 (cpu, vm, i)
5150 				 ? aarch64_get_vec_u32 (cpu, vn, i)
5151 				 : aarch64_get_vec_u32 (cpu, vm, i));
5152 	  return;
5153 
5154 	case 3:
5155 	  HALT_UNALLOC;
5156 	}
5157     }
5158   else
5159     {
5160       switch (INSTR (23, 22))
5161 	{
5162 	case 0:
5163 	  for (i = 0; i < (full ? 16 : 8); i++)
5164 	    aarch64_set_vec_s8 (cpu, vd, i,
5165 				aarch64_get_vec_s8 (cpu, vn, i)
5166 				> aarch64_get_vec_s8 (cpu, vm, i)
5167 				? aarch64_get_vec_s8 (cpu, vn, i)
5168 				: aarch64_get_vec_s8 (cpu, vm, i));
5169 	  return;
5170 
5171 	case 1:
5172 	  for (i = 0; i < (full ? 8 : 4); i++)
5173 	    aarch64_set_vec_s16 (cpu, vd, i,
5174 				 aarch64_get_vec_s16 (cpu, vn, i)
5175 				 > aarch64_get_vec_s16 (cpu, vm, i)
5176 				 ? aarch64_get_vec_s16 (cpu, vn, i)
5177 				 : aarch64_get_vec_s16 (cpu, vm, i));
5178 	  return;
5179 
5180 	case 2:
5181 	  for (i = 0; i < (full ? 4 : 2); i++)
5182 	    aarch64_set_vec_s32 (cpu, vd, i,
5183 				 aarch64_get_vec_s32 (cpu, vn, i)
5184 				 > aarch64_get_vec_s32 (cpu, vm, i)
5185 				 ? aarch64_get_vec_s32 (cpu, vn, i)
5186 				 : aarch64_get_vec_s32 (cpu, vm, i));
5187 	  return;
5188 
5189 	case 3:
5190 	  HALT_UNALLOC;
5191 	}
5192     }
5193 }
5194 
5195 static void
5196 do_vec_min (sim_cpu *cpu)
5197 {
5198   /* instr[31]    = 0
5199      instr[30]    = full/half selector
5200      instr[29]    = SMIN (0) / UMIN (1)
5201      instr[28,24] = 0 1110
5202      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
5203      instr[21]    = 1
5204      instr[20,16] = Vm
5205      instr[15,10] = 0110 11
5206      instr[9,5]   = Vn
5207      instr[4,0]   = Vd.  */
5208 
5209   unsigned vm = INSTR (20, 16);
5210   unsigned vn = INSTR (9, 5);
5211   unsigned vd = INSTR (4, 0);
5212   unsigned i;
5213   int      full = INSTR (30, 30);
5214 
5215   NYI_assert (28, 24, 0x0E);
5216   NYI_assert (21, 21, 1);
5217   NYI_assert (15, 10, 0x1B);
5218 
5219   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5220   if (INSTR (29, 29))
5221     {
5222       switch (INSTR (23, 22))
5223 	{
5224 	case 0:
5225 	  for (i = 0; i < (full ? 16 : 8); i++)
5226 	    aarch64_set_vec_u8 (cpu, vd, i,
5227 				aarch64_get_vec_u8 (cpu, vn, i)
5228 				< aarch64_get_vec_u8 (cpu, vm, i)
5229 				? aarch64_get_vec_u8 (cpu, vn, i)
5230 				: aarch64_get_vec_u8 (cpu, vm, i));
5231 	  return;
5232 
5233 	case 1:
5234 	  for (i = 0; i < (full ? 8 : 4); i++)
5235 	    aarch64_set_vec_u16 (cpu, vd, i,
5236 				 aarch64_get_vec_u16 (cpu, vn, i)
5237 				 < aarch64_get_vec_u16 (cpu, vm, i)
5238 				 ? aarch64_get_vec_u16 (cpu, vn, i)
5239 				 : aarch64_get_vec_u16 (cpu, vm, i));
5240 	  return;
5241 
5242 	case 2:
5243 	  for (i = 0; i < (full ? 4 : 2); i++)
5244 	    aarch64_set_vec_u32 (cpu, vd, i,
5245 				 aarch64_get_vec_u32 (cpu, vn, i)
5246 				 < aarch64_get_vec_u32 (cpu, vm, i)
5247 				 ? aarch64_get_vec_u32 (cpu, vn, i)
5248 				 : aarch64_get_vec_u32 (cpu, vm, i));
5249 	  return;
5250 
5251 	case 3:
5252 	  HALT_UNALLOC;
5253 	}
5254     }
5255   else
5256     {
5257       switch (INSTR (23, 22))
5258 	{
5259 	case 0:
5260 	  for (i = 0; i < (full ? 16 : 8); i++)
5261 	    aarch64_set_vec_s8 (cpu, vd, i,
5262 				aarch64_get_vec_s8 (cpu, vn, i)
5263 				< aarch64_get_vec_s8 (cpu, vm, i)
5264 				? aarch64_get_vec_s8 (cpu, vn, i)
5265 				: aarch64_get_vec_s8 (cpu, vm, i));
5266 	  return;
5267 
5268 	case 1:
5269 	  for (i = 0; i < (full ? 8 : 4); i++)
5270 	    aarch64_set_vec_s16 (cpu, vd, i,
5271 				 aarch64_get_vec_s16 (cpu, vn, i)
5272 				 < aarch64_get_vec_s16 (cpu, vm, i)
5273 				 ? aarch64_get_vec_s16 (cpu, vn, i)
5274 				 : aarch64_get_vec_s16 (cpu, vm, i));
5275 	  return;
5276 
5277 	case 2:
5278 	  for (i = 0; i < (full ? 4 : 2); i++)
5279 	    aarch64_set_vec_s32 (cpu, vd, i,
5280 				 aarch64_get_vec_s32 (cpu, vn, i)
5281 				 < aarch64_get_vec_s32 (cpu, vm, i)
5282 				 ? aarch64_get_vec_s32 (cpu, vn, i)
5283 				 : aarch64_get_vec_s32 (cpu, vm, i));
5284 	  return;
5285 
5286 	case 3:
5287 	  HALT_UNALLOC;
5288 	}
5289     }
5290 }
5291 
5292 static void
5293 do_vec_sub_long (sim_cpu *cpu)
5294 {
5295   /* instr[31]    = 0
5296      instr[30]    = lower (0) / upper (1)
5297      instr[29]    = signed (0) / unsigned (1)
5298      instr[28,24] = 0 1110
5299      instr[23,22] = size: bytes (00), half (01), word (10)
5300      instr[21]    = 1
5301      instr[20,16] = Vm
5302      instr[15,10] = 0010 00
5303      instr[9,5]   = Vn
5304      instr[4,0]   = V dest.  */
5305 
5306   unsigned size = INSTR (23, 22);
5307   unsigned vm = INSTR (20, 16);
5308   unsigned vn = INSTR (9, 5);
5309   unsigned vd = INSTR (4, 0);
5310   unsigned bias = 0;
5311   unsigned i;
5312 
5313   NYI_assert (28, 24, 0x0E);
5314   NYI_assert (21, 21, 1);
5315   NYI_assert (15, 10, 0x08);
5316 
5317   if (size == 3)
5318     HALT_UNALLOC;
5319 
5320   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5321   switch (INSTR (30, 29))
5322     {
5323     case 2: /* SSUBL2.  */
5324       bias = 2;
5325       ATTRIBUTE_FALLTHROUGH;
5326     case 0: /* SSUBL.  */
5327       switch (size)
5328 	{
5329 	case 0:
5330 	  bias *= 4;
5331 	  for (i = 0; i < 8; i++)
5332 	    aarch64_set_vec_s16 (cpu, vd, i,
5333 				 aarch64_get_vec_s8 (cpu, vn, i + bias)
5334 				 - aarch64_get_vec_s8 (cpu, vm, i + bias));
5335 	  break;
5336 
5337 	case 1:
5338 	  bias *= 2;
5339 	  for (i = 0; i < 4; i++)
5340 	    aarch64_set_vec_s32 (cpu, vd, i,
5341 				 aarch64_get_vec_s16 (cpu, vn, i + bias)
5342 				 - aarch64_get_vec_s16 (cpu, vm, i + bias));
5343 	  break;
5344 
5345 	case 2:
5346 	  for (i = 0; i < 2; i++)
5347 	    aarch64_set_vec_s64 (cpu, vd, i,
5348 				 aarch64_get_vec_s32 (cpu, vn, i + bias)
5349 				 - aarch64_get_vec_s32 (cpu, vm, i + bias));
5350 	  break;
5351 
5352 	default:
5353 	  HALT_UNALLOC;
5354 	}
5355       break;
5356 
5357     case 3: /* USUBL2.  */
5358       bias = 2;
5359       ATTRIBUTE_FALLTHROUGH;
5360     case 1: /* USUBL.  */
5361       switch (size)
5362 	{
5363 	case 0:
5364 	  bias *= 4;
5365 	  for (i = 0; i < 8; i++)
5366 	    aarch64_set_vec_u16 (cpu, vd, i,
5367 				 aarch64_get_vec_u8 (cpu, vn, i + bias)
5368 				 - aarch64_get_vec_u8 (cpu, vm, i + bias));
5369 	  break;
5370 
5371 	case 1:
5372 	  bias *= 2;
5373 	  for (i = 0; i < 4; i++)
5374 	    aarch64_set_vec_u32 (cpu, vd, i,
5375 				 aarch64_get_vec_u16 (cpu, vn, i + bias)
5376 				 - aarch64_get_vec_u16 (cpu, vm, i + bias));
5377 	  break;
5378 
5379 	case 2:
5380 	  for (i = 0; i < 2; i++)
5381 	    aarch64_set_vec_u64 (cpu, vd, i,
5382 				 aarch64_get_vec_u32 (cpu, vn, i + bias)
5383 				 - aarch64_get_vec_u32 (cpu, vm, i + bias));
5384 	  break;
5385 
5386 	default:
5387 	  HALT_UNALLOC;
5388 	}
5389       break;
5390     }
5391 }
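
/* The "2" variants above start with bias = 2 and scale it by the
   elements-per-word ratio so that it indexes the upper half of the
   source vectors: 2 * 4 = 8 for bytes, 2 * 2 = 4 for halfwords, and
   2 (unscaled) for words.  */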
5392 
5393 static void
5394 do_vec_ADDP (sim_cpu *cpu)
5395 {
5396   /* instr[31]    = 0
5397      instr[30]    = half(0)/full(1)
5398      instr[29,24] = 00 1110
5399      instr[23,22] = size: bytes (00), half (01), word (10), long (11)
5400      instr[21]    = 1
5401      instr[20,16] = Vm
5402      instr[15,10] = 1011 11
5403      instr[9,5]   = Vn
5404      instr[4,0]   = V dest.  */
5405 
5406   struct aarch64_sim_cpu *aarch64_cpu = AARCH64_SIM_CPU (cpu);
5407   FRegister copy_vn;
5408   FRegister copy_vm;
5409   unsigned full = INSTR (30, 30);
5410   unsigned size = INSTR (23, 22);
5411   unsigned vm = INSTR (20, 16);
5412   unsigned vn = INSTR (9, 5);
5413   unsigned vd = INSTR (4, 0);
5414   unsigned i, range;
5415 
5416   NYI_assert (29, 24, 0x0E);
5417   NYI_assert (21, 21, 1);
5418   NYI_assert (15, 10, 0x2F);
5419 
5420   /* Make copies of the source registers in case vd == vn/vm.  */
5421   copy_vn = aarch64_cpu->fr[vn];
5422   copy_vm = aarch64_cpu->fr[vm];
5423 
5424   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5425   switch (size)
5426     {
5427     case 0:
5428       range = full ? 8 : 4;
5429       for (i = 0; i < range; i++)
5430 	{
5431 	  aarch64_set_vec_u8 (cpu, vd, i,
5432 			      copy_vn.b[i * 2] + copy_vn.b[i * 2 + 1]);
5433 	  aarch64_set_vec_u8 (cpu, vd, i + range,
5434 			      copy_vm.b[i * 2] + copy_vm.b[i * 2 + 1]);
5435 	}
5436       return;
5437 
5438     case 1:
5439       range = full ? 4 : 2;
5440       for (i = 0; i < range; i++)
5441 	{
5442 	  aarch64_set_vec_u16 (cpu, vd, i,
5443 			       copy_vn.h[i * 2] + copy_vn.h[i * 2 + 1]);
5444 	  aarch64_set_vec_u16 (cpu, vd, i + range,
5445 			       copy_vm.h[i * 2] + copy_vm.h[i * 2 + 1]);
5446 	}
5447       return;
5448 
5449     case 2:
5450       range = full ? 2 : 1;
5451       for (i = 0; i < range; i++)
5452 	{
5453 	  aarch64_set_vec_u32 (cpu, vd, i,
5454 			       copy_vn.w[i * 2] + copy_vn.w[i * 2 + 1]);
5455 	  aarch64_set_vec_u32 (cpu, vd, i + range,
5456 			       copy_vm.w[i * 2] + copy_vm.w[i * 2 + 1]);
5457 	}
5458       return;
5459 
5460     case 3:
5461       if (! full)
5462 	HALT_UNALLOC;
5463       aarch64_set_vec_u64 (cpu, vd, 0, copy_vn.v[0] + copy_vn.v[1]);
5464       aarch64_set_vec_u64 (cpu, vd, 1, copy_vm.v[0] + copy_vm.v[1]);
5465       return;
5466     }
5467 }
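
/* ADDP conceptually concatenates Vn:Vm and adds adjacent element
   pairs, so the low half of Vd receives the pairwise sums from Vn and
   the high half those from Vm; the register copies taken above keep
   this correct when Vd aliases a source.  */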
5468 
5469 /* Floating-point vector convert to longer precision.  */
5470 static void
5471 do_vec_FCVTL (sim_cpu *cpu)
5472 {
5473   /* instr[31]    = 0
5474      instr[30]    = half (0) / all (1)
5475      instr[29,23] = 00 1110 0
5476      instr[22]    = single (0) / double (1)
5477      instr[21,10] = 10 0001 0111 10
5478      instr[9,5]   = Rn
5479      instr[4,0]   = Rd.  */
5480 
5481   unsigned rn = INSTR (9, 5);
5482   unsigned rd = INSTR (4, 0);
5483   unsigned full = INSTR (30, 30);
5484   unsigned i;
5485 
5486   NYI_assert (31, 31, 0);
5487   NYI_assert (29, 23, 0x1C);
5488   NYI_assert (21, 10, 0x85E);
5489 
5490   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5491   if (INSTR (22, 22))
5492     {
5493       for (i = 0; i < 2; i++)
5494 	aarch64_set_vec_double (cpu, rd, i,
5495 				aarch64_get_vec_float (cpu, rn, i + 2*full));
5496     }
5497   else
5498     {
5499       HALT_NYI;
5500 
5501 #if 0
5502       /* TODO: Implement missing half-float support.  */
5503       for (i = 0; i < 4; i++)
5504 	aarch64_set_vec_float (cpu, rd, i,
5505 			     aarch64_get_vec_halffloat (cpu, rn, i + 4*full));
5506 #endif
5507     }
5508 }
5509 
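/* Vector floating point absolute value.  */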
5510 static void
5511 do_vec_FABS (sim_cpu *cpu)
5512 {
5513   /* instr[31]    = 0
5514      instr[30]    = half(0)/full(1)
5515      instr[29,23] = 00 1110 1
5516      instr[22]    = float(0)/double(1)
5517      instr[21,16] = 10 0000
5518      instr[15,10] = 1111 10
5519      instr[9,5]   = Vn
5520      instr[4,0]   = Vd.  */
5521 
5522   unsigned vn = INSTR (9, 5);
5523   unsigned vd = INSTR (4, 0);
5524   unsigned full = INSTR (30, 30);
5525   unsigned i;
5526 
5527   NYI_assert (29, 23, 0x1D);
5528   NYI_assert (21, 10, 0x83E);
5529 
5530   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5531   if (INSTR (22, 22))
5532     {
5533       if (! full)
5534 	HALT_NYI;
5535 
5536       for (i = 0; i < 2; i++)
5537 	aarch64_set_vec_double (cpu, vd, i,
5538 				fabs (aarch64_get_vec_double (cpu, vn, i)));
5539     }
5540   else
5541     {
5542       for (i = 0; i < (full ? 4 : 2); i++)
5543 	aarch64_set_vec_float (cpu, vd, i,
5544 			       fabsf (aarch64_get_vec_float (cpu, vn, i)));
5545     }
5546 }
5547 
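/* Vector floating point convert to signed integer, rounding toward
   zero.  */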
5548 static void
5549 do_vec_FCVTZS (sim_cpu *cpu)
5550 {
5551   /* instr[31]    = 0
5552      instr[30]    = half (0) / all (1)
5553      instr[29,23] = 00 1110 1
5554      instr[22]    = single (0) / double (1)
5555      instr[21,10] = 10 0001 1011 10
5556      instr[9,5]   = Rn
5557      instr[4,0]   = Rd.  */
5558 
5559   unsigned rn = INSTR (9, 5);
5560   unsigned rd = INSTR (4, 0);
5561   unsigned full = INSTR (30, 30);
5562   unsigned i;
5563 
5564   NYI_assert (31, 31, 0);
5565   NYI_assert (29, 23, 0x1D);
5566   NYI_assert (21, 10, 0x86E);
5567 
5568   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5569   if (INSTR (22, 22))
5570     {
5571       if (! full)
5572 	HALT_UNALLOC;
5573 
5574       for (i = 0; i < 2; i++)
5575 	aarch64_set_vec_s64 (cpu, rd, i,
5576 			     (int64_t) aarch64_get_vec_double (cpu, rn, i));
5577     }
5578   else
5579     for (i = 0; i < (full ? 4 : 2); i++)
5580       aarch64_set_vec_s32 (cpu, rd, i,
5581 			   (int32_t) aarch64_get_vec_float (cpu, rn, i));
5582 }
5583 
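/* Reverse the order of the elements within each 64-bit doubleword.  */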
5584 static void
5585 do_vec_REV64 (sim_cpu *cpu)
5586 {
5587   /* instr[31]    = 0
5588      instr[30]    = full/half
5589      instr[29,24] = 00 1110
5590      instr[23,22] = size
5591      instr[21,10] = 10 0000 0000 10
5592      instr[9,5]   = Rn
5593      instr[4,0]   = Rd.  */
5594 
5595   unsigned rn = INSTR (9, 5);
5596   unsigned rd = INSTR (4, 0);
5597   unsigned size = INSTR (23, 22);
5598   unsigned full = INSTR (30, 30);
5599   unsigned i;
5600   FRegister val;
5601 
5602   NYI_assert (29, 24, 0x0E);
5603   NYI_assert (21, 10, 0x802);
5604 
5605   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5606   switch (size)
5607     {
5608     case 0:
5609       for (i = 0; i < (full ? 16 : 8); i++)
5610 	val.b[i ^ 0x7] = aarch64_get_vec_u8 (cpu, rn, i);
5611       break;
5612 
5613     case 1:
5614       for (i = 0; i < (full ? 8 : 4); i++)
5615 	val.h[i ^ 0x3] = aarch64_get_vec_u16 (cpu, rn, i);
5616       break;
5617 
5618     case 2:
5619       for (i = 0; i < (full ? 4 : 2); i++)
5620 	val.w[i ^ 0x1] = aarch64_get_vec_u32 (cpu, rn, i);
5621       break;
5622 
5623     case 3:
5624       HALT_UNALLOC;
5625     }
5626 
5627   aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
5628   if (full)
5629     aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
5630 }
5631 
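/* Reverse the order of the bytes within each 16-bit halfword.  */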
5632 static void
5633 do_vec_REV16 (sim_cpu *cpu)
5634 {
5635   /* instr[31]    = 0
5636      instr[30]    = full/half
5637      instr[29,24] = 00 1110
5638      instr[23,22] = size
5639      instr[21,10] = 10 0000 0001 10
5640      instr[9,5]   = Rn
5641      instr[4,0]   = Rd.  */
5642 
5643   unsigned rn = INSTR (9, 5);
5644   unsigned rd = INSTR (4, 0);
5645   unsigned size = INSTR (23, 22);
5646   unsigned full = INSTR (30, 30);
5647   unsigned i;
5648   FRegister val;
5649 
5650   NYI_assert (29, 24, 0x0E);
5651   NYI_assert (21, 10, 0x806);
5652 
5653   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5654   switch (size)
5655     {
5656     case 0:
5657       for (i = 0; i < (full ? 16 : 8); i++)
5658 	val.b[i ^ 0x1] = aarch64_get_vec_u8 (cpu, rn, i);
5659       break;
5660 
5661     default:
5662       HALT_UNALLOC;
5663     }
5664 
5665   aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
5666   if (full)
5667     aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
5668 }
5669 
5670 static void
5671 do_vec_op1 (sim_cpu *cpu)
5672 {
5673   /* instr[31]    = 0
5674      instr[30]    = half/full
5675      instr[29,24] = 00 1110
5676      instr[23,21] = ???
5677      instr[20,16] = Vm
5678      instr[15,10] = sub-opcode
5679      instr[9,5]   = Vn
5680      instr[4,0]   = Vd  */
5681   NYI_assert (29, 24, 0x0E);
5682 
5683   if (INSTR (21, 21) == 0)
5684     {
5685       if (INSTR (23, 22) == 0)
5686 	{
5687 	  if (INSTR (30, 30) == 1
5688 	      && INSTR (17, 14) == 0
5689 	      && INSTR (12, 10) == 7)
5690 	    return do_vec_ins_2 (cpu);
5691 
5692 	  switch (INSTR (15, 10))
5693 	    {
5694 	    case 0x01: do_vec_DUP_vector_into_vector (cpu); return;
5695 	    case 0x03: do_vec_DUP_scalar_into_vector (cpu); return;
5696 	    case 0x07: do_vec_INS (cpu); return;
5697 	    case 0x0B: do_vec_SMOV_into_scalar (cpu); return;
5698 	    case 0x0F: do_vec_UMOV_into_scalar (cpu); return;
5699 
5700 	    case 0x00:
5701 	    case 0x08:
5702 	    case 0x10:
5703 	    case 0x18:
5704 	      do_vec_TBL (cpu); return;
5705 
5706 	    case 0x06:
5707 	    case 0x16:
5708 	      do_vec_UZP (cpu); return;
5709 
5710 	    case 0x0A: do_vec_TRN (cpu); return;
5711 
5712 	    case 0x0E:
5713 	    case 0x1E:
5714 	      do_vec_ZIP (cpu); return;
5715 
5716 	    default:
5717 	      HALT_NYI;
5718 	    }
5719 	}
5720 
5721       switch (INSTR (13, 10))
5722 	{
5723 	case 0x6: do_vec_UZP (cpu); return;
5724 	case 0xE: do_vec_ZIP (cpu); return;
5725 	case 0xA: do_vec_TRN (cpu); return;
5726 	default:  HALT_NYI;
5727 	}
5728     }
5729 
5730   switch (INSTR (15, 10))
5731     {
5732     case 0x02: do_vec_REV64 (cpu); return;
5733     case 0x06: do_vec_REV16 (cpu); return;
5734 
5735     case 0x07:
5736       switch (INSTR (23, 21))
5737 	{
5738 	case 1: do_vec_AND (cpu); return;
5739 	case 3: do_vec_BIC (cpu); return;
5740 	case 5: do_vec_ORR (cpu); return;
5741 	case 7: do_vec_ORN (cpu); return;
5742 	default: HALT_NYI;
5743 	}
5744 
5745     case 0x08: do_vec_sub_long (cpu); return;
5746     case 0x0a: do_vec_XTN (cpu); return;
5747     case 0x11: do_vec_SSHL (cpu); return;
5748     case 0x16: do_vec_CNT (cpu); return;
5749     case 0x19: do_vec_max (cpu); return;
5750     case 0x1B: do_vec_min (cpu); return;
5751     case 0x21: do_vec_add (cpu); return;
5752     case 0x25: do_vec_MLA (cpu); return;
5753     case 0x27: do_vec_mul (cpu); return;
5754     case 0x2F: do_vec_ADDP (cpu); return;
5755     case 0x30: do_vec_mull (cpu); return;
5756     case 0x33: do_vec_FMLA (cpu); return;
5757     case 0x35: do_vec_fadd (cpu); return;
5758 
5759     case 0x1E:
5760       switch (INSTR (20, 16))
5761 	{
5762 	case 0x01: do_vec_FCVTL (cpu); return;
5763 	default: HALT_NYI;
5764 	}
5765 
5766     case 0x2E:
5767       switch (INSTR (20, 16))
5768 	{
5769 	case 0x00: do_vec_ABS (cpu); return;
5770 	case 0x01: do_vec_FCVTZS (cpu); return;
5771 	case 0x11: do_vec_ADDV (cpu); return;
5772 	default: HALT_NYI;
5773 	}
5774 
5775     case 0x31:
5776     case 0x3B:
5777       do_vec_Fminmax (cpu); return;
5778 
5779     case 0x0D:
5780     case 0x0F:
5781     case 0x22:
5782     case 0x23:
5783     case 0x26:
5784     case 0x2A:
5785     case 0x32:
5786     case 0x36:
5787     case 0x39:
5788     case 0x3A:
5789       do_vec_compare (cpu); return;
5790 
5791     case 0x3E:
5792       do_vec_FABS (cpu); return;
5793 
5794     default:
5795       HALT_NYI;
5796     }
5797 }
5798 
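/* Vector extend/shift left long (SXTL, UXTL, SSHLL, USHLL and their
   "2" variants): widen each source element to twice its width and
   shift it left by the encoded immediate.  */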
5799 static void
5800 do_vec_xtl (sim_cpu *cpu)
5801 {
5802   /* instr[31]    = 0
5803      instr[30,29] = SXTL (00), UXTL (01), SXTL2 (10), UXTL2 (11)
5804      instr[28,22] = 0 1111 00
5805      instr[21,16] = size & shift (USHLL, SSHLL, USHLL2, SSHLL2)
5806      instr[15,10] = 1010 01
5807      instr[9,5]   = V source
5808      instr[4,0]   = V dest.  */
5809 
5810   unsigned vs = INSTR (9, 5);
5811   unsigned vd = INSTR (4, 0);
5812   unsigned i, shift, bias = 0;
5813 
5814   NYI_assert (28, 22, 0x3C);
5815   NYI_assert (15, 10, 0x29);
5816 
5817   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5818   switch (INSTR (30, 29))
5819     {
5820     case 2: /* SXTL2, SSHLL2.  */
5821       bias = 2;
5822       ATTRIBUTE_FALLTHROUGH;
5823     case 0: /* SXTL, SSHLL.  */
5824       if (INSTR (21, 21))
5825 	{
5826 	  int64_t val1, val2;
5827 
5828 	  shift = INSTR (20, 16);
5829 	  /* Get the source values before setting the destination values
5830 	     in case the source and destination are the same.  */
5831 	  /* Widen to 64 bits before shifting so that shifted-in bits
	     above bit 31 are retained.  */
	  val1 = (int64_t) aarch64_get_vec_s32 (cpu, vs, bias) << shift;
5832 	  val2 = (int64_t) aarch64_get_vec_s32 (cpu, vs, bias + 1) << shift;
5833 	  aarch64_set_vec_s64 (cpu, vd, 0, val1);
5834 	  aarch64_set_vec_s64 (cpu, vd, 1, val2);
5835 	}
5836       else if (INSTR (20, 20))
5837 	{
5838 	  int32_t v[4];
5839 
5840 	  shift = INSTR (19, 16);
5841 	  bias *= 2;
5842 	  for (i = 0; i < 4; i++)
5843 	    v[i] = aarch64_get_vec_s16 (cpu, vs, bias + i) << shift;
5844 	  for (i = 0; i < 4; i++)
5845 	    aarch64_set_vec_s32 (cpu, vd, i, v[i]);
5846 	}
5847       else
5848 	{
5849 	  int16_t v[8];
5850 	  NYI_assert (19, 19, 1);
5851 
5852 	  shift = INSTR (18, 16);
5853 	  bias *= 4;
5854 	  for (i = 0; i < 8; i++)
5855 	    v[i] = aarch64_get_vec_s8 (cpu, vs, i + bias) << shift;
5856 	  for (i = 0; i < 8; i++)
5857 	    aarch64_set_vec_s16 (cpu, vd, i, v[i]);
5858 	}
5859       return;
5860 
5861     case 3: /* UXTL2, USHLL2.  */
5862       bias = 2;
5863       ATTRIBUTE_FALLTHROUGH;
5864     case 1: /* UXTL, USHLL.  */
5865       if (INSTR (21, 21))
5866 	{
5867 	  uint64_t v1, v2;
5868 	  shift = INSTR (20, 16);
5869 	  /* Widen to 64 bits before shifting, as above.  */
	  v1 = (uint64_t) aarch64_get_vec_u32 (cpu, vs, bias) << shift;
5870 	  v2 = (uint64_t) aarch64_get_vec_u32 (cpu, vs, bias + 1) << shift;
5871 	  aarch64_set_vec_u64 (cpu, vd, 0, v1);
5872 	  aarch64_set_vec_u64 (cpu, vd, 1, v2);
5873 	}
5874       else if (INSTR (20, 20))
5875 	{
5876 	  uint32_t v[4];
5877 	  shift = INSTR (19, 16);
5878 	  bias *= 2;
5879 	  for (i = 0; i < 4; i++)
5880 	    v[i] = aarch64_get_vec_u16 (cpu, vs, i + bias) << shift;
5881 	  for (i = 0; i < 4; i++)
5882 	    aarch64_set_vec_u32 (cpu, vd, i, v[i]);
5883 	}
5884       else
5885 	{
5886 	  uint16_t v[8];
5887 	  NYI_assert (19, 19, 1);
5888 
5889 	  shift = INSTR (18, 16);
5890 	  bias *= 4;
5891 	  for (i = 0; i < 8; i++)
5892 	    v[i] = aarch64_get_vec_u8 (cpu, vs, i + bias) << shift;
5893 	  for (i = 0; i < 8; i++)
5894 	    aarch64_set_vec_u16 (cpu, vd, i, v[i]);
5895 	}
5896       return;
5897     }
5898 }
5899 
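/* Vector shift left by immediate.  The highest set bit of
   instr[22,16] selects the element size; the bits below it give the
   shift amount.  */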
5900 static void
5901 do_vec_SHL (sim_cpu *cpu)
5902 {
5903   /* instr [31]    = 0
5904      instr [30]    = half(0)/full(1)
5905      instr [29,23] = 001 1110
5906      instr [22,16] = size and shift amount
5907      instr [15,10] = 01 0101
5908      instr [9, 5]  = Vs
5909      instr [4, 0]  = Vd.  */
5910 
5911   int shift;
5912   int full    = INSTR (30, 30);
5913   unsigned vs = INSTR (9, 5);
5914   unsigned vd = INSTR (4, 0);
5915   unsigned i;
5916 
5917   NYI_assert (29, 23, 0x1E);
5918   NYI_assert (15, 10, 0x15);
5919 
5920   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5921   if (INSTR (22, 22))
5922     {
5923       shift = INSTR (21, 16);
5924 
5925       if (full == 0)
5926 	HALT_UNALLOC;
5927 
5928       for (i = 0; i < 2; i++)
5929 	{
5930 	  uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
5931 	  aarch64_set_vec_u64 (cpu, vd, i, val << shift);
5932 	}
5933 
5934       return;
5935     }
5936 
5937   if (INSTR (21, 21))
5938     {
5939       shift = INSTR (20, 16);
5940 
5941       for (i = 0; i < (full ? 4 : 2); i++)
5942 	{
5943 	  uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
5944 	  aarch64_set_vec_u32 (cpu, vd, i, val << shift);
5945 	}
5946 
5947       return;
5948     }
5949 
5950   if (INSTR (20, 20))
5951     {
5952       shift = INSTR (19, 16);
5953 
5954       for (i = 0; i < (full ? 8 : 4); i++)
5955 	{
5956 	  uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
5957 	  aarch64_set_vec_u16 (cpu, vd, i, val << shift);
5958 	}
5959 
5960       return;
5961     }
5962 
5963   if (INSTR (19, 19) == 0)
5964     HALT_UNALLOC;
5965 
5966   shift = INSTR (18, 16);
5967 
5968   for (i = 0; i < (full ? 16 : 8); i++)
5969     {
5970       uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
5971       aarch64_set_vec_u8 (cpu, vd, i, val << shift);
5972     }
5973 }
5974 
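/* Vector signed/unsigned shift right by immediate.  The shift amount
   is encoded as twice the element width minus instr[22,16].  */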
5975 static void
5976 do_vec_SSHR_USHR (sim_cpu *cpu)
5977 {
5978   /* instr [31]    = 0
5979      instr [30]    = half(0)/full(1)
5980      instr [29]    = signed(0)/unsigned(1)
5981      instr [28,23] = 0 1111 0
5982      instr [22,16] = size and shift amount
5983      instr [15,10] = 0000 01
5984      instr [9, 5]  = Vs
5985      instr [4, 0]  = Vd.  */
5986 
5987   int full       = INSTR (30, 30);
5988   int sign       = ! INSTR (29, 29);
5989   unsigned shift = INSTR (22, 16);
5990   unsigned vs    = INSTR (9, 5);
5991   unsigned vd    = INSTR (4, 0);
5992   unsigned i;
5993 
5994   NYI_assert (28, 23, 0x1E);
5995   NYI_assert (15, 10, 0x01);
5996 
5997   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5998   if (INSTR (22, 22))
5999     {
6000       shift = 128 - shift;
6001 
6002       if (full == 0)
6003 	HALT_UNALLOC;
6004 
6005       if (sign)
6006 	for (i = 0; i < 2; i++)
6007 	  {
6008 	    int64_t val = aarch64_get_vec_s64 (cpu, vs, i);
6009 	    aarch64_set_vec_s64 (cpu, vd, i, val >> shift);
6010 	  }
6011       else
6012 	for (i = 0; i < 2; i++)
6013 	  {
6014 	    uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
6015 	    aarch64_set_vec_u64 (cpu, vd, i, val >> shift);
6016 	  }
6017 
6018       return;
6019     }
6020 
6021   if (INSTR (21, 21))
6022     {
6023       shift = 64 - shift;
6024 
6025       if (sign)
6026 	for (i = 0; i < (full ? 4 : 2); i++)
6027 	  {
6028 	    int32_t val = aarch64_get_vec_s32 (cpu, vs, i);
6029 	    aarch64_set_vec_s32 (cpu, vd, i, val >> shift);
6030 	  }
6031       else
6032 	for (i = 0; i < (full ? 4 : 2); i++)
6033 	  {
6034 	    uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
6035 	    aarch64_set_vec_u32 (cpu, vd, i, val >> shift);
6036 	  }
6037 
6038       return;
6039     }
6040 
6041   if (INSTR (20, 20))
6042     {
6043       shift = 32 - shift;
6044 
6045       if (sign)
6046 	for (i = 0; i < (full ? 8 : 4); i++)
6047 	  {
6048 	    int16_t val = aarch64_get_vec_s16 (cpu, vs, i);
6049 	    aarch64_set_vec_s16 (cpu, vd, i, val >> shift);
6050 	  }
6051       else
6052 	for (i = 0; i < (full ? 8 : 4); i++)
6053 	  {
6054 	    uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
6055 	    aarch64_set_vec_u16 (cpu, vd, i, val >> shift);
6056 	  }
6057 
6058       return;
6059     }
6060 
6061   if (INSTR (19, 19) == 0)
6062     HALT_UNALLOC;
6063 
6064   shift = 16 - shift;
6065 
6066   if (sign)
6067     for (i = 0; i < (full ? 16 : 8); i++)
6068       {
6069 	int8_t val = aarch64_get_vec_s8 (cpu, vs, i);
6070 	aarch64_set_vec_s8 (cpu, vd, i, val >> shift);
6071       }
6072   else
6073     for (i = 0; i < (full ? 16 : 8); i++)
6074       {
6075 	uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
6076 	aarch64_set_vec_u8 (cpu, vd, i, val >> shift);
6077       }
6078 }
6079 
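/* Vector multiply by the indexed element of Vm.  */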
6080 static void
6081 do_vec_MUL_by_element (sim_cpu *cpu)
6082 {
6083   /* instr[31]    = 0
6084      instr[30]    = half/full
6085      instr[29,24] = 00 1111
6086      instr[23,22] = size
6087      instr[21]    = L
6088      instr[20]    = M
6089      instr[19,16] = m
6090      instr[15,12] = 1000
6091      instr[11]    = H
6092      instr[10]    = 0
6093      instr[9,5]   = Vn
6094      instr[4,0]   = Vd  */
6095 
6096   unsigned full     = INSTR (30, 30);
6097   unsigned L        = INSTR (21, 21);
6098   unsigned H        = INSTR (11, 11);
6099   unsigned vn       = INSTR (9, 5);
6100   unsigned vd       = INSTR (4, 0);
6101   unsigned size     = INSTR (23, 22);
6102   unsigned index;
6103   unsigned vm;
6104   unsigned e;
6105 
6106   NYI_assert (29, 24, 0x0F);
6107   NYI_assert (15, 12, 0x8);
6108   NYI_assert (10, 10, 0);
6109 
6110   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6111   switch (size)
6112     {
6113     case 1:
6114       {
6115 	/* 16 bit products.  */
6116 	uint16_t product;
6117 	uint16_t element1;
6118 	uint16_t element2;
6119 
6120 	index = (H << 2) | (L << 1) | INSTR (20, 20);
6121 	vm = INSTR (19, 16);
6122 	element2 = aarch64_get_vec_u16 (cpu, vm, index);
6123 
6124 	for (e = 0; e < (full ? 8 : 4); e ++)
6125 	  {
6126 	    element1 = aarch64_get_vec_u16 (cpu, vn, e);
6127 	    product  = element1 * element2;
6128 	    aarch64_set_vec_u16 (cpu, vd, e, product);
6129 	  }
6130       }
6131       break;
6132 
6133     case 2:
6134       {
6135 	/* 32 bit products.  */
6136 	uint32_t product;
6137 	uint32_t element1;
6138 	uint32_t element2;
6139 
6140 	index = (H << 1) | L;
6141 	vm = INSTR (20, 16);
6142 	element2 = aarch64_get_vec_u32 (cpu, vm, index);
6143 
6144 	for (e = 0; e < (full ? 4 : 2); e ++)
6145 	  {
6146 	    element1 = aarch64_get_vec_u32 (cpu, vn, e);
6147 	    product  = element1 * element2;
6148 	    aarch64_set_vec_u32 (cpu, vd, e, product);
6149 	  }
6150       }
6151       break;
6152 
6153     default:
6154       HALT_UNALLOC;
6155     }
6156 }
6157 
6158 static void
6159 do_FMLA_by_element (sim_cpu *cpu)
6160 {
6161   /* instr[31]    = 0
6162      instr[30]    = half/full
6163      instr[29,23] = 00 1111 1
6164      instr[22]    = size
6165      instr[21]    = L
6166      instr[20,16] = m
6167      instr[15,12] = 0001
6168      instr[11]    = H
6169      instr[10]    = 0
6170      instr[9,5]   = Vn
6171      instr[4,0]   = Vd  */
6172 
6173   unsigned full     = INSTR (30, 30);
6174   unsigned size     = INSTR (22, 22);
6175   unsigned L        = INSTR (21, 21);
6176   unsigned vm       = INSTR (20, 16);
6177   unsigned H        = INSTR (11, 11);
6178   unsigned vn       = INSTR (9, 5);
6179   unsigned vd       = INSTR (4, 0);
6180   unsigned e;
6181 
6182   NYI_assert (29, 23, 0x1F);
6183   NYI_assert (15, 12, 0x1);
6184   NYI_assert (10, 10, 0);
6185 
6186   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6187   if (size)
6188     {
6189       double element1, element2;
6190 
6191       if (! full || L)
6192 	HALT_UNALLOC;
6193 
6194       element2 = aarch64_get_vec_double (cpu, vm, H);
6195 
6196       for (e = 0; e < 2; e++)
6197 	{
6198 	  element1 = aarch64_get_vec_double (cpu, vn, e);
6199 	  element1 *= element2;
6200 	  element1 += aarch64_get_vec_double (cpu, vd, e);
6201 	  aarch64_set_vec_double (cpu, vd, e, element1);
6202 	}
6203     }
6204   else
6205     {
6206       float element1;
6207       float element2 = aarch64_get_vec_float (cpu, vm, (H << 1) | L);
6208 
6209       for (e = 0; e < (full ? 4 : 2); e++)
6210 	{
6211 	  element1 = aarch64_get_vec_float (cpu, vn, e);
6212 	  element1 *= element2;
6213 	  element1 += aarch64_get_vec_float (cpu, vd, e);
6214 	  aarch64_set_vec_float (cpu, vd, e, element1);
6215 	}
6216     }
6217 }
6218 
6219 static void
6220 do_vec_op2 (sim_cpu *cpu)
6221 {
6222   /* instr[31]    = 0
6223      instr[30]    = half/full
6224      instr[29,24] = 00 1111
6225      instr[23]    = ?
6226      instr[22,16] = element size & index
6227      instr[15,10] = sub-opcode
6228      instr[9,5]   = Vn
6229      instr[4,0]   = Vd  */
6230 
6231   NYI_assert (29, 24, 0x0F);
6232 
6233   if (INSTR (23, 23) != 0)
6234     {
6235       switch (INSTR (15, 10))
6236 	{
6237 	case 0x04:
6238 	case 0x06:
6239 	  do_FMLA_by_element (cpu);
6240 	  return;
6241 
6242 	case 0x20:
6243 	case 0x22:
6244 	  do_vec_MUL_by_element (cpu);
6245 	  return;
6246 
6247 	default:
6248 	  HALT_NYI;
6249 	}
6250     }
6251   else
6252     {
6253       switch (INSTR (15, 10))
6254 	{
6255 	case 0x01: do_vec_SSHR_USHR (cpu); return;
6256 	case 0x15: do_vec_SHL (cpu); return;
6257 	case 0x20:
6258 	case 0x22: do_vec_MUL_by_element (cpu); return;
6259 	case 0x29: do_vec_xtl (cpu); return;
6260 	default:   HALT_NYI;
6261 	}
6262     }
6263 }
6264 
6265 static void
6266 do_vec_neg (sim_cpu *cpu)
6267 {
6268   /* instr[31]    = 0
6269      instr[30]    = full(1)/half(0)
6270      instr[29,24] = 10 1110
6271      instr[23,22] = size: byte(00), half (01), word (10), long (11)
6272      instr[21,10] = 1000 0010 1110
6273      instr[9,5]   = Vs
6274      instr[4,0]   = Vd  */
6275 
6276   int    full = INSTR (30, 30);
6277   unsigned vs = INSTR (9, 5);
6278   unsigned vd = INSTR (4, 0);
6279   unsigned i;
6280 
6281   NYI_assert (29, 24, 0x2E);
6282   NYI_assert (21, 10, 0x82E);
6283 
6284   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6285   switch (INSTR (23, 22))
6286     {
6287     case 0:
6288       for (i = 0; i < (full ? 16 : 8); i++)
6289 	aarch64_set_vec_s8 (cpu, vd, i, - aarch64_get_vec_s8 (cpu, vs, i));
6290       return;
6291 
6292     case 1:
6293       for (i = 0; i < (full ? 8 : 4); i++)
6294 	aarch64_set_vec_s16 (cpu, vd, i, - aarch64_get_vec_s16 (cpu, vs, i));
6295       return;
6296 
6297     case 2:
6298       for (i = 0; i < (full ? 4 : 2); i++)
6299 	aarch64_set_vec_s32 (cpu, vd, i, - aarch64_get_vec_s32 (cpu, vs, i));
6300       return;
6301 
6302     case 3:
6303       if (! full)
6304 	HALT_NYI;
6305       for (i = 0; i < 2; i++)
6306 	aarch64_set_vec_s64 (cpu, vd, i, - aarch64_get_vec_s64 (cpu, vs, i));
6307       return;
6308     }
6309 }
6310 
6311 static void
6312 do_vec_sqrt (sim_cpu *cpu)
6313 {
6314   /* instr[31]    = 0
6315      instr[30]    = full(1)/half(0)
6316      instr[29,23] = 101 1101
6317      instr[22]    = single(0)/double(1)
6318      instr[21,10] = 1000 0111 1110
6319      instr[9,5]   = Vs
6320      instr[4,0]   = Vd.  */
6321 
6322   int    full = INSTR (30, 30);
6323   unsigned vs = INSTR (9, 5);
6324   unsigned vd = INSTR (4, 0);
6325   unsigned i;
6326 
6327   NYI_assert (29, 23, 0x5D);
6328   NYI_assert (21, 10, 0x87E);
6329 
6330   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6331   if (INSTR (22, 22) == 0)
6332     for (i = 0; i < (full ? 4 : 2); i++)
6333       aarch64_set_vec_float (cpu, vd, i,
6334 			     sqrtf (aarch64_get_vec_float (cpu, vs, i)));
6335   else
6336     for (i = 0; i < 2; i++)
6337       aarch64_set_vec_double (cpu, vd, i,
6338 			      sqrt (aarch64_get_vec_double (cpu, vs, i)));
6339 }
6340 
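/* Vector multiply-subtract with a single indexed element of Vm as
   the multiplier.  */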
6341 static void
6342 do_vec_mls_indexed (sim_cpu *cpu)
6343 {
6344   /* instr[31]       = 0
6345      instr[30]       = half(0)/full(1)
6346      instr[29,24]    = 10 1111
6347      instr[23,22]    = 16-bit(01)/32-bit(10)
6348      instr[21,20],instr[11] = index (if 16-bit)
6349      instr[21],instr[11]    = index (if 32-bit)
6350      instr[20,16]    = Vm
6351      instr[15,12]    = 0100
6352      instr[11]       = part of index
6353      instr[10]       = 0
6354      instr[9,5]      = Vs
6355      instr[4,0]      = Vd.  */
6356 
6357   int    full = INSTR (30, 30);
6358   unsigned vs = INSTR (9, 5);
6359   unsigned vd = INSTR (4, 0);
6360   unsigned vm = INSTR (20, 16);
6361   unsigned i;
6362 
6363   NYI_assert (15, 12, 4);
6364   NYI_assert (10, 10, 0);
6365 
6366   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6367   switch (INSTR (23, 22))
6368     {
6369     case 1:
6370       {
6371 	unsigned elem;
6372 	uint32_t val;
6373 
6374 	if (vm > 15)
6375 	  HALT_NYI;
6376 
6377 	elem = (INSTR (21, 20) << 1) | INSTR (11, 11);
6378 	val = aarch64_get_vec_u16 (cpu, vm, elem);
6379 
6380 	for (i = 0; i < (full ? 8 : 4); i++)
6381 	  aarch64_set_vec_u16 (cpu, vd, i,
6382 			       aarch64_get_vec_u16 (cpu, vd, i) -
6383 			       (aarch64_get_vec_u16 (cpu, vs, i) * val));
6384 	return;
6385       }
6386 
6387     case 2:
6388       {
6389 	unsigned elem = (INSTR (21, 21) << 1) | INSTR (11, 11);
6390 	uint32_t val = aarch64_get_vec_u32 (cpu, vm, elem);
6391 
6392 	for (i = 0; i < (full ? 4 : 2); i++)
6393 	  aarch64_set_vec_u32 (cpu, vd, i,
6394 			       aarch64_get_vec_u32 (cpu, vd, i) -
6395 			       (aarch64_get_vec_u32 (cpu, vs, i) * val));
6396 	return;
6397       }
6398 
6399     case 0:
6400     case 3:
6401     default:
6402       HALT_NYI;
6403     }
6404 }
6405 
6406 static void
6407 do_vec_SUB (sim_cpu *cpu)
6408 {
6409   /* instr [31]    = 0
6410      instr [30]    = half(0)/full(1)
6411      instr [29,24] = 10 1110
6412      instr [23,22] = size: byte(00), half(01), word(10), long(11)
6413      instr [21]    = 1
6414      instr [20,16] = Vm
6415      instr [15,10] = 10 0001
6416      instr [9, 5]  = Vn
6417      instr [4, 0]  = Vd.  */
6418 
6419   unsigned full = INSTR (30, 30);
6420   unsigned vm = INSTR (20, 16);
6421   unsigned vn = INSTR (9, 5);
6422   unsigned vd = INSTR (4, 0);
6423   unsigned i;
6424 
6425   NYI_assert (29, 24, 0x2E);
6426   NYI_assert (21, 21, 1);
6427   NYI_assert (15, 10, 0x21);
6428 
6429   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6430   switch (INSTR (23, 22))
6431     {
6432     case 0:
6433       for (i = 0; i < (full ? 16 : 8); i++)
6434 	aarch64_set_vec_s8 (cpu, vd, i,
6435 			    aarch64_get_vec_s8 (cpu, vn, i)
6436 			    - aarch64_get_vec_s8 (cpu, vm, i));
6437       return;
6438 
6439     case 1:
6440       for (i = 0; i < (full ? 8 : 4); i++)
6441 	aarch64_set_vec_s16 (cpu, vd, i,
6442 			     aarch64_get_vec_s16 (cpu, vn, i)
6443 			     - aarch64_get_vec_s16 (cpu, vm, i));
6444       return;
6445 
6446     case 2:
6447       for (i = 0; i < (full ? 4 : 2); i++)
6448 	aarch64_set_vec_s32 (cpu, vd, i,
6449 			     aarch64_get_vec_s32 (cpu, vn, i)
6450 			     - aarch64_get_vec_s32 (cpu, vm, i));
6451       return;
6452 
6453     case 3:
6454       if (full == 0)
6455 	HALT_UNALLOC;
6456 
6457       for (i = 0; i < 2; i++)
6458 	aarch64_set_vec_s64 (cpu, vd, i,
6459 			     aarch64_get_vec_s64 (cpu, vn, i)
6460 			     - aarch64_get_vec_s64 (cpu, vm, i));
6461       return;
6462     }
6463 }
6464 
6465 static void
6466 do_vec_MLS (sim_cpu *cpu)
6467 {
6468   /* instr [31]    = 0
6469      instr [30]    = half(0)/full(1)
6470      instr [29,24] = 10 1110
6471      instr [23,22] = size: byte(00), half(01), word(10)
6472      instr [21]    = 1
6473      instr [20,16] = Vm
6474      instr [15,10] = 10 0101
6475      instr [9, 5]  = Vn
6476      instr [4, 0]  = Vd.  */
6477 
6478   unsigned full = INSTR (30, 30);
6479   unsigned vm = INSTR (20, 16);
6480   unsigned vn = INSTR (9, 5);
6481   unsigned vd = INSTR (4, 0);
6482   unsigned i;
6483 
6484   NYI_assert (29, 24, 0x2E);
6485   NYI_assert (21, 21, 1);
6486   NYI_assert (15, 10, 0x25);
6487 
6488   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6489   switch (INSTR (23, 22))
6490     {
6491     case 0:
6492       for (i = 0; i < (full ? 16 : 8); i++)
6493 	aarch64_set_vec_u8 (cpu, vd, i,
6494 			    aarch64_get_vec_u8 (cpu, vd, i)
6495 			    - (aarch64_get_vec_u8 (cpu, vn, i)
6496 			       * aarch64_get_vec_u8 (cpu, vm, i)));
6497       return;
6498 
6499     case 1:
6500       for (i = 0; i < (full ? 8 : 4); i++)
6501 	aarch64_set_vec_u16 (cpu, vd, i,
6502 			     aarch64_get_vec_u16 (cpu, vd, i)
6503 			     - (aarch64_get_vec_u16 (cpu, vn, i)
6504 				* aarch64_get_vec_u16 (cpu, vm, i)));
6505       return;
6506 
6507     case 2:
6508       for (i = 0; i < (full ? 4 : 2); i++)
6509 	aarch64_set_vec_u32 (cpu, vd, i,
6510 			     aarch64_get_vec_u32 (cpu, vd, i)
6511 			     - (aarch64_get_vec_u32 (cpu, vn, i)
6512 				* aarch64_get_vec_u32 (cpu, vm, i)));
6513       return;
6514 
6515     default:
6516       HALT_UNALLOC;
6517     }
6518 }
6519 
6520 static void
6521 do_vec_FDIV (sim_cpu *cpu)
6522 {
6523   /* instr [31]    = 0
6524      instr [30]    = half(0)/full(1)
6525      instr [29,23] = 10 1110 0
6526      instr [22]    = float(0)/double(1)
6527      instr [21]    = 1
6528      instr [20,16] = Vm
6529      instr [15,10] = 1111 11
6530      instr [9, 5]  = Vn
6531      instr [4, 0]  = Vd.  */
6532 
6533   unsigned full = INSTR (30, 30);
6534   unsigned vm = INSTR (20, 16);
6535   unsigned vn = INSTR (9, 5);
6536   unsigned vd = INSTR (4, 0);
6537   unsigned i;
6538 
6539   NYI_assert (29, 23, 0x5C);
6540   NYI_assert (21, 21, 1);
6541   NYI_assert (15, 10, 0x3F);
6542 
6543   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6544   if (INSTR (22, 22))
6545     {
6546       if (! full)
6547 	HALT_UNALLOC;
6548 
6549       for (i = 0; i < 2; i++)
6550 	aarch64_set_vec_double (cpu, vd, i,
6551 				aarch64_get_vec_double (cpu, vn, i)
6552 				/ aarch64_get_vec_double (cpu, vm, i));
6553     }
6554   else
6555     for (i = 0; i < (full ? 4 : 2); i++)
6556       aarch64_set_vec_float (cpu, vd, i,
6557 			     aarch64_get_vec_float (cpu, vn, i)
6558 			     / aarch64_get_vec_float (cpu, vm, i));
6559 }
6560 
6561 static void
6562 do_vec_FMUL (sim_cpu *cpu)
6563 {
6564   /* instr [31]    = 0
6565      instr [30]    = half(0)/full(1)
6566      instr [29,23] = 10 1110 0
6567      instr [22]    = float(0)/double(1)
6568      instr [21]    = 1
6569      instr [20,16] = Vm
6570      instr [15,10] = 1101 11
6571      instr [9, 5]  = Vn
6572      instr [4, 0]  = Vd.  */
6573 
6574   unsigned full = INSTR (30, 30);
6575   unsigned vm = INSTR (20, 16);
6576   unsigned vn = INSTR (9, 5);
6577   unsigned vd = INSTR (4, 0);
6578   unsigned i;
6579 
6580   NYI_assert (29, 23, 0x5C);
6581   NYI_assert (21, 21, 1);
6582   NYI_assert (15, 10, 0x37);
6583 
6584   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6585   if (INSTR (22, 22))
6586     {
6587       if (! full)
6588 	HALT_UNALLOC;
6589 
6590       for (i = 0; i < 2; i++)
6591 	aarch64_set_vec_double (cpu, vd, i,
6592 				aarch64_get_vec_double (cpu, vn, i)
6593 				* aarch64_get_vec_double (cpu, vm, i));
6594     }
6595   else
6596     for (i = 0; i < (full ? 4 : 2); i++)
6597       aarch64_set_vec_float (cpu, vd, i,
6598 			     aarch64_get_vec_float (cpu, vn, i)
6599 			     * aarch64_get_vec_float (cpu, vm, i));
6600 }
6601 
6602 static void
6603 do_vec_FADDP (sim_cpu *cpu)
6604 {
6605   /* instr [31]    = 0
6606      instr [30]    = half(0)/full(1)
6607      instr [29,23] = 10 1110 0
6608      instr [22]    = float(0)/double(1)
6609      instr [21]    = 1
6610      instr [20,16] = Vm
6611      instr [15,10] = 1101 01
6612      instr [9, 5]  = Vn
6613      instr [4, 0]  = Vd.  */
6614 
6615   unsigned full = INSTR (30, 30);
6616   unsigned vm = INSTR (20, 16);
6617   unsigned vn = INSTR (9, 5);
6618   unsigned vd = INSTR (4, 0);
6619 
6620   NYI_assert (29, 23, 0x5C);
6621   NYI_assert (21, 21, 1);
6622   NYI_assert (15, 10, 0x35);
6623 
6624   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6625   if (INSTR (22, 22))
6626     {
6627       /* Extract values before adding them in case vd == vn/vm.  */
6628       double tmp1 = aarch64_get_vec_double (cpu, vn, 0);
6629       double tmp2 = aarch64_get_vec_double (cpu, vn, 1);
6630       double tmp3 = aarch64_get_vec_double (cpu, vm, 0);
6631       double tmp4 = aarch64_get_vec_double (cpu, vm, 1);
6632 
6633       if (! full)
6634 	HALT_UNALLOC;
6635 
6636       aarch64_set_vec_double (cpu, vd, 0, tmp1 + tmp2);
6637       aarch64_set_vec_double (cpu, vd, 1, tmp3 + tmp4);
6638     }
6639   else
6640     {
6641       /* Extract values before adding them in case vd == vn/vm.  */
6642       float tmp1 = aarch64_get_vec_float (cpu, vn, 0);
6643       float tmp2 = aarch64_get_vec_float (cpu, vn, 1);
6644       float tmp5 = aarch64_get_vec_float (cpu, vm, 0);
6645       float tmp6 = aarch64_get_vec_float (cpu, vm, 1);
6646 
6647       if (full)
6648 	{
6649 	  float tmp3 = aarch64_get_vec_float (cpu, vn, 2);
6650 	  float tmp4 = aarch64_get_vec_float (cpu, vn, 3);
6651 	  float tmp7 = aarch64_get_vec_float (cpu, vm, 2);
6652 	  float tmp8 = aarch64_get_vec_float (cpu, vm, 3);
6653 
6654 	  aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2);
6655 	  aarch64_set_vec_float (cpu, vd, 1, tmp3 + tmp4);
6656 	  aarch64_set_vec_float (cpu, vd, 2, tmp5 + tmp6);
6657 	  aarch64_set_vec_float (cpu, vd, 3, tmp7 + tmp8);
6658 	}
6659       else
6660 	{
6661 	  aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2);
6662 	  aarch64_set_vec_float (cpu, vd, 1, tmp5 + tmp6);
6663 	}
6664     }
6665 }
6666 
6667 static void
6668 do_vec_FSQRT (sim_cpu *cpu)
6669 {
6670   /* instr[31]    = 0
6671      instr[30]    = half(0)/full(1)
6672      instr[29,23] = 10 1110 1
6673      instr[22]    = single(0)/double(1)
6674      instr[21,10] = 10 0001 1111 10
6675      instr[9,5]   = Vsrc
6676      instr[4,0]   = Vdest.  */
6677 
6678   unsigned vn = INSTR (9, 5);
6679   unsigned vd = INSTR (4, 0);
6680   unsigned full = INSTR (30, 30);
6681   int i;
6682 
6683   NYI_assert (29, 23, 0x5D);
6684   NYI_assert (21, 10, 0x87E);
6685 
6686   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6687   if (INSTR (22, 22))
6688     {
6689       if (! full)
6690 	HALT_UNALLOC;
6691 
6692       for (i = 0; i < 2; i++)
6693 	aarch64_set_vec_double (cpu, vd, i,
6694 				sqrt (aarch64_get_vec_double (cpu, vn, i)));
6695     }
6696   else
6697     {
6698       for (i = 0; i < (full ? 4 : 2); i++)
6699 	aarch64_set_vec_float (cpu, vd, i,
6700 			       sqrtf (aarch64_get_vec_float (cpu, vn, i)));
6701     }
6702 }
6703 
6704 static void
6705 do_vec_FNEG (sim_cpu *cpu)
6706 {
6707   /* instr[31]    = 0
6708      instr[30]    = half (0)/full (1)
6709      instr[29,23] = 10 1110 1
6710      instr[22]    = single (0)/double (1)
6711      instr[21,10] = 10 0000 1111 10
6712      instr[9,5]   = Vsrc
6713      instr[4,0]   = Vdest.  */
6714 
6715   unsigned vn = INSTR (9, 5);
6716   unsigned vd = INSTR (4, 0);
6717   unsigned full = INSTR (30, 30);
6718   int i;
6719 
6720   NYI_assert (29, 23, 0x5D);
6721   NYI_assert (21, 10, 0x83E);
6722 
6723   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6724   if (INSTR (22, 22))
6725     {
6726       if (! full)
6727 	HALT_UNALLOC;
6728 
6729       for (i = 0; i < 2; i++)
6730 	aarch64_set_vec_double (cpu, vd, i,
6731 				- aarch64_get_vec_double (cpu, vn, i));
6732     }
6733   else
6734     {
6735       for (i = 0; i < (full ? 4 : 2); i++)
6736 	aarch64_set_vec_float (cpu, vd, i,
6737 			       - aarch64_get_vec_float (cpu, vn, i));
6738     }
6739 }
6740 
6741 static void
6742 do_vec_NOT (sim_cpu *cpu)
6743 {
6744   /* instr[31]    = 0
6745      instr[30]    = half (0)/full (1)
6746      instr[29,10] = 10 1110 0010 0000 0101 10
6747      instr[9,5]   = Vn
6748      instr[4,0]   = Vd.  */
6749 
6750   unsigned vn = INSTR (9, 5);
6751   unsigned vd = INSTR (4, 0);
6752   unsigned i;
6753   int      full = INSTR (30, 30);
6754 
6755   NYI_assert (29, 10, 0xB8816);
6756 
6757   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6758   for (i = 0; i < (full ? 16 : 8); i++)
6759     aarch64_set_vec_u8 (cpu, vd, i, ~ aarch64_get_vec_u8 (cpu, vn, i));
6760 }
6761 
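/* Return the number of leading zero bits in VAL, treated as a
   SIZE-bit value; returns SIZE when VAL is zero.  */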
6762 static unsigned int
6763 clz (uint64_t val, unsigned size)
6764 {
6765   uint64_t mask = 1;
6766   int      count;
6767 
6768   mask <<= (size - 1);
6769   count = 0;
6770   do
6771     {
6772       if (val & mask)
6773 	break;
6774       mask >>= 1;
6775       count ++;
6776     }
6777   while (mask);
6778 
6779   return count;
6780 }
6781 
6782 static void
6783 do_vec_CLZ (sim_cpu *cpu)
6784 {
6785   /* instr[31]    = 0
6786      instr[30]    = half (0)/full (1)
6787      instr[29,24] = 10 1110
6788      instr[23,22] = size
6789      instr[21,10] = 10 0000 0100 10
6790      instr[9,5]   = Vn
6791      instr[4,0]   = Vd.  */
6792 
6793   unsigned vn = INSTR (9, 5);
6794   unsigned vd = INSTR (4, 0);
6795   unsigned i;
6796   int      full = INSTR (30,30);
6797 
6798   NYI_assert (29, 24, 0x2E);
6799   NYI_assert (21, 10, 0x812);
6800 
6801   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6802   switch (INSTR (23, 22))
6803     {
6804     case 0:
6805       for (i = 0; i < (full ? 16 : 8); i++)
6806 	aarch64_set_vec_u8 (cpu, vd, i, clz (aarch64_get_vec_u8 (cpu, vn, i), 8));
6807       break;
6808     case 1:
6809       for (i = 0; i < (full ? 8 : 4); i++)
6810 	aarch64_set_vec_u16 (cpu, vd, i, clz (aarch64_get_vec_u16 (cpu, vn, i), 16));
6811       break;
6812     case 2:
6813       for (i = 0; i < (full ? 4 : 2); i++)
6814 	aarch64_set_vec_u32 (cpu, vd, i, clz (aarch64_get_vec_u32 (cpu, vn, i), 32));
6815       break;
6816     case 3:
6817       if (! full)
6818 	HALT_UNALLOC;
6819       aarch64_set_vec_u64 (cpu, vd, 0, clz (aarch64_get_vec_u64 (cpu, vn, 0), 64));
6820       aarch64_set_vec_u64 (cpu, vd, 1, clz (aarch64_get_vec_u64 (cpu, vn, 1), 64));
6821       break;
6822     }
6823 }
6824 
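/* Move a single element from Vs to Vd.  The lowest set bit of
   instr[20,16] selects the element size; the bits above it give the
   destination index.  */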
6825 static void
6826 do_vec_MOV_element (sim_cpu *cpu)
6827 {
6828   /* instr[31,21] = 0110 1110 000
6829      instr[20,16] = size & dest index
6830      instr[15]    = 0
6831      instr[14,11] = source index
6832      instr[10]    = 1
6833      instr[9,5]   = Vs
6834      instr[4,0]   = Vd.  */
6835 
6836   unsigned vs = INSTR (9, 5);
6837   unsigned vd = INSTR (4, 0);
6838   unsigned src_index;
6839   unsigned dst_index;
6840 
6841   NYI_assert (31, 21, 0x370);
6842   NYI_assert (15, 15, 0);
6843   NYI_assert (10, 10, 1);
6844 
6845   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6846   if (INSTR (16, 16))
6847     {
6848       /* Move a byte.  */
6849       src_index = INSTR (14, 11);
6850       dst_index = INSTR (20, 17);
6851       aarch64_set_vec_u8 (cpu, vd, dst_index,
6852 			  aarch64_get_vec_u8 (cpu, vs, src_index));
6853     }
6854   else if (INSTR (17, 17))
6855     {
6856       /* Move 16-bits.  */
6857       NYI_assert (11, 11, 0);
6858       src_index = INSTR (14, 12);
6859       dst_index = INSTR (20, 18);
6860       aarch64_set_vec_u16 (cpu, vd, dst_index,
6861 			   aarch64_get_vec_u16 (cpu, vs, src_index));
6862     }
6863   else if (INSTR (18, 18))
6864     {
6865       /* Move 32-bits.  */
6866       NYI_assert (12, 11, 0);
6867       src_index = INSTR (14, 13);
6868       dst_index = INSTR (20, 19);
6869       aarch64_set_vec_u32 (cpu, vd, dst_index,
6870 			   aarch64_get_vec_u32 (cpu, vs, src_index));
6871     }
6872   else
6873     {
6874       NYI_assert (19, 19, 1);
6875       NYI_assert (13, 11, 0);
6876       src_index = INSTR (14, 14);
6877       dst_index = INSTR (20, 20);
6878       aarch64_set_vec_u64 (cpu, vd, dst_index,
6879 			   aarch64_get_vec_u64 (cpu, vs, src_index));
6880     }
6881 }
6882 
6883 static void
6884 do_vec_REV32 (sim_cpu *cpu)
6885 {
6886   /* instr[31]    = 0
6887      instr[30]    = full/half
6888      instr[29,24] = 10 1110
6889      instr[23,22] = size
6890      instr[21,10] = 10 0000 0000 10
6891      instr[9,5]   = Rn
6892      instr[4,0]   = Rd.  */
6893 
6894   unsigned rn = INSTR (9, 5);
6895   unsigned rd = INSTR (4, 0);
6896   unsigned size = INSTR (23, 22);
6897   unsigned full = INSTR (30, 30);
6898   unsigned i;
6899   FRegister val;
6900 
6901   NYI_assert (29, 24, 0x2E);
6902   NYI_assert (21, 10, 0x802);
6903 
6904   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6905   switch (size)
6906     {
6907     case 0:
6908       for (i = 0; i < (full ? 16 : 8); i++)
6909 	val.b[i ^ 0x3] = aarch64_get_vec_u8 (cpu, rn, i);
6910       break;
6911 
6912     case 1:
6913       for (i = 0; i < (full ? 8 : 4); i++)
6914 	val.h[i ^ 0x1] = aarch64_get_vec_u16 (cpu, rn, i);
6915       break;
6916 
6917     default:
6918       HALT_UNALLOC;
6919     }
6920 
6921   aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
6922   if (full)
6923     aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
6924 }
6925 
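/* Extract a vector from a pair of vectors: the result is taken from
   the byte-wise concatenation of Vm:Vn, starting at the byte index
   encoded in instr[14,11].  */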
6926 static void
6927 do_vec_EXT (sim_cpu *cpu)
6928 {
6929   /* instr[31]    = 0
6930      instr[30]    = full/half
6931      instr[29,21] = 10 1110 000
6932      instr[20,16] = Vm
6933      instr[15]    = 0
6934      instr[14,11] = source index
6935      instr[10]    = 0
6936      instr[9,5]   = Vn
6937      instr[4,0]   = Vd.  */
6938 
6939   unsigned vm = INSTR (20, 16);
6940   unsigned vn = INSTR (9, 5);
6941   unsigned vd = INSTR (4, 0);
6942   unsigned src_index = INSTR (14, 11);
6943   unsigned full = INSTR (30, 30);
6944   unsigned i;
6945   unsigned j;
6946   FRegister val;
6947 
6948   NYI_assert (31, 21, 0x370);
6949   NYI_assert (15, 15, 0);
6950   NYI_assert (10, 10, 0);
6951 
6952   if (!full && (src_index & 0x8))
6953     HALT_UNALLOC;
6954 
6955   j = 0;
6956 
6957   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6958   for (i = src_index; i < (full ? 16 : 8); i++)
6959     val.b[j ++] = aarch64_get_vec_u8 (cpu, vn, i);
6960   for (i = 0; i < src_index; i++)
6961     val.b[j ++] = aarch64_get_vec_u8 (cpu, vm, i);
6962 
6963   aarch64_set_vec_u64 (cpu, vd, 0, val.v[0]);
6964   if (full)
6965     aarch64_set_vec_u64 (cpu, vd, 1, val.v[1]);
6966 }
6967 
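/* Decode and dispatch the vector (AdvSIMD) instruction group.  */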
6968 static void
6969 dexAdvSIMD0 (sim_cpu *cpu)
6970 {
6971   /* instr [28,25] = 0 111.  */
6972   if (    INSTR (15, 10) == 0x07
6973       && (INSTR (9, 5) ==
6974 	  INSTR (20, 16)))
6975     {
6976       if (INSTR (31, 21) == 0x075
6977 	  || INSTR (31, 21) == 0x275)
6978 	{
6979 	  do_vec_MOV_whole_vector (cpu);
6980 	  return;
6981 	}
6982     }
6983 
6984   if (INSTR (29, 19) == 0x1E0)
6985     {
6986       do_vec_MOV_immediate (cpu);
6987       return;
6988     }
6989 
6990   if (INSTR (29, 19) == 0x5E0)
6991     {
6992       do_vec_MVNI (cpu);
6993       return;
6994     }
6995 
6996   if (INSTR (29, 19) == 0x1C0
6997       || INSTR (29, 19) == 0x1C1)
6998     {
6999       if (INSTR (15, 10) == 0x03)
7000 	{
7001 	  do_vec_DUP_scalar_into_vector (cpu);
7002 	  return;
7003 	}
7004     }
7005 
7006   switch (INSTR (29, 24))
7007     {
7008     case 0x0E: do_vec_op1 (cpu); return;
7009     case 0x0F: do_vec_op2 (cpu); return;
7010 
7011     case 0x2E:
7012       if (INSTR (21, 21) == 1)
7013 	{
7014 	  switch (INSTR (15, 10))
7015 	    {
7016 	    case 0x02:
7017 	      do_vec_REV32 (cpu);
7018 	      return;
7019 
7020 	    case 0x07:
7021 	      switch (INSTR (23, 22))
7022 		{
7023 		case 0: do_vec_EOR (cpu); return;
7024 		case 1: do_vec_BSL (cpu); return;
7025 		case 2:
7026 		case 3: do_vec_bit (cpu); return;
7027 		}
7028 	      break;
7029 
7030 	    case 0x08: do_vec_sub_long (cpu); return;
7031 	    case 0x11: do_vec_USHL (cpu); return;
7032 	    case 0x12: do_vec_CLZ (cpu); return;
7033 	    case 0x16: do_vec_NOT (cpu); return;
7034 	    case 0x19: do_vec_max (cpu); return;
7035 	    case 0x1B: do_vec_min (cpu); return;
7036 	    case 0x21: do_vec_SUB (cpu); return;
7037 	    case 0x25: do_vec_MLS (cpu); return;
7038 	    case 0x31: do_vec_FminmaxNMP (cpu); return;
7039 	    case 0x35: do_vec_FADDP (cpu); return;
7040 	    case 0x37: do_vec_FMUL (cpu); return;
7041 	    case 0x3F: do_vec_FDIV (cpu); return;
7042 
7043 	    case 0x3E:
7044 	      switch (INSTR (20, 16))
7045 		{
7046 		case 0x00: do_vec_FNEG (cpu); return;
7047 		case 0x01: do_vec_FSQRT (cpu); return;
7048 		default:   HALT_NYI;
7049 		}
7050 
7051 	    case 0x0D:
7052 	    case 0x0F:
7053 	    case 0x22:
7054 	    case 0x23:
7055 	    case 0x26:
7056 	    case 0x2A:
7057 	    case 0x32:
7058 	    case 0x36:
7059 	    case 0x39:
7060 	    case 0x3A:
7061 	      do_vec_compare (cpu); return;
7062 
7063 	    default:
7064 	      break;
7065 	    }
7066 	}
7067 
7068       if (INSTR (31, 21) == 0x370)
7069 	{
7070 	  if (INSTR (10, 10))
7071 	    do_vec_MOV_element (cpu);
7072 	  else
7073 	    do_vec_EXT (cpu);
7074 	  return;
7075 	}
7076 
7077       switch (INSTR (21, 10))
7078 	{
7079 	case 0x82E: do_vec_neg (cpu); return;
7080 	case 0x87E: do_vec_sqrt (cpu); return;
7081 	default:
7082 	  if (INSTR (15, 10) == 0x30)
7083 	    {
7084 	      do_vec_mull (cpu);
7085 	      return;
7086 	    }
7087 	  break;
7088 	}
7089       break;
7090 
7091     case 0x2F:
7092       switch (INSTR (15, 10))
7093 	{
7094 	case 0x01: do_vec_SSHR_USHR (cpu); return;
7095 	case 0x10:
7096 	case 0x12: do_vec_mls_indexed (cpu); return;
7097 	case 0x29: do_vec_xtl (cpu); return;
7098 	default:
7099 	  HALT_NYI;
7100 	}
7101 
7102     default:
7103       break;
7104     }
7105 
7106   HALT_NYI;
7107 }
7108 
7109 /* 3 sources.  */
7110 
7111 /* Float multiply add.  */
7112 static void
7113 fmadds (sim_cpu *cpu)
7114 {
7115   unsigned sa = INSTR (14, 10);
7116   unsigned sm = INSTR (20, 16);
7117   unsigned sn = INSTR ( 9,  5);
7118   unsigned sd = INSTR ( 4,  0);
7119 
7120   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7121   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa)
7122 			+ aarch64_get_FP_float (cpu, sn)
7123 			* aarch64_get_FP_float (cpu, sm));
7124 }
7125 
7126 /* Double multiply add.  */
7127 static void
7128 fmaddd (sim_cpu *cpu)
7129 {
7130   unsigned sa = INSTR (14, 10);
7131   unsigned sm = INSTR (20, 16);
7132   unsigned sn = INSTR ( 9,  5);
7133   unsigned sd = INSTR ( 4,  0);
7134 
7135   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7136   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa)
7137 			 + aarch64_get_FP_double (cpu, sn)
7138 			 * aarch64_get_FP_double (cpu, sm));
7139 }
7140 
7141 /* Float multiply subtract.  */
7142 static void
7143 fmsubs (sim_cpu *cpu)
7144 {
7145   unsigned sa = INSTR (14, 10);
7146   unsigned sm = INSTR (20, 16);
7147   unsigned sn = INSTR ( 9,  5);
7148   unsigned sd = INSTR ( 4,  0);
7149 
7150   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7151   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa)
7152 			- aarch64_get_FP_float (cpu, sn)
7153 			* aarch64_get_FP_float (cpu, sm));
7154 }
7155 
7156 /* Double multiply subtract.  */
7157 static void
7158 fmsubd (sim_cpu *cpu)
7159 {
7160   unsigned sa = INSTR (14, 10);
7161   unsigned sm = INSTR (20, 16);
7162   unsigned sn = INSTR ( 9,  5);
7163   unsigned sd = INSTR ( 4,  0);
7164 
7165   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7166   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa)
7167 			 - aarch64_get_FP_double (cpu, sn)
7168 			 * aarch64_get_FP_double (cpu, sm));
7169 }
7170 
7171 /* Float negative multiply add.  */
7172 static void
7173 fnmadds (sim_cpu *cpu)
7174 {
7175   unsigned sa = INSTR (14, 10);
7176   unsigned sm = INSTR (20, 16);
7177   unsigned sn = INSTR ( 9,  5);
7178   unsigned sd = INSTR ( 4,  0);
7179 
7180   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7181   aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa)
7182 			+ (- aarch64_get_FP_float (cpu, sn))
7183 			* aarch64_get_FP_float (cpu, sm));
7184 }
7185 
7186 /* Double negative multiply add.  */
7187 static void
7188 fnmaddd (sim_cpu *cpu)
7189 {
7190   unsigned sa = INSTR (14, 10);
7191   unsigned sm = INSTR (20, 16);
7192   unsigned sn = INSTR ( 9,  5);
7193   unsigned sd = INSTR ( 4,  0);
7194 
7195   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7196   aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa)
7197 			 + (- aarch64_get_FP_double (cpu, sn))
7198 			 * aarch64_get_FP_double (cpu, sm));
7199 }
7200 
7201 /* Float negative multiply subtract.  */
7202 static void
7203 fnmsubs (sim_cpu *cpu)
7204 {
7205   unsigned sa = INSTR (14, 10);
7206   unsigned sm = INSTR (20, 16);
7207   unsigned sn = INSTR ( 9,  5);
7208   unsigned sd = INSTR ( 4,  0);
7209 
7210   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7211   aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa)
7212 			+ aarch64_get_FP_float (cpu, sn)
7213 			* aarch64_get_FP_float (cpu, sm));
7214 }
7215 
7216 /* Double negative multiply subtract.  */
7217 static void
7218 fnmsubd (sim_cpu *cpu)
7219 {
7220   unsigned sa = INSTR (14, 10);
7221   unsigned sm = INSTR (20, 16);
7222   unsigned sn = INSTR ( 9,  5);
7223   unsigned sd = INSTR ( 4,  0);
7224 
7225   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7226   aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa)
7227 			 + aarch64_get_FP_double (cpu, sn)
7228 			 * aarch64_get_FP_double (cpu, sm));
7229 }
7230 
7231 static void
7232 dexSimpleFPDataProc3Source (sim_cpu *cpu)
7233 {
7234   /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
7235      instr[30]    = 0
7236      instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
7237      instr[28,25] = 1111
7238      instr[24]    = 1
7239      instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
7240      instr[21]    ==> o1 : 0 ==> unnegated, 1 ==> negated
7241      instr[15]    ==> o2 : 0 ==> ADD, 1 ==> SUB  */
7242 
7243   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7244   /* dispatch on combined type:o1:o2.  */
7245   uint32_t dispatch = (INSTR (23, 21) << 1) | INSTR (15, 15);
7246 
7247   if (M_S != 0)
7248     HALT_UNALLOC;
7249 
7250   switch (dispatch)
7251     {
7252     case 0: fmadds (cpu); return;
7253     case 1: fmsubs (cpu); return;
7254     case 2: fnmadds (cpu); return;
7255     case 3: fnmsubs (cpu); return;
7256     case 4: fmaddd (cpu); return;
7257     case 5: fmsubd (cpu); return;
7258     case 6: fnmaddd (cpu); return;
7259     case 7: fnmsubd (cpu); return;
7260     default:
7261       /* type > 1 is currently unallocated.  */
7262       HALT_UNALLOC;
7263     }
7264 }
7265 
7266 static void
7267 dexSimpleFPFixedConvert (sim_cpu *cpu)
7268 {
7269   HALT_NYI;
7270 }
7271 
7272 static void
7273 dexSimpleFPCondCompare (sim_cpu *cpu)
7274 {
7275   /* instr [31,23] = 0001 1110 0
7276      instr [22]    = type
7277      instr [21]    = 1
7278      instr [20,16] = Rm
7279      instr [15,12] = condition
7280      instr [11,10] = 01
7281      instr [9,5]   = Rn
7282      instr [4]     = 0
7283      instr [3,0]   = nzcv  */
7284 
7285   unsigned rm = INSTR (20, 16);
7286   unsigned rn = INSTR (9, 5);
7287 
7288   NYI_assert (31, 23, 0x3C);
7289   NYI_assert (11, 10, 0x1);
7290   NYI_assert (4,  4,  0);
7291 
7292   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7293   if (! testConditionCode (cpu, INSTR (15, 12)))
7294     {
7295       aarch64_set_CPSR (cpu, INSTR (3, 0));
7296       return;
7297     }
7298 
7299   if (INSTR (22, 22))
7300     {
7301       /* Double precision.  */
7302       double val1 = aarch64_get_vec_double (cpu, rn, 0);
7303       double val2 = aarch64_get_vec_double (cpu, rm, 0);
7304 
7305       /* FIXME: Check for NaNs.  */
7306       if (val1 == val2)
7307 	aarch64_set_CPSR (cpu, (Z | C));
7308       else if (val1 < val2)
7309 	aarch64_set_CPSR (cpu, N);
7310       else /* val1 > val2 */
7311 	aarch64_set_CPSR (cpu, C);
7312     }
7313   else
7314     {
7315       /* Single precision.  */
7316       float val1 = aarch64_get_vec_float (cpu, rn, 0);
7317       float val2 = aarch64_get_vec_float (cpu, rm, 0);
7318 
7319       /* FIXME: Check for NaNs.  */
7320       if (val1 == val2)
7321 	aarch64_set_CPSR (cpu, (Z | C));
7322       else if (val1 < val2)
7323 	aarch64_set_CPSR (cpu, N);
7324       else /* val1 > val2 */
7325 	aarch64_set_CPSR (cpu, C);
7326     }
7327 }
7328 
7329 /* 2 sources.  */
7330 
7331 /* Float add.  */
7332 static void
7333 fadds (sim_cpu *cpu)
7334 {
7335   unsigned sm = INSTR (20, 16);
7336   unsigned sn = INSTR ( 9,  5);
7337   unsigned sd = INSTR ( 4,  0);
7338 
7339   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7340   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7341 			+ aarch64_get_FP_float (cpu, sm));
7342 }
7343 
7344 /* Double add.  */
7345 static void
7346 faddd (sim_cpu *cpu)
7347 {
7348   unsigned sm = INSTR (20, 16);
7349   unsigned sn = INSTR ( 9,  5);
7350   unsigned sd = INSTR ( 4,  0);
7351 
7352   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7353   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7354 			 + aarch64_get_FP_double (cpu, sm));
7355 }
7356 
7357 /* Float divide.  */
7358 static void
7359 fdivs (sim_cpu *cpu)
7360 {
7361   unsigned sm = INSTR (20, 16);
7362   unsigned sn = INSTR ( 9,  5);
7363   unsigned sd = INSTR ( 4,  0);
7364 
7365   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7366   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7367 			/ aarch64_get_FP_float (cpu, sm));
7368 }
7369 
7370 /* Double divide.  */
7371 static void
7372 fdivd (sim_cpu *cpu)
7373 {
7374   unsigned sm = INSTR (20, 16);
7375   unsigned sn = INSTR ( 9,  5);
7376   unsigned sd = INSTR ( 4,  0);
7377 
7378   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7379   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7380 			 / aarch64_get_FP_double (cpu, sm));
7381 }
7382 
7383 /* Float multiply.  */
7384 static void
7385 fmuls (sim_cpu *cpu)
7386 {
7387   unsigned sm = INSTR (20, 16);
7388   unsigned sn = INSTR ( 9,  5);
7389   unsigned sd = INSTR ( 4,  0);
7390 
7391   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7392   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7393 			* aarch64_get_FP_float (cpu, sm));
7394 }
7395 
7396 /* Double multiply.  */
7397 static void
7398 fmuld (sim_cpu *cpu)
7399 {
7400   unsigned sm = INSTR (20, 16);
7401   unsigned sn = INSTR ( 9,  5);
7402   unsigned sd = INSTR ( 4,  0);
7403 
7404   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7405   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7406 			 * aarch64_get_FP_double (cpu, sm));
7407 }
7408 
7409 /* Float negate and multiply.  */
7410 static void
7411 fnmuls (sim_cpu *cpu)
7412 {
7413   unsigned sm = INSTR (20, 16);
7414   unsigned sn = INSTR ( 9,  5);
7415   unsigned sd = INSTR ( 4,  0);
7416 
7417   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7418   aarch64_set_FP_float (cpu, sd, - (aarch64_get_FP_float (cpu, sn)
7419 				    * aarch64_get_FP_float (cpu, sm)));
7420 }
7421 
7422 /* Double negate and multiply.  */
7423 static void
7424 fnmuld (sim_cpu *cpu)
7425 {
7426   unsigned sm = INSTR (20, 16);
7427   unsigned sn = INSTR ( 9,  5);
7428   unsigned sd = INSTR ( 4,  0);
7429 
7430   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7431   aarch64_set_FP_double (cpu, sd, - (aarch64_get_FP_double (cpu, sn)
7432 				     * aarch64_get_FP_double (cpu, sm)));
7433 }
7434 
7435 /* Float subtract.  */
7436 static void
7437 fsubs (sim_cpu *cpu)
7438 {
7439   unsigned sm = INSTR (20, 16);
7440   unsigned sn = INSTR ( 9,  5);
7441   unsigned sd = INSTR ( 4,  0);
7442 
7443   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7444   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7445 			- aarch64_get_FP_float (cpu, sm));
7446 }
7447 
7448 /* Double subtract.  */
7449 static void
7450 fsubd (sim_cpu *cpu)
7451 {
7452   unsigned sm = INSTR (20, 16);
7453   unsigned sn = INSTR ( 9,  5);
7454   unsigned sd = INSTR ( 4,  0);
7455 
7456   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7457   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7458 			 - aarch64_get_FP_double (cpu, sm));
7459 }
7460 
7461 static void
7462 do_FMINNM (sim_cpu *cpu)
7463 {
7464   /* instr[31,23] = 0 0011 1100
7465      instr[22]    = float(0)/double(1)
7466      instr[21]    = 1
7467      instr[20,16] = Sm
7468      instr[15,10] = 01 1110
7469      instr[9,5]   = Sn
7470      instr[4,0]   = Sd  */
7471 
7472   unsigned sm = INSTR (20, 16);
7473   unsigned sn = INSTR ( 9,  5);
7474   unsigned sd = INSTR ( 4,  0);
7475 
7476   NYI_assert (31, 23, 0x03C);
7477   NYI_assert (15, 10, 0x1E);
7478 
7479   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7480   if (INSTR (22, 22))
7481     aarch64_set_FP_double (cpu, sd,
7482 			   dminnm (aarch64_get_FP_double (cpu, sn),
7483 				   aarch64_get_FP_double (cpu, sm)));
7484   else
7485     aarch64_set_FP_float (cpu, sd,
7486 			  fminnm (aarch64_get_FP_float (cpu, sn),
7487 				  aarch64_get_FP_float (cpu, sm)));
7488 }
7489 
7490 static void
7491 do_FMAXNM (sim_cpu *cpu)
7492 {
7493   /* instr[31,23] = 0 0011 1100
7494      instr[22]    = float(0)/double(1)
7495      instr[21]    = 1
7496      instr[20,16] = Sm
7497      instr[15,10] = 01 1010
7498      instr[9,5]   = Sn
7499      instr[4,0]   = Sd  */
7500 
7501   unsigned sm = INSTR (20, 16);
7502   unsigned sn = INSTR ( 9,  5);
7503   unsigned sd = INSTR ( 4,  0);
7504 
7505   NYI_assert (31, 23, 0x03C);
7506   NYI_assert (15, 10, 0x1A);
7507 
7508   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7509   if (INSTR (22, 22))
7510     aarch64_set_FP_double (cpu, sd,
7511 			   dmaxnm (aarch64_get_FP_double (cpu, sn),
7512 				   aarch64_get_FP_double (cpu, sm)));
7513   else
7514     aarch64_set_FP_float (cpu, sd,
7515 			  fmaxnm (aarch64_get_FP_float (cpu, sn),
7516 				  aarch64_get_FP_float (cpu, sm)));
7517 }
7518 
7519 static void
7520 dexSimpleFPDataProc2Source (sim_cpu *cpu)
7521 {
7522   /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
7523      instr[30]    = 0
7524      instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
7525      instr[28,25] = 1111
7526      instr[24]    = 0
7527      instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
7528      instr[21]    = 1
7529      instr[20,16] = Vm
7530      instr[15,12] ==> opcode : 0000 ==> FMUL, 0001 ==> FDIV
7531                                0010 ==> FADD, 0011 ==> FSUB,
7532                                0100 ==> FMAX, 0101 ==> FMIN
7533                                0110 ==> FMAXNM, 0111 ==> FMINNM
7534                                1000 ==> FNMUL, ow ==> UNALLOC
7535      instr[11,10] = 10
7536      instr[9,5]   = Vn
7537      instr[4,0]   = Vd  */
7538 
7539   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7540   uint32_t type = INSTR (23, 22);
7541   /* Dispatch on opcode.  */
7542   uint32_t dispatch = INSTR (15, 12);
7543 
7544   if (type > 1)
7545     HALT_UNALLOC;
7546 
7547   if (M_S != 0)
7548     HALT_UNALLOC;
7549 
7550   if (type)
7551     switch (dispatch)
7552       {
7553       case 0: fmuld (cpu); return;
7554       case 1: fdivd (cpu); return;
7555       case 2: faddd (cpu); return;
7556       case 3: fsubd (cpu); return;
7557       case 6: do_FMAXNM (cpu); return;
7558       case 7: do_FMINNM (cpu); return;
7559       case 8: fnmuld (cpu); return;
7560 
7561 	/* Have not yet implemented fmax and fmin.  */
7562       case 4:
7563       case 5:
7564 	HALT_NYI;
7565 
7566       default:
7567 	HALT_UNALLOC;
7568       }
7569   else /* type == 0 => floats.  */
7570     switch (dispatch)
7571       {
7572       case 0: fmuls (cpu); return;
7573       case 1: fdivs (cpu); return;
7574       case 2: fadds (cpu); return;
7575       case 3: fsubs (cpu); return;
7576       case 6: do_FMAXNM (cpu); return;
7577       case 7: do_FMINNM (cpu); return;
7578       case 8: fnmuls (cpu); return;
7579 
7580       case 4:
7581       case 5:
7582 	HALT_NYI;
7583 
7584       default:
7585 	HALT_UNALLOC;
7586       }
7587 }
7588 
7589 static void
7590 dexSimpleFPCondSelect (sim_cpu *cpu)
7591 {
7592   /* FCSEL
7593      instr[31,23] = 0 0011 1100
7594      instr[22]    = 0=>single 1=>double
7595      instr[21]    = 1
7596      instr[20,16] = Sm
7597      instr[15,12] = cond
7598      instr[11,10] = 11
7599      instr[9,5]   = Sn
7600      instr[4,0]   = Sd  */
7601   unsigned sm = INSTR (20, 16);
7602   unsigned sn = INSTR ( 9, 5);
7603   unsigned sd = INSTR ( 4, 0);
7604   uint32_t set = testConditionCode (cpu, INSTR (15, 12));
7605 
7606   NYI_assert (31, 23, 0x03C);
7607   NYI_assert (11, 10, 0x3);
7608 
7609   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7610   if (INSTR (22, 22))
7611     aarch64_set_FP_double (cpu, sd, (set ? aarch64_get_FP_double (cpu, sn)
7612 				     : aarch64_get_FP_double (cpu, sm)));
7613   else
7614     aarch64_set_FP_float (cpu, sd, (set ? aarch64_get_FP_float (cpu, sn)
7615 				    : aarch64_get_FP_float (cpu, sm)));
7616 }
7617 
7618 /* Store 32 bit unscaled signed 9 bit.  */
7619 static void
7620 fsturs (sim_cpu *cpu, int32_t offset)
7621 {
7622   unsigned int rn = INSTR (9, 5);
7623   unsigned int st = INSTR (4, 0);
7624 
7625   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7626   aarch64_set_mem_u32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
7627 		       aarch64_get_vec_u32 (cpu, st, 0));
7628 }
7629 
7630 /* Store 64 bit unscaled signed 9 bit.  */
7631 static void
7632 fsturd (sim_cpu *cpu, int32_t offset)
7633 {
7634   unsigned int rn = INSTR (9, 5);
7635   unsigned int st = INSTR (4, 0);
7636 
7637   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7638   aarch64_set_mem_u64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
7639 		       aarch64_get_vec_u64 (cpu, st, 0));
7640 }
7641 
7642 /* Store 128 bit unscaled signed 9 bit.  */
7643 static void
7644 fsturq (sim_cpu *cpu, int32_t offset)
7645 {
7646   unsigned int rn = INSTR (9, 5);
7647   unsigned int st = INSTR (4, 0);
7648   FRegister a;
7649 
7650   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7651   aarch64_get_FP_long_double (cpu, st, & a);
7652   aarch64_set_mem_long_double (cpu,
7653 			       aarch64_get_reg_u64 (cpu, rn, SP_OK)
7654 			       + offset, a);
7655 }
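/* The int32_t offset passed to these unscaled store and load helpers is
   assumed to arrive already sign-extended from the instruction's 9-bit
   field, so STUR S0, [X1, #-4], for example, reaches fsturs with
   offset == -4 and stores at base - 4.  */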
7656 
7657 /* TODO FP move register.  */
7658 
7659 /* 32 bit fp to fp move register.  */
7660 static void
7661 ffmovs (sim_cpu *cpu)
7662 {
7663   unsigned int rn = INSTR (9, 5);
7664   unsigned int st = INSTR (4, 0);
7665 
7666   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7667   aarch64_set_FP_float (cpu, st, aarch64_get_FP_float (cpu, rn));
7668 }
7669 
7670 /* 64 bit fp to fp move register.  */
7671 static void
7672 ffmovd (sim_cpu *cpu)
7673 {
7674   unsigned int rn = INSTR (9, 5);
7675   unsigned int st = INSTR (4, 0);
7676 
7677   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7678   aarch64_set_FP_double (cpu, st, aarch64_get_FP_double (cpu, rn));
7679 }
7680 
7681 /* 32 bit GReg to Vec move register.  */
7682 static void
7683 fgmovs (sim_cpu *cpu)
7684 {
7685   unsigned int rn = INSTR (9, 5);
7686   unsigned int st = INSTR (4, 0);
7687 
7688   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7689   aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_reg_u32 (cpu, rn, NO_SP));
7690 }
7691 
7692 /* 64 bit g to fp move register.  */
7693 static void
7694 fgmovd (sim_cpu *cpu)
7695 {
7696   unsigned int rn = INSTR (9, 5);
7697   unsigned int st = INSTR (4, 0);
7698 
7699   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7700   aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_reg_u64 (cpu, rn, NO_SP));
7701 }
7702 
7703 /* 32 bit fp to g move register.  */
7704 static void
7705 gfmovs (sim_cpu *cpu)
7706 {
7707   unsigned int rn = INSTR (9, 5);
7708   unsigned int st = INSTR (4, 0);
7709 
7710   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7711   aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u32 (cpu, rn, 0));
7712 }
7713 
7714 /* 64 bit fp to g move register.  */
7715 static void
7716 gfmovd (sim_cpu *cpu)
7717 {
7718   unsigned int rn = INSTR (9, 5);
7719   unsigned int st = INSTR (4, 0);
7720 
7721   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7722   aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u64 (cpu, rn, 0));
7723 }
7724 
7725 /* FP move immediate
7726 
7727    These install an immediate 8 bit value in the target register
7728    where the 8 bits comprise 1 sign bit, 4 bits of fraction and a 3
7729    bit exponent.  */
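/* A worked example: imm8 = 0x70 = 0b0111 0000 has sign 0, exponent
   field 0b111 and fraction 0b0000, which the usual VFPExpandImm rules
   expand to +1.0, while imm8 = 0x00 encodes +2.0;
   fp_immediate_for_encoding_32/64 are assumed to implement that
   expansion.  */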
7730 
7731 static void
7732 fmovs (sim_cpu *cpu)
7733 {
7734   unsigned int sd = INSTR (4, 0);
7735   uint32_t imm = INSTR (20, 13);
7736   float f = fp_immediate_for_encoding_32 (imm);
7737 
7738   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7739   aarch64_set_FP_float (cpu, sd, f);
7740 }
7741 
7742 static void
7743 fmovd (sim_cpu *cpu)
7744 {
7745   unsigned int sd = INSTR (4, 0);
7746   uint32_t imm = INSTR (20, 13);
7747   double d = fp_immediate_for_encoding_64 (imm);
7748 
7749   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7750   aarch64_set_FP_double (cpu, sd, d);
7751 }
7752 
7753 static void
7754 dexSimpleFPImmediate (sim_cpu *cpu)
7755 {
7756   /* instr[31,23] == 00111100
7757      instr[22]    == type : single(0)/double(1)
7758      instr[21]    == 1
7759      instr[20,13] == imm8
7760      instr[12,10] == 100
7761      instr[9,5]   == imm5 : 00000 ==> OK, ow ==> UNALLOC
7762      instr[4,0]   == Rd  */
7763   uint32_t imm5 = INSTR (9, 5);
7764 
7765   NYI_assert (31, 23, 0x3C);
7766 
7767   if (imm5 != 0)
7768     HALT_UNALLOC;
7769 
7770   if (INSTR (22, 22))
7771     fmovd (cpu);
7772   else
7773     fmovs (cpu);
7774 }
7775 
7776 /* TODO specific decode and execute for group Load Store.  */
7777 
7778 /* TODO FP load/store single register (unscaled offset).  */
7779 
7780 /* TODO load 8 bit unscaled signed 9 bit.  */
7781 /* TODO load 16 bit unscaled signed 9 bit.  */
7782 
7783 /* Load 32 bit unscaled signed 9 bit.  */
7784 static void
7785 fldurs (sim_cpu *cpu, int32_t offset)
7786 {
7787   unsigned int rn = INSTR (9, 5);
7788   unsigned int st = INSTR (4, 0);
7789 
7790   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7791   aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
7792 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
7793 }
7794 
7795 /* Load 64 bit unscaled signed 9 bit.  */
7796 static void
7797 fldurd (sim_cpu *cpu, int32_t offset)
7798 {
7799   unsigned int rn = INSTR (9, 5);
7800   unsigned int st = INSTR (4, 0);
7801 
7802   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7803   aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64
7804 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
7805 }
7806 
7807 /* Load 128 bit unscaled signed 9 bit.  */
7808 static void
7809 fldurq (sim_cpu *cpu, int32_t offset)
7810 {
7811   unsigned int rn = INSTR (9, 5);
7812   unsigned int st = INSTR (4, 0);
7813   FRegister a;
7814   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
7815 
7816   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7817   aarch64_get_mem_long_double (cpu, addr, & a);
7818   aarch64_set_FP_long_double (cpu, st, a);
7819 }
7820 
7821 /* TODO store 8 bit unscaled signed 9 bit.  */
7822 /* TODO store 16 bit unscaled signed 9 bit.  */
7823 
7824 
7825 /* 1 source.  */
7826 
7827 /* Float absolute value.  */
7828 static void
7829 fabss (sim_cpu *cpu)
7830 {
7831   unsigned sn = INSTR (9, 5);
7832   unsigned sd = INSTR (4, 0);
7833   float value = aarch64_get_FP_float (cpu, sn);
7834 
7835   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7836   aarch64_set_FP_float (cpu, sd, fabsf (value));
7837 }
7838 
7839 /* Double absolute value.  */
7840 static void
7841 fabsd (sim_cpu *cpu)
7842 {
7843   unsigned sn = INSTR (9, 5);
7844   unsigned sd = INSTR (4, 0);
7845   double value = aarch64_get_FP_double (cpu, sn);
7846 
7847   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7848   aarch64_set_FP_double (cpu, sd, fabs (value));
7849 }
7850 
7851 /* Float negative value.  */
7852 static void
7853 fnegs (sim_cpu *cpu)
7854 {
7855   unsigned sn = INSTR (9, 5);
7856   unsigned sd = INSTR (4, 0);
7857 
7858   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7859   aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sn));
7860 }
7861 
7862 /* Double negative value.  */
7863 static void
7864 fnegd (sim_cpu *cpu)
7865 {
7866   unsigned sn = INSTR (9, 5);
7867   unsigned sd = INSTR (4, 0);
7868 
7869   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7870   aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sn));
7871 }
7872 
7873 /* Float square root.  */
7874 static void
7875 fsqrts (sim_cpu *cpu)
7876 {
7877   unsigned sn = INSTR (9, 5);
7878   unsigned sd = INSTR (4, 0);
7879 
7880   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7881   aarch64_set_FP_float (cpu, sd, sqrtf (aarch64_get_FP_float (cpu, sn)));
7882 }
7883 
7884 /* Double square root.  */
7885 static void
7886 fsqrtd (sim_cpu *cpu)
7887 {
7888   unsigned sn = INSTR (9, 5);
7889   unsigned sd = INSTR (4, 0);
7890 
7891   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7892   aarch64_set_FP_double (cpu, sd,
7893 			 sqrt (aarch64_get_FP_double (cpu, sn)));
7894 }
7895 
7896 /* Convert double to float.  */
7897 static void
7898 fcvtds (sim_cpu *cpu)
7899 {
7900   unsigned sn = INSTR (9, 5);
7901   unsigned sd = INSTR (4, 0);
7902 
7903   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7904   aarch64_set_FP_float (cpu, sd, (float) aarch64_get_FP_double (cpu, sn));
7905 }
7906 
7907 /* Convert float to double.  */
7908 static void
7909 fcvtsd (sim_cpu *cpu)
7910 {
7911   unsigned sn = INSTR (9, 5);
7912   unsigned sd = INSTR (4, 0);
7913 
7914   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7915   aarch64_set_FP_double (cpu, sd, (double) aarch64_get_FP_float (cpu, sn));
7916 }
7917 
7918 static void
7919 do_FRINT (sim_cpu *cpu)
7920 {
7921   /* instr[31,23] = 0001 1110 0
7922      instr[22]    = single(0)/double(1)
7923      instr[21,18] = 1001
7924      instr[17,15] = rounding mode
7925      instr[14,10] = 10000
7926      instr[9,5]   = source
7927      instr[4,0]   = dest  */
7928 
7929   float val;
7930   unsigned rs = INSTR (9, 5);
7931   unsigned rd = INSTR (4, 0);
7932   unsigned int rmode = INSTR (17, 15);
7933 
7934   NYI_assert (31, 23, 0x03C);
7935   NYI_assert (21, 18, 0x9);
7936   NYI_assert (14, 10, 0x10);
7937 
7938   if (rmode == 6 || rmode == 7)
7939     /* FIXME: Add support for rmode == 6 exactness check.  */
7940     rmode = uimm (aarch64_get_FPSR (cpu), 23, 22);
7941 
7942   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7943   if (INSTR (22, 22))
7944     {
7945       double dval = aarch64_get_FP_double (cpu, rs);
7946 
7947       switch (rmode)
7948 	{
7949 	case 0: /* mode N: nearest or even.  */
7950 	  {
7951 	    double rval = round (dval);
7952 
7953 	    /* round () breaks ties away from zero, but FRINTN wants
7954 	       ties to even, so pull an odd result back towards zero.  */
7955 	    if (fabs (dval - rval) == 0.5 && fmod (rval, 2.0) != 0.0)
7956 	      rval -= copysign (1.0, dval);
7957 
7958 	    aarch64_set_FP_double (cpu, rd, rval);
7960 	    return;
7961 	  }
7962 
7963 	case 1: /* mode P: towards +inf.  */
7964 	  aarch64_set_FP_double (cpu, rd, ceil (dval));
7965 	  return;
7966 
7967 	case 2: /* mode M: towards -inf.  */
7968 	  aarch64_set_FP_double (cpu, rd, floor (dval));
7969 	  return;
7976 
7977 	case 3: /* mode Z: towards 0.  */
7978 	  aarch64_set_FP_double (cpu, rd, trunc (dval));
7979 	  return;
7980 
7981 	case 4: /* mode A: away from 0.  */
7982 	  aarch64_set_FP_double (cpu, rd, round (dval));
7983 	  return;
7984 
7985 	case 6: /* mode X: use FPCR with exactness check.  */
7986 	case 7: /* mode I: use FPCR mode.  */
7987 	  HALT_NYI;
7988 
7989 	default:
7990 	  HALT_UNALLOC;
7991 	}
7992     }
7993 
7994   val = aarch64_get_FP_float (cpu, rs);
7995 
7996   switch (rmode)
7997     {
7998     case 0: /* mode N: nearest or even.  */
7999       {
8000 	float rval = roundf (val);
8001 
8002 	/* As above: roundf () breaks ties away from zero, FRINTN wants
8003 	   ties to even.  */
8004 	if (fabsf (val - rval) == 0.5f && fmodf (rval, 2.0f) != 0.0f)
8005 	  rval -= copysignf (1.0f, val);
8007 
8008 	aarch64_set_FP_float (cpu, rd, rval);
8009 	return;
8010       }
8011 
8012     case 1: /* mode P: towards +inf.  */
8013       aarch64_set_FP_float (cpu, rd, ceilf (val));
8014       return;
8015 
8016     case 2: /* mode M: towards -inf.  */
8017       aarch64_set_FP_float (cpu, rd, floorf (val));
8018       return;
8025 
8026     case 3: /* mode Z: towards 0.  */
8027       aarch64_set_FP_float (cpu, rd, truncf (val));
8028       return;
8029 
8030     case 4: /* mode A: away from 0.  */
8031       aarch64_set_FP_float (cpu, rd, roundf (val));
8032       return;
8033 
8034     case 6: /* mode X: use FPCR with exactness check.  */
8035     case 7: /* mode I: use FPCR mode.  */
8036       HALT_NYI;
8037 
8038     default:
8039       HALT_UNALLOC;
8040     }
8041 }
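/* A worked example for an input of 2.5: mode N (ties to even) gives
   2.0, mode P gives 3.0, mode M gives 2.0, mode Z gives 2.0 and mode A
   (ties away from zero) gives 3.0.  */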
8042 
8043 /* Convert half to float.  */
8044 static void
8045 do_FCVT_half_to_single (sim_cpu *cpu)
8046 {
8047   unsigned rn = INSTR (9, 5);
8048   unsigned rd = INSTR (4, 0);
8049 
8050   NYI_assert (31, 10, 0x7B890);
8051 
8052   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8053   aarch64_set_FP_float (cpu, rd, (float) aarch64_get_FP_half  (cpu, rn));
8054 }
8055 
8056 /* Convert half to double.  */
8057 static void
8058 do_FCVT_half_to_double (sim_cpu *cpu)
8059 {
8060   unsigned rn = INSTR (9, 5);
8061   unsigned rd = INSTR (4, 0);
8062 
8063   NYI_assert (31, 10, 0x7B8B0);
8064 
8065   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8066   aarch64_set_FP_double (cpu, rd, (double) aarch64_get_FP_half  (cpu, rn));
8067 }
8068 
8069 static void
8070 do_FCVT_single_to_half (sim_cpu *cpu)
8071 {
8072   unsigned rn = INSTR (9, 5);
8073   unsigned rd = INSTR (4, 0);
8074 
8075   NYI_assert (31, 10, 0x788F0);
8076 
8077   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8078   aarch64_set_FP_half (cpu, rd, aarch64_get_FP_float  (cpu, rn));
8079 }
8080 
8081 /* Convert double to half.  */
8082 static void
8083 do_FCVT_double_to_half (sim_cpu *cpu)
8084 {
8085   unsigned rn = INSTR (9, 5);
8086   unsigned rd = INSTR (4, 0);
8087 
8088   NYI_assert (31, 10, 0x798F0);
8089 
8090   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8091   aarch64_set_FP_half (cpu, rd, (float) aarch64_get_FP_double  (cpu, rn));
8092 }
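/* N.B. aarch64_set_FP_half is assumed to take a float, so the
   conversion above narrows in two steps (double -> float -> half).
   Double rounding can in principle differ from a direct double -> half
   conversion for values that fall exactly between two half-precision
   numbers after the first narrowing.  */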
8093 
8094 static void
8095 dexSimpleFPDataProc1Source (sim_cpu *cpu)
8096 {
8097   /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
8098      instr[30]    = 0
8099      instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
8100      instr[28,25] = 1111
8101      instr[24]    = 0
8102      instr[23,22] ==> type : 00 ==> source is single,
8103                              01 ==> source is double
8104                              10 ==> UNALLOC
8105                              11 ==> UNALLOC or source is half
8106      instr[21]    = 1
8107      instr[20,15] ==> opcode : with type 00 or 01
8108                                000000 ==> FMOV, 000001 ==> FABS,
8109                                000010 ==> FNEG, 000011 ==> FSQRT,
8110                                000100 ==> FCVT (to single), 000101 ==> FCVT (to double)
8111                                000110 ==> UNALLOC, 000111 ==> FCVT (to half)
8112                                001000 ==> FRINTN, 001001 ==> FRINTP,
8113                                001010 ==> FRINTM, 001011 ==> FRINTZ,
8114                                001100 ==> FRINTA, 001101 ==> UNALLOC
8115                                001110 ==> FRINTX, 001111 ==> FRINTI
8116                                with type 11
8117                                000100 ==> FCVT (half-to-single)
8118                                000101 ==> FCVT (half-to-double)
8119 			       instr[14,10] = 10000.  */
8120 
8121   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
8122   uint32_t type   = INSTR (23, 22);
8123   uint32_t opcode = INSTR (20, 15);
8124 
8125   if (M_S != 0)
8126     HALT_UNALLOC;
8127 
8128   if (type == 3)
8129     {
8130       if (opcode == 4)
8131 	do_FCVT_half_to_single (cpu);
8132       else if (opcode == 5)
8133 	do_FCVT_half_to_double (cpu);
8134       else
8135 	HALT_UNALLOC;
8136       return;
8137     }
8138 
8139   if (type == 2)
8140     HALT_UNALLOC;
8141 
8142   switch (opcode)
8143     {
8144     case 0:
8145       if (type)
8146 	ffmovd (cpu);
8147       else
8148 	ffmovs (cpu);
8149       return;
8150 
8151     case 1:
8152       if (type)
8153 	fabsd (cpu);
8154       else
8155 	fabss (cpu);
8156       return;
8157 
8158     case 2:
8159       if (type)
8160 	fnegd (cpu);
8161       else
8162 	fnegs (cpu);
8163       return;
8164 
8165     case 3:
8166       if (type)
8167 	fsqrtd (cpu);
8168       else
8169 	fsqrts (cpu);
8170       return;
8171 
8172     case 4:
8173       if (type)
8174 	fcvtds (cpu);
8175       else
8176 	HALT_UNALLOC;
8177       return;
8178 
8179     case 5:
8180       if (type)
8181 	HALT_UNALLOC;
8182       fcvtsd (cpu);
8183       return;
8184 
8185     case 8:		/* FRINTN etc.  */
8186     case 9:
8187     case 10:
8188     case 11:
8189     case 12:
8190     case 14:
8191     case 15:
8192        do_FRINT (cpu);
8193        return;
8194 
8195     case 7:
8196       if (INSTR (22, 22))
8197 	do_FCVT_double_to_half (cpu);
8198       else
8199 	do_FCVT_single_to_half (cpu);
8200       return;
8201 
8202     case 13:
8203       HALT_NYI;
8204 
8205     default:
8206       HALT_UNALLOC;
8207     }
8208 }
8209 
8210 /* 32 bit signed int to float.  */
8211 static void
8212 scvtf32 (sim_cpu *cpu)
8213 {
8214   unsigned rn = INSTR (9, 5);
8215   unsigned sd = INSTR (4, 0);
8216 
8217   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8218   aarch64_set_FP_float
8219     (cpu, sd, (float) aarch64_get_reg_s32 (cpu, rn, NO_SP));
8220 }
8221 
8222 /* signed int to float.  */
8223 static void
8224 scvtf (sim_cpu *cpu)
8225 {
8226   unsigned rn = INSTR (9, 5);
8227   unsigned sd = INSTR (4, 0);
8228 
8229   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8230   aarch64_set_FP_float
8231     (cpu, sd, (float) aarch64_get_reg_s64 (cpu, rn, NO_SP));
8232 }
8233 
8234 /* 32 bit signed int to double.  */
8235 static void
8236 scvtd32 (sim_cpu *cpu)
8237 {
8238   unsigned rn = INSTR (9, 5);
8239   unsigned sd = INSTR (4, 0);
8240 
8241   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8242   aarch64_set_FP_double
8243     (cpu, sd, (double) aarch64_get_reg_s32 (cpu, rn, NO_SP));
8244 }
8245 
8246 /* signed int to double.  */
8247 static void
8248 scvtd (sim_cpu *cpu)
8249 {
8250   unsigned rn = INSTR (9, 5);
8251   unsigned sd = INSTR (4, 0);
8252 
8253   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8254   aarch64_set_FP_double
8255     (cpu, sd, (double) aarch64_get_reg_s64 (cpu, rn, NO_SP));
8256 }
8257 
8258 static const float  FLOAT_INT_MAX   = (float)  INT_MAX;
8259 static const float  FLOAT_INT_MIN   = (float)  INT_MIN;
8260 static const double DOUBLE_INT_MAX  = (double) INT_MAX;
8261 static const double DOUBLE_INT_MIN  = (double) INT_MIN;
8262 static const float  FLOAT_LONG_MAX  = (float)  LONG_MAX;
8263 static const float  FLOAT_LONG_MIN  = (float)  LONG_MIN;
8264 static const double DOUBLE_LONG_MAX = (double) LONG_MAX;
8265 static const double DOUBLE_LONG_MIN = (double) LONG_MIN;
8266 
8267 #define UINT_MIN 0
8268 #define ULONG_MIN 0
8269 static const float  FLOAT_UINT_MAX   = (float)  UINT_MAX;
8270 static const float  FLOAT_UINT_MIN   = (float)  UINT_MIN;
8271 static const double DOUBLE_UINT_MAX  = (double) UINT_MAX;
8272 static const double DOUBLE_UINT_MIN  = (double) UINT_MIN;
8273 static const float  FLOAT_ULONG_MAX  = (float)  ULONG_MAX;
8274 static const float  FLOAT_ULONG_MIN  = (float)  ULONG_MIN;
8275 static const double DOUBLE_ULONG_MAX = (double) ULONG_MAX;
8276 static const double DOUBLE_ULONG_MIN = (double) ULONG_MIN;
8277 
8278 /* Check for FP exception conditions:
8279      NaN raises IO
8280      Infinity raises IO
8281      Out of Range raises IO and IX and saturates value
8282      Denormal raises ID and IX and sets to zero.  */
8283 #define RAISE_EXCEPTIONS(F, VALUE, FTYPE, ITYPE)	\
8284   do							\
8285     {							\
8286       switch (fpclassify (F))				\
8287 	{						\
8288 	case FP_INFINITE:				\
8289 	case FP_NAN:					\
8290 	  aarch64_set_FPSR (cpu, IO);			\
8291 	  if (signbit (F))				\
8292 	    VALUE = ITYPE##_MIN;			\
8293 	  else						\
8294 	    VALUE = ITYPE##_MAX;			\
8295 	  break;					\
8296 							\
8297 	case FP_NORMAL:					\
8298 	  if (F >= FTYPE##_##ITYPE##_MAX)		\
8299 	    {						\
8300 	      aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX);	\
8301 	      VALUE = ITYPE##_MAX;			\
8302 	    }						\
8303 	  else if (F <= FTYPE##_##ITYPE##_MIN)		\
8304 	    {						\
8305 	      aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX);	\
8306 	      VALUE = ITYPE##_MIN;			\
8307 	    }						\
8308 	  break;					\
8309 							\
8310 	case FP_SUBNORMAL:				\
8311 	  aarch64_set_FPSR_bits (cpu, IO | IX | ID, IX | ID);	\
8312 	  VALUE = 0;					\
8313 	  break;					\
8314 							\
8315 	default:					\
8316 	case FP_ZERO:					\
8317 	  VALUE = 0;					\
8318 	  break;					\
8319 	}						\
8320     }							\
8321   while (0)
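/* A worked example: converting f = 3.0e9f to a signed 32-bit integer
   takes the FP_NORMAL arm above with F >= FLOAT_INT_MAX, so IO and IX
   are raised and the result saturates to INT_MAX, while f = -0.0f
   classifies as FP_ZERO and simply yields 0.  */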
8322 
8323 /* 32 bit convert float to signed int truncate towards zero.  */
8324 static void
8325 fcvtszs32 (sim_cpu *cpu)
8326 {
8327   unsigned sn = INSTR (9, 5);
8328   unsigned rd = INSTR (4, 0);
8329   /* TODO : check that this rounds toward zero.  */
8330   float   f = aarch64_get_FP_float (cpu, sn);
8331   int32_t value = (int32_t) f;
8332 
8333   RAISE_EXCEPTIONS (f, value, FLOAT, INT);
8334 
8335   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8336   /* Avoid sign extension to 64 bit.  */
8337   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
8338 }
8339 
8340 /* 64 bit convert float to signed int truncate towards zero.  */
8341 static void
8342 fcvtszs (sim_cpu *cpu)
8343 {
8344   unsigned sn = INSTR (9, 5);
8345   unsigned rd = INSTR (4, 0);
8346   float f = aarch64_get_FP_float (cpu, sn);
8347   int64_t value = (int64_t) f;
8348 
8349   RAISE_EXCEPTIONS (f, value, FLOAT, LONG);
8350 
8351   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8352   aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
8353 }
8354 
8355 /* 32 bit convert double to signed int truncate towards zero.  */
8356 static void
8357 fcvtszd32 (sim_cpu *cpu)
8358 {
8359   unsigned sn = INSTR (9, 5);
8360   unsigned rd = INSTR (4, 0);
8361   /* TODO : check that this rounds toward zero.  */
8362   double   d = aarch64_get_FP_double (cpu, sn);
8363   int32_t  value = (int32_t) d;
8364 
8365   RAISE_EXCEPTIONS (d, value, DOUBLE, INT);
8366 
8367   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8368   /* Avoid sign extension to 64 bit.  */
8369   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
8370 }
8371 
8372 /* 64 bit convert double to signed int truncate towards zero.  */
8373 static void
8374 fcvtszd (sim_cpu *cpu)
8375 {
8376   unsigned sn = INSTR (9, 5);
8377   unsigned rd = INSTR (4, 0);
8378   /* TODO : check that this rounds toward zero.  */
8379   double  d = aarch64_get_FP_double (cpu, sn);
8380   int64_t value;
8381 
8382   value = (int64_t) d;
8383 
8384   RAISE_EXCEPTIONS (d, value, DOUBLE, LONG);
8385 
8386   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8387   aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
8388 }
8389 
8390 static void
8391 do_fcvtzu (sim_cpu *cpu)
8392 {
8393   /* instr[31]    = size: 32-bit (0), 64-bit (1)
8394      instr[30,23] = 00111100
8395      instr[22]    = type: single (0)/ double (1)
8396      instr[21]    = 1 ==> direct conversion, 0 ==> fixed-point (NYI)
8397      instr[20,16] = 11001
8398      instr[15,10] = precision
8399      instr[9,5]   = Rs
8400      instr[4,0]   = Rd.  */
8401 
8402   unsigned rs = INSTR (9, 5);
8403   unsigned rd = INSTR (4, 0);
8404 
8405   NYI_assert (30, 23, 0x3C);
8406   NYI_assert (20, 16, 0x19);
8407 
8408   if (INSTR (21, 21) != 1)
8409     /* Convert to fixed point.  */
8410     HALT_NYI;
8411 
8412   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8413   if (INSTR (31, 31))
8414     {
8415       /* Convert to unsigned 64-bit integer.  */
8416       if (INSTR (22, 22))
8417 	{
8418 	  double  d = aarch64_get_FP_double (cpu, rs);
8419 	  uint64_t value = (uint64_t) d;
8420 
8421 	  /* Do not raise an exception if we have reached ULONG_MAX.  */
8422 	  if (value != (1ULL << 63))
8423 	    RAISE_EXCEPTIONS (d, value, DOUBLE, ULONG);
8424 
8425 	  aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8426 	}
8427       else
8428 	{
8429 	  float  f = aarch64_get_FP_float (cpu, rs);
8430 	  uint64_t value = (uint64_t) f;
8431 
8432 	  /* Do not raise an exception if we have reached ULONG_MAX.  */
8433 	  if (value != (1ULL << 63))
8434 	    RAISE_EXCEPTIONS (f, value, FLOAT, ULONG);
8435 
8436 	  aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8437 	}
8438     }
8439   else
8440     {
8441       uint32_t value;
8442 
8443       /* Convert to unsigned 32-bit integer.  */
8444       if (INSTR (22, 22))
8445 	{
8446 	  double  d = aarch64_get_FP_double (cpu, rs);
8447 
8448 	  value = (uint32_t) d;
8449 	  /* Do not raise an exception if we have reached UINT_MAX.  */
8450 	  if (value != (1UL << 31))
8451 	    RAISE_EXCEPTIONS (d, value, DOUBLE, UINT);
8452 	}
8453       else
8454 	{
8455 	  float  f = aarch64_get_FP_float (cpu, rs);
8456 
8457 	  value = (uint32_t) f;
8458 	  /* Do not raise an exception if we have reached UINT_MAX.  */
8459 	  if (value != (1UL << 31))
8460 	    RAISE_EXCEPTIONS (f, value, FLOAT, UINT);
8461 	}
8462 
8463       aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8464     }
8465 }
8466 
8467 static void
8468 do_UCVTF (sim_cpu *cpu)
8469 {
8470   /* instr[31]    = size: 32-bit (0), 64-bit (1)
8471      instr[30,23] = 001 1110 0
8472      instr[22]    = type: single (0)/ double (1)
8473      instr[21]    = 1 ==> direct conversion, 0 ==> fixed-point (NYI)
8474      instr[20,16] = 0 0011
8475      instr[15,10] = precision
8476      instr[9,5]   = Rs
8477      instr[4,0]   = Rd.  */
8478 
8479   unsigned rs = INSTR (9, 5);
8480   unsigned rd = INSTR (4, 0);
8481 
8482   NYI_assert (30, 23, 0x3C);
8483   NYI_assert (20, 16, 0x03);
8484 
8485   if (INSTR (21, 21) != 1)
8486     HALT_NYI;
8487 
8488   /* FIXME: Add exception raising.  */
8489   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8490   if (INSTR (31, 31))
8491     {
8492       uint64_t value = aarch64_get_reg_u64 (cpu, rs, NO_SP);
8493 
8494       if (INSTR (22, 22))
8495 	aarch64_set_FP_double (cpu, rd, (double) value);
8496       else
8497 	aarch64_set_FP_float (cpu, rd, (float) value);
8498     }
8499   else
8500     {
8501       uint32_t value =  aarch64_get_reg_u32 (cpu, rs, NO_SP);
8502 
8503       if (INSTR (22, 22))
8504 	aarch64_set_FP_double (cpu, rd, (double) value);
8505       else
8506 	aarch64_set_FP_float (cpu, rd, (float) value);
8507     }
8508 }
8509 
8510 static void
8511 float_vector_move (sim_cpu *cpu)
8512 {
8513   /* instr[31,17] == 100 1111 0101 0111
8514      instr[16]    ==> direction 0=> to GR, 1=> from GR
8515      instr[15,10] = 00 0000
8516      instr[9,5]   ==> source
8517      instr[4,0]   ==> dest.  */
8518 
8519   unsigned rn = INSTR (9, 5);
8520   unsigned rd = INSTR (4, 0);
8521 
8522   NYI_assert (31, 17, 0x4F57);
8523 
8524   if (INSTR (15, 10) != 0)
8525     HALT_UNALLOC;
8526 
8527   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8528   if (INSTR (16, 16))
8529     aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_reg_u64 (cpu, rn, NO_SP));
8530   else
8531     aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, rn, 1));
8532 }
8533 
8534 static void
8535 dexSimpleFPIntegerConvert (sim_cpu *cpu)
8536 {
8537   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
8538      instr[30]    = 0
8539      instr[29]    = S :  0 ==> OK, 1 ==> UNALLOC
8540      instr[28,25] = 1111
8541      instr[24]    = 0
8542      instr[23,22] = type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
8543      instr[21]    = 1
8544      instr[20,19] = rmode
8545      instr[18,16] = opcode
8546      instr[15,10] = 10 0000  */
8547 
8548   uint32_t rmode_opcode;
8549   uint32_t size_type;
8550   uint32_t type;
8551   uint32_t size;
8552   uint32_t S;
8553 
8554   if (INSTR (31, 17) == 0x4F57)
8555     {
8556       float_vector_move (cpu);
8557       return;
8558     }
8559 
8560   size = INSTR (31, 31);
8561   S = INSTR (29, 29);
8562   if (S != 0)
8563     HALT_UNALLOC;
8564 
8565   type = INSTR (23, 22);
8566   if (type > 1)
8567     HALT_UNALLOC;
8568 
8569   rmode_opcode = INSTR (20, 16);
8570   size_type = (size << 1) | type; /* 0==32f, 1==32d, 2==64f, 3==64d.  */
8571 
8572   switch (rmode_opcode)
8573     {
8574     case 2:			/* SCVTF.  */
8575       switch (size_type)
8576 	{
8577 	case 0: scvtf32 (cpu); return;
8578 	case 1: scvtd32 (cpu); return;
8579 	case 2: scvtf (cpu); return;
8580 	case 3: scvtd (cpu); return;
8581 	default: HALT_UNALLOC;
8582 	}
8583 
8584     case 6:			/* FMOV GR, Vec.  */
8585       switch (size_type)
8586 	{
8587 	case 0:  gfmovs (cpu); return;
8588 	case 3:  gfmovd (cpu); return;
8589 	default: HALT_UNALLOC;
8590 	}
8591 
8592     case 7:			/* FMOV vec, GR.  */
8593       switch (size_type)
8594 	{
8595 	case 0:  fgmovs (cpu); return;
8596 	case 3:  fgmovd (cpu); return;
8597 	default: HALT_UNALLOC;
8598 	}
8599 
8600     case 24:			/* FCVTZS.  */
8601       switch (size_type)
8602 	{
8603 	case 0: fcvtszs32 (cpu); return;
8604 	case 1: fcvtszd32 (cpu); return;
8605 	case 2: fcvtszs (cpu); return;
8606 	case 3: fcvtszd (cpu); return;
8607 	default: HALT_UNALLOC;
8608 	}
8609 
8610     case 25: do_fcvtzu (cpu); return;
8611     case 3:  do_UCVTF (cpu); return;
8612 
8613     case 0:	/* FCVTNS.  */
8614     case 1:	/* FCVTNU.  */
8615     case 4:	/* FCVTAS.  */
8616     case 5:	/* FCVTAU.  */
8617     case 8:	/* FCVTPS.  */
8618     case 9:	/* FCVTPU.  */
8619     case 16:	/* FCVTMS.  */
8620     case 17:	/* FCVTMU.  */
8621     default:
8622       HALT_NYI;
8623     }
8624 }
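/* A worked example of the dispatch above: SCVTF D0, X1 has size = 1 and
   type = 01, so size_type = 3 and scvtd converts the 64-bit integer to
   a double, while FMOV W0, S1 has size = 0, type = 00 and
   rmode:opcode = 6, selecting gfmovs.  */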
8625 
8626 static void
8627 set_flags_for_float_compare (sim_cpu *cpu, float fvalue1, float fvalue2)
8628 {
8629   uint32_t flags;
8630 
8631   /* FIXME: Add exception raising.  */
8632   if (isnan (fvalue1) || isnan (fvalue2))
8633     flags = C|V;
8634   else if (isinf (fvalue1) && isinf (fvalue2))
8635     {
8636       /* Subtracting two infinities may give a NaN.  We only need to compare
8637 	 the signs, which we can get from isinf (glibc returns +1/-1).  */
8638       int result = isinf (fvalue1) - isinf (fvalue2);
8639 
8640       if (result == 0)
8641 	flags = Z|C;
8642       else if (result < 0)
8643 	flags = N;
8644       else /* (result > 0).  */
8645 	flags = C;
8646     }
8647   else
8648     {
8649       float result = fvalue1 - fvalue2;
8650 
8651       if (result == 0.0)
8652 	flags = Z|C;
8653       else if (result < 0)
8654 	flags = N;
8655       else /* (result > 0).  */
8656 	flags = C;
8657     }
8658 
8659   aarch64_set_CPSR (cpu, flags);
8660 }
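/* The NZCV values set above follow the AArch64 FCMP convention:
   equal -> Z|C (0110), less than -> N (1000), greater than -> C (0010),
   unordered -> C|V (0011).  */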
8661 
8662 static void
8663 fcmps (sim_cpu *cpu)
8664 {
8665   unsigned sm = INSTR (20, 16);
8666   unsigned sn = INSTR ( 9,  5);
8667 
8668   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8669   float fvalue2 = aarch64_get_FP_float (cpu, sm);
8670 
8671   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8672   set_flags_for_float_compare (cpu, fvalue1, fvalue2);
8673 }
8674 
8675 /* Float compare to zero -- Invalid Operation exception
8676    only on signaling NaNs.  */
8677 static void
8678 fcmpzs (sim_cpu *cpu)
8679 {
8680   unsigned sn = INSTR ( 9,  5);
8681   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8682 
8683   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8684   set_flags_for_float_compare (cpu, fvalue1, 0.0f);
8685 }
8686 
8687 /* Float compare -- Invalid Operation exception on all NaNs.  */
8688 static void
8689 fcmpes (sim_cpu *cpu)
8690 {
8691   unsigned sm = INSTR (20, 16);
8692   unsigned sn = INSTR ( 9,  5);
8693 
8694   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8695   float fvalue2 = aarch64_get_FP_float (cpu, sm);
8696 
8697   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8698   set_flags_for_float_compare (cpu, fvalue1, fvalue2);
8699 }
8700 
8701 /* Float compare to zero -- Invalid Operation exception on all NaNs.  */
8702 static void
8703 fcmpzes (sim_cpu *cpu)
8704 {
8705   unsigned sn = INSTR ( 9,  5);
8706   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8707 
8708   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8709   set_flags_for_float_compare (cpu, fvalue1, 0.0f);
8710 }
8711 
8712 static void
8713 set_flags_for_double_compare (sim_cpu *cpu, double dval1, double dval2)
8714 {
8715   uint32_t flags;
8716 
8717   /* FIXME: Add exception raising.  */
8718   if (isnan (dval1) || isnan (dval2))
8719     flags = C|V;
8720   else if (isinf (dval1) && isinf (dval2))
8721     {
8722       /* Subtracting two infinities may give a NaN.  We only need to compare
8723 	 the signs, which we can get from isinf (glibc returns +1/-1).  */
8724       int result = isinf (dval1) - isinf (dval2);
8725 
8726       if (result == 0)
8727 	flags = Z|C;
8728       else if (result < 0)
8729 	flags = N;
8730       else /* (result > 0).  */
8731 	flags = C;
8732     }
8733   else
8734     {
8735       double result = dval1 - dval2;
8736 
8737       if (result == 0.0)
8738 	flags = Z|C;
8739       else if (result < 0)
8740 	flags = N;
8741       else /* (result > 0).  */
8742 	flags = C;
8743     }
8744 
8745   aarch64_set_CPSR (cpu, flags);
8746 }
8747 
8748 /* Double compare -- Invalid Operation exception only on signaling NaNs.  */
8749 static void
8750 fcmpd (sim_cpu *cpu)
8751 {
8752   unsigned sm = INSTR (20, 16);
8753   unsigned sn = INSTR ( 9,  5);
8754 
8755   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8756   double dvalue2 = aarch64_get_FP_double (cpu, sm);
8757 
8758   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8759   set_flags_for_double_compare (cpu, dvalue1, dvalue2);
8760 }
8761 
8762 /* Double compare to zero -- Invalid Operation exception
8763    only on signaling NaNs.  */
8764 static void
8765 fcmpzd (sim_cpu *cpu)
8766 {
8767   unsigned sn = INSTR ( 9,  5);
8768   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8769 
8770   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8771   set_flags_for_double_compare (cpu, dvalue1, 0.0);
8772 }
8773 
8774 /* Double compare -- Invalid Operation exception on all NaNs.  */
8775 static void
8776 fcmped (sim_cpu *cpu)
8777 {
8778   unsigned sm = INSTR (20, 16);
8779   unsigned sn = INSTR ( 9,  5);
8780 
8781   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8782   double dvalue2 = aarch64_get_FP_double (cpu, sm);
8783 
8784   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8785   set_flags_for_double_compare (cpu, dvalue1, dvalue2);
8786 }
8787 
8788 /* Double compare to zero -- Invalid Operation exception on all NaNs.  */
8789 static void
8790 fcmpzed (sim_cpu *cpu)
8791 {
8792   unsigned sn = INSTR ( 9,  5);
8793   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8794 
8795   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8796   set_flags_for_double_compare (cpu, dvalue1, 0.0);
8797 }
8798 
8799 static void
8800 dexSimpleFPCompare (sim_cpu *cpu)
8801 {
8802   /* assert instr[28,25] == 1111
8803      instr[30] = 0, instr[24] = 0, instr[21] = 1, instr[13,10] = 1000
8804      instr[31] = M : 0 ==> OK, 1 ==> UNALLOC
8805      instr[29] ==> S :  0 ==> OK, 1 ==> UNALLOC
8806      instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
8807      instr[15,14] ==> op : 00 ==> OK, ow ==> UNALLOC
8808      instr[4,0] ==> opcode2 : 00000 ==> FCMP, 10000 ==> FCMPE,
8809                               01000 ==> FCMPZ, 11000 ==> FCMPEZ,
8810                               ow ==> UNALLOC  */
8811   uint32_t dispatch;
8812   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
8813   uint32_t type = INSTR (23, 22);
8814   uint32_t op = INSTR (15, 14);
8815   uint32_t op2_2_0 = INSTR (2, 0);
8816 
8817   if (op2_2_0 != 0)
8818     HALT_UNALLOC;
8819 
8820   if (M_S != 0)
8821     HALT_UNALLOC;
8822 
8823   if (type > 1)
8824     HALT_UNALLOC;
8825 
8826   if (op != 0)
8827     HALT_UNALLOC;
8828 
8829   /* dispatch on type and top 2 bits of opcode.  */
8830   dispatch = (type << 2) | INSTR (4, 3);
8831 
8832   switch (dispatch)
8833     {
8834     case 0: fcmps (cpu); return;
8835     case 1: fcmpzs (cpu); return;
8836     case 2: fcmpes (cpu); return;
8837     case 3: fcmpzes (cpu); return;
8838     case 4: fcmpd (cpu); return;
8839     case 5: fcmpzd (cpu); return;
8840     case 6: fcmped (cpu); return;
8841     case 7: fcmpzed (cpu); return;
8842     }
8843 }
8844 
8845 static void
8846 do_scalar_FADDP (sim_cpu *cpu)
8847 {
8848   /* instr [31,23] = 0111 1110 0
8849      instr [22]    = single(0)/double(1)
8850      instr [21,10] = 11 0000 1101 10
8851      instr [9,5]   = Fn
8852      instr [4,0]   = Fd.  */
8853 
8854   unsigned Fn = INSTR (9, 5);
8855   unsigned Fd = INSTR (4, 0);
8856 
8857   NYI_assert (31, 23, 0x0FC);
8858   NYI_assert (21, 10, 0xC36);
8859 
8860   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8861   if (INSTR (22, 22))
8862     {
8863       double val1 = aarch64_get_vec_double (cpu, Fn, 0);
8864       double val2 = aarch64_get_vec_double (cpu, Fn, 1);
8865 
8866       aarch64_set_FP_double (cpu, Fd, val1 + val2);
8867     }
8868   else
8869     {
8870       float val1 = aarch64_get_vec_float (cpu, Fn, 0);
8871       float val2 = aarch64_get_vec_float (cpu, Fn, 1);
8872 
8873       aarch64_set_FP_float (cpu, Fd, val1 + val2);
8874     }
8875 }
8876 
8877 /* Floating point absolute difference.  */
8878 
8879 static void
8880 do_scalar_FABD (sim_cpu *cpu)
8881 {
8882   /* instr [31,23] = 0111 1110 1
8883      instr [22]    = float(0)/double(1)
8884      instr [21]    = 1
8885      instr [20,16] = Rm
8886      instr [15,10] = 1101 01
8887      instr [9, 5]  = Rn
8888      instr [4, 0]  = Rd.  */
8889 
8890   unsigned rm = INSTR (20, 16);
8891   unsigned rn = INSTR (9, 5);
8892   unsigned rd = INSTR (4, 0);
8893 
8894   NYI_assert (31, 23, 0x0FD);
8895   NYI_assert (21, 21, 1);
8896   NYI_assert (15, 10, 0x35);
8897 
8898   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8899   if (INSTR (22, 22))
8900     aarch64_set_FP_double (cpu, rd,
8901 			   fabs (aarch64_get_FP_double (cpu, rn)
8902 				 - aarch64_get_FP_double (cpu, rm)));
8903   else
8904     aarch64_set_FP_float (cpu, rd,
8905 			  fabsf (aarch64_get_FP_float (cpu, rn)
8906 				 - aarch64_get_FP_float (cpu, rm)));
8907 }
8908 
8909 static void
8910 do_scalar_CMGT (sim_cpu *cpu)
8911 {
8912   /* instr [31,21] = 0101 1110 111
8913      instr [20,16] = Rm
8914      instr [15,10] = 00 1101
8915      instr [9, 5]  = Rn
8916      instr [4, 0]  = Rd.  CMGT is a signed compare; CMHI is the unsigned form.  */
8917 
8918   unsigned rm = INSTR (20, 16);
8919   unsigned rn = INSTR (9, 5);
8920   unsigned rd = INSTR (4, 0);
8921 
8922   NYI_assert (31, 21, 0x2F7);
8923   NYI_assert (15, 10, 0x0D);
8924 
8925   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8926   aarch64_set_vec_u64 (cpu, rd, 0,
8927 		       aarch64_get_vec_s64 (cpu, rn, 0) >
8928 		       aarch64_get_vec_s64 (cpu, rm, 0) ? -1L : 0L);
8929 }
8930 
8931 static void
8932 do_scalar_USHR (sim_cpu *cpu)
8933 {
8934   /* instr [31,23] = 0111 1111 0
8935      instr [22,16] = shift amount
8936      instr [15,10] = 0000 01
8937      instr [9, 5]  = Rn
8938      instr [4, 0]  = Rd.  */
8939 
8940   unsigned amount = 128 - INSTR (22, 16);
8941   unsigned rn = INSTR (9, 5);
8942   unsigned rd = INSTR (4, 0);
8943 
8944   NYI_assert (31, 23, 0x0FE);
8945   NYI_assert (15, 10, 0x01);
8946 
8947   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8948   aarch64_set_vec_u64 (cpu, rd, 0, amount < 64
8949 		       ? aarch64_get_vec_u64 (cpu, rn, 0) >> amount : 0);
8950 }
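/* A worked example: for a 64-bit USHR the field instr[22,16] holds
   128 - shift, so an encoded value of 0x7F requests a shift of 1 and
   0x41 a shift of 63; a shift of 64 (encoding 0x40) is architecturally
   valid but would be undefined in C, hence the explicit test above.  */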
8951 
8952 static void
8953 do_scalar_SSHL (sim_cpu *cpu)
8954 {
8955   /* instr [31,21] = 0101 1110 111
8956      instr [20,16] = Rm
8957      instr [15,10] = 0100 01
8958      instr [9, 5]  = Rn
8959      instr [4, 0]  = Rd.  */
8960 
8961   unsigned rm = INSTR (20, 16);
8962   unsigned rn = INSTR (9, 5);
8963   unsigned rd = INSTR (4, 0);
8964   signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);
8965 
8966   NYI_assert (31, 21, 0x2F7);
8967   NYI_assert (15, 10, 0x11);
8968 
8969   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8970   if (shift >= 0)
8971     aarch64_set_vec_s64 (cpu, rd, 0,
8972 			 aarch64_get_vec_s64 (cpu, rn, 0) << shift);
8973   else
8974     aarch64_set_vec_s64 (cpu, rd, 0,
8975 			 aarch64_get_vec_s64 (cpu, rn, 0) >> - shift);
8976 }
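/* SSHL and USHL take their shift count from the lowest signed byte of
   Rm, and a negative count shifts right instead; a bottom byte of 0xF8
   (-8), for example, shifts Rn right by eight bits.  */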
8977 
8978 /* Floating point scalar compare greater than or equal to 0.  */
8979 static void
8980 do_scalar_FCMGE_zero (sim_cpu *cpu)
8981 {
8982   /* instr [31,23] = 0111 1110 1
8983      instr [22,22] = size
8984      instr [21,16] = 1000 00
8985      instr [15,10] = 1100 10
8986      instr [9, 5]  = Rn
8987      instr [4, 0]  = Rd.  */
8988 
8989   unsigned size = INSTR (22, 22);
8990   unsigned rn = INSTR (9, 5);
8991   unsigned rd = INSTR (4, 0);
8992 
8993   NYI_assert (31, 23, 0x0FD);
8994   NYI_assert (21, 16, 0x20);
8995   NYI_assert (15, 10, 0x32);
8996 
8997   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8998   if (size)
8999     aarch64_set_vec_u64 (cpu, rd, 0,
9000 			 aarch64_get_vec_double (cpu, rn, 0) >= 0.0 ? -1 : 0);
9001   else
9002     aarch64_set_vec_u32 (cpu, rd, 0,
9003 			 aarch64_get_vec_float (cpu, rn, 0) >= 0.0 ? -1 : 0);
9004 }
9005 
9006 /* Floating point scalar compare less than or equal to 0.  */
9007 static void
9008 do_scalar_FCMLE_zero (sim_cpu *cpu)
9009 {
9010   /* instr [31,23] = 0111 1110 1
9011      instr [22,22] = size
9012      instr [21,16] = 1000 00
9013      instr [15,10] = 1101 10
9014      instr [9, 5]  = Rn
9015      instr [4, 0]  = Rd.  */
9016 
9017   unsigned size = INSTR (22, 22);
9018   unsigned rn = INSTR (9, 5);
9019   unsigned rd = INSTR (4, 0);
9020 
9021   NYI_assert (31, 23, 0x0FD);
9022   NYI_assert (21, 16, 0x20);
9023   NYI_assert (15, 10, 0x36);
9024 
9025   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9026   if (size)
9027     aarch64_set_vec_u64 (cpu, rd, 0,
9028 			 aarch64_get_vec_double (cpu, rn, 0) <= 0.0 ? -1 : 0);
9029   else
9030     aarch64_set_vec_u32 (cpu, rd, 0,
9031 			 aarch64_get_vec_float (cpu, rn, 0) <= 0.0 ? -1 : 0);
9032 }
9033 
9034 /* Floating point scalar compare greater than 0.  */
9035 static void
9036 do_scalar_FCMGT_zero (sim_cpu *cpu)
9037 {
9038   /* instr [31,23] = 0101 1110 1
9039      instr [22,22] = size
9040      instr [21,16] = 1000 00
9041      instr [15,10] = 1100 10
9042      instr [9, 5]  = Rn
9043      instr [4, 0]  = Rd.  */
9044 
9045   unsigned size = INSTR (22, 22);
9046   unsigned rn = INSTR (9, 5);
9047   unsigned rd = INSTR (4, 0);
9048 
9049   NYI_assert (31, 23, 0x0BD);
9050   NYI_assert (21, 16, 0x20);
9051   NYI_assert (15, 10, 0x32);
9052 
9053   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9054   if (size)
9055     aarch64_set_vec_u64 (cpu, rd, 0,
9056 			 aarch64_get_vec_double (cpu, rn, 0) > 0.0 ? -1 : 0);
9057   else
9058     aarch64_set_vec_u32 (cpu, rd, 0,
9059 			 aarch64_get_vec_float (cpu, rn, 0) > 0.0 ? -1 : 0);
9060 }
9061 
9062 /* Floating point scalar compare equal to 0.  */
9063 static void
9064 do_scalar_FCMEQ_zero (sim_cpu *cpu)
9065 {
9066   /* instr [31,23] = 0101 1110 1
9067      instr [22,22] = size
9068      instr [21,16] = 1000 00
9069      instr [15,10] = 1101 10
9070      instr [9, 5]  = Rn
9071      instr [4, 0]  = Rd.  */
9072 
9073   unsigned size = INSTR (22, 22);
9074   unsigned rn = INSTR (9, 5);
9075   unsigned rd = INSTR (4, 0);
9076 
9077   NYI_assert (31, 23, 0x0BD);
9078   NYI_assert (21, 16, 0x20);
9079   NYI_assert (15, 10, 0x36);
9080 
9081   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9082   if (size)
9083     aarch64_set_vec_u64 (cpu, rd, 0,
9084 			 aarch64_get_vec_double (cpu, rn, 0) == 0.0 ? -1 : 0);
9085   else
9086     aarch64_set_vec_u32 (cpu, rd, 0,
9087 			 aarch64_get_vec_float (cpu, rn, 0) == 0.0 ? -1 : 0);
9088 }
9089 
9090 /* Floating point scalar compare less than 0.  */
9091 static void
9092 do_scalar_FCMLT_zero (sim_cpu *cpu)
9093 {
9094   /* instr [31,23] = 0101 1110 1
9095      instr [22,22] = size
9096      instr [21,16] = 1000 00
9097      instr [15,10] = 1110 10
9098      instr [9, 5]  = Rn
9099      instr [4, 0]  = Rd.  */
9100 
9101   unsigned size = INSTR (22, 22);
9102   unsigned rn = INSTR (9, 5);
9103   unsigned rd = INSTR (4, 0);
9104 
9105   NYI_assert (31, 23, 0x0BD);
9106   NYI_assert (21, 16, 0x20);
9107   NYI_assert (15, 10, 0x3A);
9108 
9109   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9110   if (size)
9111     aarch64_set_vec_u64 (cpu, rd, 0,
9112 			 aarch64_get_vec_double (cpu, rn, 0) < 0.0 ? -1 : 0);
9113   else
9114     aarch64_set_vec_u32 (cpu, rd, 0,
9115 			 aarch64_get_vec_float (cpu, rn, 0) < 0.0 ? -1 : 0);
9116 }
9117 
9118 static void
9119 do_scalar_shift (sim_cpu *cpu)
9120 {
9121   /* instr [31,23] = 0101 1111 0
9122      instr [22,16] = shift amount
9123      instr [15,10] = 0101 01   [SHL]
9124      instr [15,10] = 0000 01   [SSHR]
9125      instr [9, 5]  = Rn
9126      instr [4, 0]  = Rd.  */
9127 
9128   unsigned rn = INSTR (9, 5);
9129   unsigned rd = INSTR (4, 0);
9130   unsigned amount;
9131 
9132   NYI_assert (31, 23, 0x0BE);
9133 
9134   if (INSTR (22, 22) == 0)
9135     HALT_UNALLOC;
9136 
9137   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9138   switch (INSTR (15, 10))
9139     {
9140     case 0x01: /* SSHR */
9141       amount = 128 - INSTR (22, 16);
9142       aarch64_set_vec_s64 (cpu, rd, 0, aarch64_get_vec_s64 (cpu, rn, 0)
9143 			   >> (amount < 64 ? amount : 63));
9144       return;
9145     case 0x15: /* SHL */
9146       amount = INSTR (22, 16) - 64;
9147       aarch64_set_vec_u64 (cpu, rd, 0,
9148 			   aarch64_get_vec_u64 (cpu, rn, 0) << amount);
9149       return;
9150     default:
9151       HALT_NYI;
9152     }
9153 }
9154 
9155 /* FCMEQ FCMGT FCMGE.  */
9156 static void
9157 do_scalar_FCM (sim_cpu *cpu)
9158 {
9159   /* instr [31,30] = 01
9160      instr [29]    = U
9161      instr [28,24] = 1 1110
9162      instr [23]    = E
9163      instr [22]    = size
9164      instr [21]    = 1
9165      instr [20,16] = Rm
9166      instr [15,12] = 1110
9167      instr [11]    = AC
9168      instr [10]    = 1
9169      instr [9, 5]  = Rn
9170      instr [4, 0]  = Rd.  */
9171 
9172   unsigned rm = INSTR (20, 16);
9173   unsigned rn = INSTR (9, 5);
9174   unsigned rd = INSTR (4, 0);
9175   unsigned EUac = (INSTR (23, 23) << 2) | (INSTR (29, 29) << 1) | INSTR (11, 11);
9176   unsigned result;
9177   float val1;
9178   float val2;
9179 
9180   NYI_assert (31, 30, 1);
9181   NYI_assert (28, 24, 0x1E);
9182   NYI_assert (21, 21, 1);
9183   NYI_assert (15, 12, 0xE);
9184   NYI_assert (10, 10, 1);
9185 
9186   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9187   if (INSTR (22, 22))
9188     {
9189       double dval1 = aarch64_get_FP_double (cpu, rn);
9190       double dval2 = aarch64_get_FP_double (cpu, rm);
9191 
9192       switch (EUac)
9193 	{
9194 	case 0: /* 000 */
9195 	  result = dval1 == dval2;
9196 	  break;
9197 
9198 	case 3: /* 011 */
9199 	  dval1 = fabs (dval1);
9200 	  dval2 = fabs (dval2);
9201 	  ATTRIBUTE_FALLTHROUGH;
9202 	case 2: /* 010 */
9203 	  result = dval1 >= dval2;
9204 	  break;
9205 
9206 	case 7: /* 111 */
9207 	  dval1 = fabs (dval1);
9208 	  dval2 = fabs (dval2);
9209 	  ATTRIBUTE_FALLTHROUGH;
9210 	case 6: /* 110 */
9211 	  result = dval1 > dval2;
9212 	  break;
9213 
9214 	default:
9215 	  HALT_UNALLOC;
9216 	}
9217 
9218       aarch64_set_vec_u64 (cpu, rd, 0, result ? -1ULL : 0);
9219       return;
9220     }
9221 
9222   val1 = aarch64_get_FP_float (cpu, rn);
9223   val2 = aarch64_get_FP_float (cpu, rm);
9224 
9225   switch (EUac)
9226     {
9227     case 0: /* 000 */
9228       result = val1 == val2;
9229       break;
9230 
9231     case 3: /* 011 */
9232       val1 = fabsf (val1);
9233       val2 = fabsf (val2);
9234       ATTRIBUTE_FALLTHROUGH;
9235     case 2: /* 010 */
9236       result = val1 >= val2;
9237       break;
9238 
9239     case 7: /* 111 */
9240       val1 = fabsf (val1);
9241       val2 = fabsf (val2);
9242       ATTRIBUTE_FALLTHROUGH;
9243     case 6: /* 110 */
9244       result = val1 > val2;
9245       break;
9246 
9247     default:
9248       HALT_UNALLOC;
9249     }
9250 
9251   aarch64_set_vec_u32 (cpu, rd, 0, result ? -1 : 0);
9252 }
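/* A worked example of the E:U:AC dispatch above: 0 selects FCMEQ,
   2 FCMGE, 3 FACGE, 6 FCMGT and 7 FACGT; FACGE S0, S1, S2, for
   instance, sets S0 to all ones when fabsf (S1) >= fabsf (S2) and to
   zero otherwise.  */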
9253 
9254 /* An alias of DUP.  */
9255 static void
9256 do_scalar_MOV (sim_cpu *cpu)
9257 {
9258   /* instr [31,21] = 0101 1110 000
9259      instr [20,16] = imm5
9260      instr [15,10] = 0000 01
9261      instr [9, 5]  = Rn
9262      instr [4, 0]  = Rd.  */
9263 
9264   unsigned rn = INSTR (9, 5);
9265   unsigned rd = INSTR (4, 0);
9266   unsigned index;
9267 
9268   NYI_assert (31, 21, 0x2F0);
9269   NYI_assert (15, 10, 0x01);
9270 
9271   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9272   if (INSTR (16, 16))
9273     {
9274       /* 8-bit.  */
9275       index = INSTR (20, 17);
9276       aarch64_set_vec_u8
9277 	(cpu, rd, 0, aarch64_get_vec_u8 (cpu, rn, index));
9278     }
9279   else if (INSTR (17, 17))
9280     {
9281       /* 16-bit.  */
9282       index = INSTR (20, 18);
9283       aarch64_set_vec_u16
9284 	(cpu, rd, 0, aarch64_get_vec_u16 (cpu, rn, index));
9285     }
9286   else if (INSTR (18, 18))
9287     {
9288       /* 32-bit.  */
9289       index = INSTR (20, 19);
9290       aarch64_set_vec_u32
9291 	(cpu, rd, 0, aarch64_get_vec_u32 (cpu, rn, index));
9292     }
9293   else if (INSTR (19, 19))
9294     {
9295       /* 64-bit.  */
9296       index = INSTR (20, 20);
9297       aarch64_set_vec_u64
9298 	(cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, index));
9299     }
9300   else
9301     HALT_UNALLOC;
9302 }
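/* A worked example: imm5 selects both the element size and the index
   via its lowest set bit, so imm5 = 0b11000 picks the 64-bit element at
   index imm5<4> = 1, while imm5 = 0b00101 picks the 8-bit element at
   index imm5<4:1> = 2.  */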
9303 
9304 static void
9305 do_scalar_NEG (sim_cpu *cpu)
9306 {
9307   /* instr [31,10] = 0111 1110 1110 0000 1011 10
9308      instr [9, 5]  = Rn
9309      instr [4, 0]  = Rd.  */
9310 
9311   unsigned rn = INSTR (9, 5);
9312   unsigned rd = INSTR (4, 0);
9313 
9314   NYI_assert (31, 10, 0x1FB82E);
9315 
9316   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9317   aarch64_set_vec_u64 (cpu, rd, 0, - aarch64_get_vec_u64 (cpu, rn, 0));
9318 }
9319 
9320 static void
9321 do_scalar_USHL (sim_cpu *cpu)
9322 {
9323   /* instr [31,21] = 0111 1110 111
9324      instr [20,16] = Rm
9325      instr [15,10] = 0100 01
9326      instr [9, 5]  = Rn
9327      instr [4, 0]  = Rd.  */
9328 
9329   unsigned rm = INSTR (20, 16);
9330   unsigned rn = INSTR (9, 5);
9331   unsigned rd = INSTR (4, 0);
9332   signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);
9333 
9334   NYI_assert (31, 21, 0x3F7);
9335   NYI_assert (15, 10, 0x11);
9336 
9337   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9338   if (shift >= 0)
9339     aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) << shift);
9340   else
9341     aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) >> - shift);
9342 }
9343 
9344 static void
9345 do_double_add (sim_cpu *cpu)
9346 {
9347   /* instr [31,21] = 0101 1110 111
9348      instr [20,16] = Fn
9349      instr [15,10] = 1000 01
9350      instr [9,5]   = Fm
9351      instr [4,0]   = Fd.  */
9352   unsigned Fd;
9353   unsigned Fm;
9354   unsigned Fn;
9355   double val1;
9356   double val2;
9357 
9358   NYI_assert (31, 21, 0x2F7);
9359   NYI_assert (15, 10, 0x21);
9360 
9361   Fd = INSTR (4, 0);
9362   Fm = INSTR (9, 5);
9363   Fn = INSTR (20, 16);
9364 
9365   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9366   val1 = aarch64_get_FP_double (cpu, Fm);
9367   val2 = aarch64_get_FP_double (cpu, Fn);
9368 
9369   aarch64_set_FP_double (cpu, Fd, val1 + val2);
9370 }
9371 
9372 static void
9373 do_scalar_UCVTF (sim_cpu *cpu)
9374 {
9375   /* instr [31,23] = 0111 1110 0
9376      instr [22]    = single(0)/double(1)
9377      instr [21,10] = 10 0001 1101 10
9378      instr [9,5]   = rn
9379      instr [4,0]   = rd.  */
9380 
9381   unsigned rn = INSTR (9, 5);
9382   unsigned rd = INSTR (4, 0);
9383 
9384   NYI_assert (31, 23, 0x0FC);
9385   NYI_assert (21, 10, 0x876);
9386 
9387   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9388   if (INSTR (22, 22))
9389     {
9390       uint64_t val = aarch64_get_vec_u64 (cpu, rn, 0);
9391 
9392       aarch64_set_vec_double (cpu, rd, 0, (double) val);
9393     }
9394   else
9395     {
9396       uint32_t val = aarch64_get_vec_u32 (cpu, rn, 0);
9397 
9398       aarch64_set_vec_float (cpu, rd, 0, (float) val);
9399     }
9400 }
9401 
9402 static void
9403 do_scalar_vec (sim_cpu *cpu)
9404 {
9405   /* instr [30] = 1.  */
9406   /* instr [28,25] = 1111.  */
9407   switch (INSTR (31, 23))
9408     {
9409     case 0xBC:
9410       switch (INSTR (15, 10))
9411 	{
9412 	case 0x01: do_scalar_MOV (cpu); return;
9413 	case 0x39: do_scalar_FCM (cpu); return;
9414 	case 0x3B: do_scalar_FCM (cpu); return;
9415 	}
9416       break;
9417 
9418     case 0xBE: do_scalar_shift (cpu); return;
9419 
9420     case 0xFC:
9421       switch (INSTR (15, 10))
9422 	{
9423 	case 0x36:
9424 	  switch (INSTR (21, 16))
9425 	    {
9426 	    case 0x30: do_scalar_FADDP (cpu); return;
9427 	    case 0x21: do_scalar_UCVTF (cpu); return;
9428 	    }
9429 	  HALT_NYI;
9430 	case 0x39: do_scalar_FCM (cpu); return;
9431 	case 0x3B: do_scalar_FCM (cpu); return;
9432 	}
9433       break;
9434 
9435     case 0xFD:
9436       switch (INSTR (15, 10))
9437 	{
9438 	case 0x0D: do_scalar_CMGT (cpu); return;
9439 	case 0x11: do_scalar_USHL (cpu); return;
9440 	case 0x2E: do_scalar_NEG (cpu); return;
9441 	case 0x32: do_scalar_FCMGE_zero (cpu); return;
9442 	case 0x35: do_scalar_FABD (cpu); return;
9443 	case 0x36: do_scalar_FCMLE_zero (cpu); return;
9444 	case 0x39: do_scalar_FCM (cpu); return;
9445 	case 0x3B: do_scalar_FCM (cpu); return;
9446 	default:
9447 	  HALT_NYI;
9448 	}
9449 
9450     case 0xFE: do_scalar_USHR (cpu); return;
9451 
9452     case 0xBD:
9453       switch (INSTR (15, 10))
9454 	{
9455 	case 0x21: do_double_add (cpu); return;
9456 	case 0x11: do_scalar_SSHL (cpu); return;
9457 	case 0x32: do_scalar_FCMGT_zero (cpu); return;
9458 	case 0x36: do_scalar_FCMEQ_zero (cpu); return;
9459 	case 0x3A: do_scalar_FCMLT_zero (cpu); return;
9460 	default:
9461 	  HALT_NYI;
9462 	}
9463 
9464     default:
9465       HALT_NYI;
9466     }
9467 }
9468 
9469 static void
9470 dexAdvSIMD1 (sim_cpu *cpu)
9471 {
9472   /* instr [28,25] = 1 111.  */
9473 
9474   /* We are currently only interested in the basic
9475      scalar fp routines which all have bit 30 = 0.  */
9476   if (INSTR (30, 30))
9477     do_scalar_vec (cpu);
9478 
9479   /* instr[24] is set for FP data processing 3-source and clear for
9480      all other basic scalar fp instruction groups.  */
9481   else if (INSTR (24, 24))
9482     dexSimpleFPDataProc3Source (cpu);
9483 
9484   /* instr[21] is clear for floating <-> fixed conversions and set for
9485      all other basic scalar fp instruction groups.  */
9486   else if (!INSTR (21, 21))
9487     dexSimpleFPFixedConvert (cpu);
9488 
9489   /* instr[11,10] : 01 ==> cond compare, 10 ==> Data Proc 2 Source
9490      11 ==> cond select,  00 ==> other.  */
9491   else
9492     switch (INSTR (11, 10))
9493       {
9494       case 1: dexSimpleFPCondCompare (cpu); return;
9495       case 2: dexSimpleFPDataProc2Source (cpu); return;
9496       case 3: dexSimpleFPCondSelect (cpu); return;
9497 
9498       default:
9499 	/* Now an ordered cascade of tests.
9500 	   FP immediate has instr [12] == 1.
9501 	   FP compare has   instr [13] == 1.
9502 	   FP Data Proc 1 Source has instr [14] == 1.
9503 	   FP floating <--> integer conversions has instr [15] == 0.  */
9504 	if (INSTR (12, 12))
9505 	  dexSimpleFPImmediate (cpu);
9506 
9507 	else if (INSTR (13, 13))
9508 	  dexSimpleFPCompare (cpu);
9509 
9510 	else if (INSTR (14, 14))
9511 	  dexSimpleFPDataProc1Source (cpu);
9512 
9513 	else if (!INSTR (15, 15))
9514 	  dexSimpleFPIntegerConvert (cpu);
9515 
9516 	else
9517 	  /* If we get here then instr[15] == 1 which means UNALLOC.  */
9518 	  HALT_UNALLOC;
9519       }
9520 }
9521 
9522 /* PC relative addressing.  */
9523 
9524 static void
9525 pcadr (sim_cpu *cpu)
9526 {
9527   /* instr[31] = op : 0 ==> ADR, 1 ==> ADRP
9528      instr[30,29] = immlo
9529      instr[23,5] = immhi.  */
9530   uint64_t address;
9531   unsigned rd = INSTR (4, 0);
9532   uint32_t isPage = INSTR (31, 31);
9533   union { uint64_t u64; int64_t s64; } imm;
9534   uint64_t offset;
9535 
9536   imm.s64 = simm64 (aarch64_get_instr (cpu), 23, 5);
9537   offset = imm.u64;
9538   offset = (offset << 2) | INSTR (30, 29);
9539 
9540   address = aarch64_get_PC (cpu);
9541 
9542   if (isPage)
9543     {
9544       offset <<= 12;
9545       address &= ~0xfff;
9546     }
9547 
9548   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9549   aarch64_set_reg_u64 (cpu, rd, NO_SP, address + offset);
9550 }
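
/* For example, ADRP X0, <label> with immhi:immlo == 1 produces
   offset == 1 << 12 == 0x1000 and rounds the PC down to a 4KB
   boundary, so X0 = (PC & ~0xfff) + 0x1000; ADR instead adds the
   raw byte offset to the unmodified PC.  */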
9551 
9552 /* Specific decode and execute for group Data Processing Immediate.  */
9553 
9554 static void
9555 dexPCRelAddressing (sim_cpu *cpu)
9556 {
9557   /* assert instr[28,24] = 10000.  */
9558   pcadr (cpu);
9559 }
9560 
9561 /* Immediate logical.
9562    The bimm32/64 argument is constructed by replicating a 2, 4, 8,
9563    16, 32 or 64 bit sequence pulled out at decode and possibly
9564    inverting it.
9565 
9566    N.B. the output register (dest) can normally be Xn or SP; the
9567    exception is the flag setting instructions, which may only use
9568    Xn for the output (dest).  The input register can never be
9569    SP.  */
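
/* For example, with N == 0, imms == 7 and immr == 0 the decode
   produces eight consecutive ones in a 32 bit element, i.e. bimm32 ==
   0x000000ff; immr == 4 rotates the element right by four bits,
   giving 0xf000000f.  */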
9570 
9571 /* 32 bit and immediate.  */
9572 static void
9573 and32 (sim_cpu *cpu, uint32_t bimm)
9574 {
9575   unsigned rn = INSTR (9, 5);
9576   unsigned rd = INSTR (4, 0);
9577 
9578   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9579   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9580 		       aarch64_get_reg_u32 (cpu, rn, NO_SP) & bimm);
9581 }
9582 
9583 /* 64 bit and immediate.  */
9584 static void
9585 and64 (sim_cpu *cpu, uint64_t bimm)
9586 {
9587   unsigned rn = INSTR (9, 5);
9588   unsigned rd = INSTR (4, 0);
9589 
9590   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9591   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9592 		       aarch64_get_reg_u64 (cpu, rn, NO_SP) & bimm);
9593 }
9594 
9595 /* 32 bit and immediate set flags.  */
9596 static void
9597 ands32 (sim_cpu *cpu, uint32_t bimm)
9598 {
9599   unsigned rn = INSTR (9, 5);
9600   unsigned rd = INSTR (4, 0);
9601 
9602   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9603   uint32_t value2 = bimm;
9604 
9605   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9606   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9607   set_flags_for_binop32 (cpu, value1 & value2);
9608 }
9609 
9610 /* 64 bit and immediate set flags.  */
9611 static void
9612 ands64 (sim_cpu *cpu, uint64_t bimm)
9613 {
9614   unsigned rn = INSTR (9, 5);
9615   unsigned rd = INSTR (4, 0);
9616 
9617   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9618   uint64_t value2 = bimm;
9619 
9620   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9621   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9622   set_flags_for_binop64 (cpu, value1 & value2);
9623 }
9624 
9625 /* 32 bit exclusive or immediate.  */
9626 static void
9627 eor32 (sim_cpu *cpu, uint32_t bimm)
9628 {
9629   unsigned rn = INSTR (9, 5);
9630   unsigned rd = INSTR (4, 0);
9631 
9632   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9633   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9634 		       aarch64_get_reg_u32 (cpu, rn, NO_SP) ^ bimm);
9635 }
9636 
9637 /* 64 bit exclusive or immediate.  */
9638 static void
9639 eor64 (sim_cpu *cpu, uint64_t bimm)
9640 {
9641   unsigned rn = INSTR (9, 5);
9642   unsigned rd = INSTR (4, 0);
9643 
9644   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9645   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9646 		       aarch64_get_reg_u64 (cpu, rn, NO_SP) ^ bimm);
9647 }
9648 
9649 /* 32 bit or immediate.  */
9650 static void
9651 orr32 (sim_cpu *cpu, uint32_t bimm)
9652 {
9653   unsigned rn = INSTR (9, 5);
9654   unsigned rd = INSTR (4, 0);
9655 
9656   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9657   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9658 		       aarch64_get_reg_u32 (cpu, rn, NO_SP) | bimm);
9659 }
9660 
9661 /* 64 bit or immediate.  */
9662 static void
9663 orr64 (sim_cpu *cpu, uint64_t bimm)
9664 {
9665   unsigned rn = INSTR (9, 5);
9666   unsigned rd = INSTR (4, 0);
9667 
9668   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9669   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9670 		       aarch64_get_reg_u64 (cpu, rn, NO_SP) | bimm);
9671 }
9672 
9673 /* Logical shifted register.
9674    These allow an optional LSL, ASR, LSR or ROR to the second source
9675    register with a count up to the register bit count.
9676    N.B. register args may not be SP.  */
9677 
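/* For example, AND W0, W1, W2, LSR #3 reaches and32_shift with
   shift == LSR and count == 3 and computes W1 & (W2 >> 3).  */
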
9678 /* 32 bit AND shifted register.  */
9679 static void
9680 and32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9681 {
9682   unsigned rm = INSTR (20, 16);
9683   unsigned rn = INSTR (9, 5);
9684   unsigned rd = INSTR (4, 0);
9685 
9686   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9687   aarch64_set_reg_u64
9688     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9689      & shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9690 }
9691 
9692 /* 64 bit AND shifted register.  */
9693 static void
9694 and64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9695 {
9696   unsigned rm = INSTR (20, 16);
9697   unsigned rn = INSTR (9, 5);
9698   unsigned rd = INSTR (4, 0);
9699 
9700   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9701   aarch64_set_reg_u64
9702     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9703      & shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9704 }
9705 
9706 /* 32 bit AND shifted register setting flags.  */
9707 static void
9708 ands32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9709 {
9710   unsigned rm = INSTR (20, 16);
9711   unsigned rn = INSTR (9, 5);
9712   unsigned rd = INSTR (4, 0);
9713 
9714   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9715   uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
9716 			       shift, count);
9717 
9718   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9719   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9720   set_flags_for_binop32 (cpu, value1 & value2);
9721 }
9722 
9723 /* 64 bit AND shifted register setting flags.  */
9724 static void
9725 ands64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9726 {
9727   unsigned rm = INSTR (20, 16);
9728   unsigned rn = INSTR (9, 5);
9729   unsigned rd = INSTR (4, 0);
9730 
9731   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9732   uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
9733 			       shift, count);
9734 
9735   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9736   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9737   set_flags_for_binop64 (cpu, value1 & value2);
9738 }
9739 
9740 /* 32 bit BIC shifted register.  */
9741 static void
9742 bic32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9743 {
9744   unsigned rm = INSTR (20, 16);
9745   unsigned rn = INSTR (9, 5);
9746   unsigned rd = INSTR (4, 0);
9747 
9748   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9749   aarch64_set_reg_u64
9750     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9751      & ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9752 }
9753 
9754 /* 64 bit BIC shifted register.  */
9755 static void
9756 bic64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9757 {
9758   unsigned rm = INSTR (20, 16);
9759   unsigned rn = INSTR (9, 5);
9760   unsigned rd = INSTR (4, 0);
9761 
9762   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9763   aarch64_set_reg_u64
9764     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9765      & ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9766 }
9767 
9768 /* 32 bit BIC shifted register setting flags.  */
9769 static void
9770 bics32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9771 {
9772   unsigned rm = INSTR (20, 16);
9773   unsigned rn = INSTR (9, 5);
9774   unsigned rd = INSTR (4, 0);
9775 
9776   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9777   uint32_t value2 = ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
9778 				 shift, count);
9779 
9780   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9781   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9782   set_flags_for_binop32 (cpu, value1 & value2);
9783 }
9784 
9785 /* 64 bit BIC shifted register setting flags.  */
9786 static void
9787 bics64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9788 {
9789   unsigned rm = INSTR (20, 16);
9790   unsigned rn = INSTR (9, 5);
9791   unsigned rd = INSTR (4, 0);
9792 
9793   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9794   uint64_t value2 = ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
9795 				 shift, count);
9796 
9797   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9798   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9799   set_flags_for_binop64 (cpu, value1 & value2);
9800 }
9801 
9802 /* 32 bit EON shifted register.  */
9803 static void
9804 eon32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9805 {
9806   unsigned rm = INSTR (20, 16);
9807   unsigned rn = INSTR (9, 5);
9808   unsigned rd = INSTR (4, 0);
9809 
9810   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9811   aarch64_set_reg_u64
9812     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9813      ^ ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9814 }
9815 
9816 /* 64 bit EON shifted register.  */
9817 static void
9818 eon64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9819 {
9820   unsigned rm = INSTR (20, 16);
9821   unsigned rn = INSTR (9, 5);
9822   unsigned rd = INSTR (4, 0);
9823 
9824   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9825   aarch64_set_reg_u64
9826     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9827      ^ ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9828 }
9829 
9830 /* 32 bit EOR shifted register.  */
9831 static void
9832 eor32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9833 {
9834   unsigned rm = INSTR (20, 16);
9835   unsigned rn = INSTR (9, 5);
9836   unsigned rd = INSTR (4, 0);
9837 
9838   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9839   aarch64_set_reg_u64
9840     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9841      ^ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9842 }
9843 
9844 /* 64 bit EOR shifted register.  */
9845 static void
9846 eor64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9847 {
9848   unsigned rm = INSTR (20, 16);
9849   unsigned rn = INSTR (9, 5);
9850   unsigned rd = INSTR (4, 0);
9851 
9852   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9853   aarch64_set_reg_u64
9854     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9855      ^ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9856 }
9857 
9858 /* 32 bit ORR shifted register.  */
9859 static void
9860 orr32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9861 {
9862   unsigned rm = INSTR (20, 16);
9863   unsigned rn = INSTR (9, 5);
9864   unsigned rd = INSTR (4, 0);
9865 
9866   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9867   aarch64_set_reg_u64
9868     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9869      | shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9870 }
9871 
9872 /* 64 bit ORR shifted register.  */
9873 static void
9874 orr64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9875 {
9876   unsigned rm = INSTR (20, 16);
9877   unsigned rn = INSTR (9, 5);
9878   unsigned rd = INSTR (4, 0);
9879 
9880   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9881   aarch64_set_reg_u64
9882     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9883      | shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9884 }
9885 
9886 /* 32 bit ORN shifted register.  */
9887 static void
9888 orn32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9889 {
9890   unsigned rm = INSTR (20, 16);
9891   unsigned rn = INSTR (9, 5);
9892   unsigned rd = INSTR (4, 0);
9893 
9894   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9895   aarch64_set_reg_u64
9896     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9897      | ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9898 }
9899 
9900 /* 64 bit ORN shifted register.  */
9901 static void
9902 orn64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9903 {
9904   unsigned rm = INSTR (20, 16);
9905   unsigned rn = INSTR (9, 5);
9906   unsigned rd = INSTR (4, 0);
9907 
9908   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9909   aarch64_set_reg_u64
9910     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9911      | ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9912 }
9913 
9914 static void
9915 dexLogicalImmediate (sim_cpu *cpu)
9916 {
9917   /* assert instr[28,23] = 100100
9918      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
9919      instr[30,29] = op : 0 ==> AND, 1 ==> ORR, 2 ==> EOR, 3 ==> ANDS
9920      instr[22] = N : used to construct immediate mask
9921      instr[21,16] = immr
9922      instr[15,10] = imms
9923      instr[9,5] = Rn
9924      instr[4,0] = Rd  */
9925 
9926   /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
9927   uint32_t size = INSTR (31, 31);
9928   uint32_t n = INSTR (22, 22);
9929   /* uint32_t immr = INSTR (21, 16);  */
9930   /* uint32_t imms = INSTR (15, 10);  */
9931   uint32_t index = INSTR (22, 10);
9932   uint64_t bimm64 = LITable [index];
9933   uint32_t dispatch = INSTR (30, 29);
9934 
9935   if (~size & n)
9936     HALT_UNALLOC;
9937 
9938   if (!bimm64)
9939     HALT_UNALLOC;
9940 
9941   if (size == 0)
9942     {
9943       uint32_t bimm = (uint32_t) bimm64;
9944 
9945       switch (dispatch)
9946 	{
9947 	case 0: and32 (cpu, bimm); return;
9948 	case 1: orr32 (cpu, bimm); return;
9949 	case 2: eor32 (cpu, bimm); return;
9950 	case 3: ands32 (cpu, bimm); return;
9951 	}
9952     }
9953   else
9954     {
9955       switch (dispatch)
9956 	{
9957 	case 0: and64 (cpu, bimm64); return;
9958 	case 1: orr64 (cpu, bimm64); return;
9959 	case 2: eor64 (cpu, bimm64); return;
9960 	case 3: ands64 (cpu, bimm64); return;
9961 	}
9962     }
9963   HALT_UNALLOC;
9964 }
9965 
9966 /* Immediate move.
9967    The uimm argument is a 16 bit value to be inserted into the
9968    target register.  The pos argument locates the 16 bit word in
9969    the dest register, i.e. it is in {0, 1} for 32 bit and {0, 1,
9970    2, 3} for 64 bit.
9971    N.B. the register arg may not be SP so it should be accessed
9972    using the setGZRegisterXXX accessors.  */
9973 
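/* For example, MOVZ X0, #0x1234, LSL #32 arrives as movz64 (cpu,
   0x1234, 2) and writes 0x0000123400000000; MOVK replaces only the
   addressed 16 bit field, and MOVN writes the inverse of the shifted
   value.  */
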
9974 /* 32 bit move 16 bit immediate zero remaining shorts.  */
9975 static void
9976 movz32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9977 {
9978   unsigned rd = INSTR (4, 0);
9979 
9980   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9981   aarch64_set_reg_u64 (cpu, rd, NO_SP, val << (pos * 16));
9982 }
9983 
9984 /* 64 bit move 16 bit immediate zero remaining shorts.  */
9985 static void
9986 movz64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9987 {
9988   unsigned rd = INSTR (4, 0);
9989 
9990   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9991   aarch64_set_reg_u64 (cpu, rd, NO_SP, ((uint64_t) val) << (pos * 16));
9992 }
9993 
9994 /* 32 bit move 16 bit immediate negated.  */
9995 static void
9996 movn32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9997 {
9998   unsigned rd = INSTR (4, 0);
9999 
10000   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10001   aarch64_set_reg_u64 (cpu, rd, NO_SP, ((val << (pos * 16)) ^ 0xffffffffU));
10002 }
10003 
10004 /* 64 bit move 16 bit immediate negated.  */
10005 static void
10006 movn64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
10007 {
10008   unsigned rd = INSTR (4, 0);
10009 
10010   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10011   aarch64_set_reg_u64
10012     (cpu, rd, NO_SP, ((((uint64_t) val) << (pos * 16))
10013 		      ^ 0xffffffffffffffffULL));
10014 }
10015 
10016 /* 32 bit move 16 bit immediate keep remaining shorts.  */
10017 static void
10018 movk32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
10019 {
10020   unsigned rd = INSTR (4, 0);
10021   uint32_t current = aarch64_get_reg_u32 (cpu, rd, NO_SP);
10022   uint32_t value = val << (pos * 16);
10023   uint32_t mask = ~(0xffffU << (pos * 16));
10024 
10025   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10026   aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask)));
10027 }
10028 
10029 /* 64 bit move 16 bit immediate keep remaining shorts.  */
10030 static void
10031 movk64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
10032 {
10033   unsigned rd = INSTR (4, 0);
10034   uint64_t current = aarch64_get_reg_u64 (cpu, rd, NO_SP);
10035   uint64_t value = (uint64_t) val << (pos * 16);
10036   uint64_t mask = ~(0xffffULL << (pos * 16));
10037 
10038   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10039   aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask)));
10040 }
10041 
10042 static void
10043 dexMoveWideImmediate (sim_cpu *cpu)
10044 {
10045   /* assert instr[28:23] = 100101
10046      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
10047      instr[30,29] = op : 0 ==> MOVN, 1 ==> UNALLOC, 2 ==> MOVZ, 3 ==> MOVK
10048      instr[22,21] = shift : 00 == LSL#0, 01 = LSL#16, 10 = LSL#32, 11 = LSL#48
10049      instr[20,5] = uimm16
10050      instr[4,0] = Rd  */
10051 
10052   /* N.B. the (multiple of 16) shift is applied by the called routine,
10053      we just pass the multiplier.  */
10054 
10055   uint32_t imm;
10056   uint32_t size = INSTR (31, 31);
10057   uint32_t op = INSTR (30, 29);
10058   uint32_t shift = INSTR (22, 21);
10059 
10060   /* 32 bit can only shift by 0 or 1 lot of 16.
10061      Anything else is an unallocated instruction.  */
10062   if (size == 0 && (shift > 1))
10063     HALT_UNALLOC;
10064 
10065   if (op == 1)
10066     HALT_UNALLOC;
10067 
10068   imm = INSTR (20, 5);
10069 
10070   if (size == 0)
10071     {
10072       if (op == 0)
10073 	movn32 (cpu, imm, shift);
10074       else if (op == 2)
10075 	movz32 (cpu, imm, shift);
10076       else
10077 	movk32 (cpu, imm, shift);
10078     }
10079   else
10080     {
10081       if (op == 0)
10082 	movn64 (cpu, imm, shift);
10083       else if (op == 2)
10084 	movz64 (cpu, imm, shift);
10085       else
10086 	movk64 (cpu, imm, shift);
10087     }
10088 }
10089 
10090 /* Bitfield operations.
10091    These take a pair of bit positions r and s which are in {0..31}
10092    or {0..63} depending on the instruction word size.
10093    N.B. register args may not be SP.  */
10094 
10095 /* OK, we start with ubfm, which just needs to pick
10096    some bits out of the source, zero the rest and write
10097    the result to dest.  Just need two logical shifts.  */
10098 
10099 /* 32 bit bitfield move, left and right of affected zeroed
10100    if r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r:32-r> = Wn<s:0>.  */
10101 static void
10102 ubfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
10103 {
10104   unsigned rd;
10105   unsigned rn = INSTR (9, 5);
10106   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
10107 
10108   /* Pick either s+1-r or s+1 consecutive bits out of the original word.  */
10109   if (r <= s)
10110     {
10111       /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
10112          We want only bits s:xxx:r at the bottom of the word
10113          so we LSL bit s up to bit 31 i.e. by 31 - s
10114          and then we LSR to bring bit 31 down to bit s - r
10115 	 i.e. by 31 + r - s.  */
10116       value <<= 31 - s;
10117       value >>= 31 + r - s;
10118     }
10119   else
10120     {
10121       /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0
10122          We want only bits s:xxx:0 starting at bit 31-(r-1)
10123          so we LSL bit s up to bit 31 i.e. by 31 - s
10124          and then we LSL to bring bit 31 down to 31-(r-1)+s
10125 	 i.e. by r - (s + 1).  */
10126       value <<= 31 - s;
10127       value >>= r - (s + 1);
10128     }
10129 
10130   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10131   rd = INSTR (4, 0);
10132   aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
10133 }
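
/* For example, UBFX W0, W1, #8, #8 decodes to r == 8 and s == 15;
   the r <= s path shifts left by 16 and then right by 24, leaving
   W1<15:8> in W0<7:0> with every other bit zero.  */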
10134 
10135 /* 64 bit bitfield move, left and right of affected zeroed
10136    if r <= s Xd<s-r:0> = Xn<s:r> else Xd<64+s-r:64-r> = Xn<s:0>.  */
10137 static void
10138 ubfm (sim_cpu *cpu, uint32_t r, uint32_t s)
10139 {
10140   unsigned rd;
10141   unsigned rn = INSTR (9, 5);
10142   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
10143 
10144   if (r <= s)
10145     {
10146       /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
10147          We want only bits s:xxx:r at the bottom of the word.
10148          So we LSL bit s up to bit 63 i.e. by 63 - s
10149          and then we LSR to bring bit 63 down to bit s - r
10150 	 i.e. by 63 + r - s.  */
10151       value <<= 63 - s;
10152       value >>= 63 + r - s;
10153     }
10154   else
10155     {
10156       /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0.
10157          We want only bits s:xxx:0 starting at bit 63-(r-1).
10158          So we LSL bit s up to bit 63 i.e. by 63 - s
10159          and then we LSL to bring bit 63 down to 63-(r-1)+s
10160 	 i.e. by r - (s + 1).  */
10161       value <<= 63 - s;
10162       value >>= r - (s + 1);
10163     }
10164 
10165   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10166   rd = INSTR (4, 0);
10167   aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
10168 }
10169 
10170 /* The signed versions need to insert sign bits
10171    on the left of the inserted bit field. so we do
10172    much the same as the unsigned version except we
10173    use an arithmetic shift right -- this just means
10174    we need to operate on signed values.  */
10175 
10176 /* 32 bit bitfield move, left of affected sign-extended, right zeroed.  */
10177 /* If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r:32-r> = Wn<s:0>.  */
10178 static void
10179 sbfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
10180 {
10181   unsigned rd;
10182   unsigned rn = INSTR (9, 5);
10183   /* as per ubfm32 but use an ASR instead of an LSR.  */
10184   int32_t value = aarch64_get_reg_s32 (cpu, rn, NO_SP);
10185 
10186   if (r <= s)
10187     {
10188       value <<= 31 - s;
10189       value >>= 31 + r - s;
10190     }
10191   else
10192     {
10193       value <<= 31 - s;
10194       value >>= r - (s + 1);
10195     }
10196 
10197   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10198   rd = INSTR (4, 0);
10199   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
10200 }
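
/* For example, SXTB W0, W1 is SBFM with r == 0 and s == 7: the value
   is shifted left by 24 and then arithmetically right by 24,
   replicating bit 7 of W1 across bits 31:8 of the result.  */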
10201 
10202 /* 64 bit bitfield move, left of affected sign-extended, right zeroed.  */
10203 /* If r <= s Xd<s-r:0> = Xn<s:r> else Xd<64+s-r:64-r> = Xn<s:0>.  */
10204 static void
10205 sbfm (sim_cpu *cpu, uint32_t r, uint32_t s)
10206 {
10207   unsigned rd;
10208   unsigned rn = INSTR (9, 5);
10209   /* As per ubfm but use an ASR instead of an LSR.  */
10210   int64_t value = aarch64_get_reg_s64 (cpu, rn, NO_SP);
10211 
10212   if (r <= s)
10213     {
10214       value <<= 63 - s;
10215       value >>= 63 + r - s;
10216     }
10217   else
10218     {
10219       value <<= 63 - s;
10220       value >>= r - (s + 1);
10221     }
10222 
10223   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10224   rd = INSTR (4, 0);
10225   aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
10226 }
10227 
10228 /* Finally, these versions leave non-affected bits
10229    as is, so we need to generate the bits as per
10230    ubfm and also generate a mask to pick the
10231    bits from the original and computed values.  */
10232 
10233 /* 32 bit bitfield move, non-affected bits left as is.
10234    If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r:32-r> = Wn<s:0>.  */
10235 static void
10236 bfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
10237 {
10238   unsigned rn = INSTR (9, 5);
10239   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
10240   uint32_t mask = -1;
10241   unsigned rd;
10242   uint32_t value2;
10243 
10244   /* Pick either s+1-r or s+1 consecutive bits out of the original word.  */
10245   if (r <= s)
10246     {
10247       /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
10248          We want only bits s:xxx:r at the bottom of the word
10249          so we LSL bit s up to bit 31 i.e. by 31 - s
10250          and then we LSR to bring bit 31 down to bit s - r
10251 	 i.e. by 31 + r - s.  */
10252       value <<= 31 - s;
10253       value >>= 31 + r - s;
10254       /* the mask must include the same bits.  */
10255       mask <<= 31 - s;
10256       mask >>= 31 + r - s;
10257     }
10258   else
10259     {
10260       /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0.
10261          We want only bits s:xxx:0 starting at bit 31-(r-1)
10262          so we LSL bit s up to bit 31 i.e. by 31 - s
10263          and then we LSL to bring bit 31 down to 31-(r-1)+s
10264 	 i.e. by r - (s + 1).  */
10265       value <<= 31 - s;
10266       value >>= r - (s + 1);
10267       /* The mask must include the same bits.  */
10268       mask <<= 31 - s;
10269       mask >>= r - (s + 1);
10270     }
10271 
10272   rd = INSTR (4, 0);
10273   value2 = aarch64_get_reg_u32 (cpu, rd, NO_SP);
10274 
10275   value2 &= ~mask;
10276   value2 |= value;
10277 
10278   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10279   aarch64_set_reg_u64 (cpu, rd, NO_SP, value2);
10280 }
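
/* For example, BFXIL W0, W1, #4, #4 decodes to r == 4 and s == 7:
   value and mask are both shifted left by 24 and right by 28, so
   mask == 0xf and only W0<3:0> is replaced, by W1<7:4>.  */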
10281 
10282 /* 64 bit bitfield move, non-affected bits left as is.
10283    If r <= s Xd<s-r:0> = Xn<s:r> else Xd<64+s-r:64-r> = Xn<s:0>.  */
10284 static void
10285 bfm (sim_cpu *cpu, uint32_t r, uint32_t s)
10286 {
10287   unsigned rd;
10288   unsigned rn = INSTR (9, 5);
10289   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
10290   uint64_t mask = 0xffffffffffffffffULL;
10291 
10292   if (r <= s)
10293     {
10294       /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
10295          We want only bits s:xxx:r at the bottom of the word
10296          so we LSL bit s up to bit 63 i.e. by 63 - s
10297          and then we LSR to bring bit 63 down to bit s - r
10298 	 i.e. by 63 + r - s.  */
10299       value <<= 63 - s;
10300       value >>= 63 + r - s;
10301       /* The mask must include the same bits.  */
10302       mask <<= 63 - s;
10303       mask >>= 63 + r - s;
10304     }
10305   else
10306     {
10307       /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0
10308          We want only bits s:xxx:0 starting at bit 63-(r-1)
10309          so we LSL bit s up to bit 63 i.e. by 63 - s
10310          and then we LSL to bring bit 63 down to 63-(r-1)+s
10311 	 i.e. by r - (s + 1).  */
10312       value <<= 63 - s;
10313       value >>= r - (s + 1);
10314       /* The mask must include the same bits.  */
10315       mask <<= 63 - s;
10316       mask >>= r - (s + 1);
10317     }
10318 
10319   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10320   rd = INSTR (4, 0);
10321   aarch64_set_reg_u64
10322     (cpu, rd, NO_SP, (aarch64_get_reg_u64 (cpu, rd, NO_SP) & ~mask) | value);
10323 }
10324 
10325 static void
10326 dexBitfieldImmediate (sim_cpu *cpu)
10327 {
10328   /* assert instr[28:23] = 100110
10329      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
10330      instr[30,29] = op : 0 ==> SBFM, 1 ==> BFM, 2 ==> UBFM, 3 ==> UNALLOC
10331      instr[22] = N : must be 0 for 32 bit, 1 for 64 bit ow UNALLOC
10332      instr[21,16] = immr : 0xxxxx for 32 bit, xxxxxx for 64 bit
10333      instr[15,10] = imms :  0xxxxx for 32 bit, xxxxxx for 64 bit
10334      instr[9,5] = Rn
10335      instr[4,0] = Rd  */
10336 
10337   /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
10338   uint32_t dispatch;
10339   uint32_t imms;
10340   uint32_t size = INSTR (31, 31);
10341   uint32_t n = INSTR (22, 22);
10342   /* 32 bit operations must have immr[5] = 0 and imms[5] = 0.  */
10343   /* or else we have an UNALLOC.  */
10344   uint32_t immr = INSTR (21, 16);
10345 
10346   if (~size & n)
10347     HALT_UNALLOC;
10348 
10349   if (!size && uimm (immr, 5, 5))
10350     HALT_UNALLOC;
10351 
10352   imms = INSTR (15, 10);
10353   if (!size && uimm (imms, 5, 5))
10354     HALT_UNALLOC;
10355 
10356   /* Switch on combined size and op.  */
10357   dispatch = INSTR (31, 29);
10358   switch (dispatch)
10359     {
10360     case 0: sbfm32 (cpu, immr, imms); return;
10361     case 1: bfm32 (cpu, immr, imms); return;
10362     case 2: ubfm32 (cpu, immr, imms); return;
10363     case 4: sbfm (cpu, immr, imms); return;
10364     case 5: bfm (cpu, immr, imms); return;
10365     case 6: ubfm (cpu, immr, imms); return;
10366     default: HALT_UNALLOC;
10367     }
10368 }
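
/* The familiar shift and extend mnemonics are aliases of these
   bitfield forms: LSR Wd, Wn, #sh is UBFM with immr == sh and
   imms == 31, ASR is the SBFM equivalent, and SXTB/SXTH are SBFM
   with immr == 0 and imms == 7/15.  */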
10369 
10370 static void
10371 do_EXTR_32 (sim_cpu *cpu)
10372 {
10373   /* instr[31:21] = 00010011100
10374      instr[20,16] = Rm
10375      instr[15,10] = imms :  0xxxxx for 32 bit
10376      instr[9,5]   = Rn
10377      instr[4,0]   = Rd  */
10378   unsigned rm   = INSTR (20, 16);
10379   unsigned imms = INSTR (15, 10) & 31;
10380   unsigned rn   = INSTR ( 9,  5);
10381   unsigned rd   = INSTR ( 4,  0);
10382   uint64_t val1;
10383   uint64_t val2;
10384 
10385   val1 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
10386   val1 >>= imms;
10387   val2 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
10388   val2 <<= (32 - imms);
10389 
10390   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10391   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) (val1 | val2));
10392 }
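
/* For example, ROR W0, W1, #8 assembles as EXTR W0, W1, W1, #8 and
   arrives here with imms == 8: val1 == W1 >> 8 and val2 == W1 << 24.
   The uint32_t cast discards the bits that the 64 bit shift of val2
   pushed above bit 31, keeping the upper half of Xd clear.  */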
10393 
10394 static void
10395 do_EXTR_64 (sim_cpu *cpu)
10396 {
10397   /* instr[31:21] = 10010011100
10398      instr[20,16] = Rm
10399      instr[15,10] = imms
10400      instr[9,5]   = Rn
10401      instr[4,0]   = Rd  */
10402   unsigned rm   = INSTR (20, 16);
10403   unsigned imms = INSTR (15, 10) & 63;
10404   unsigned rn   = INSTR ( 9,  5);
10405   unsigned rd   = INSTR ( 4,  0);
10406   uint64_t val;
10407 
10408   val = aarch64_get_reg_u64 (cpu, rm, NO_SP);
10409   val >>= imms;
10410   val |= imms ? (aarch64_get_reg_u64 (cpu, rn, NO_SP) << (64 - imms)) : 0;
10411 
10412   aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
10413 }
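
/* The imms test above matters: when imms == 0 the architected result
   is simply Rm, and the alternative shift by 64 - 0 == 64 bits would
   be undefined behaviour in C.  */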
10414 
10415 static void
10416 dexExtractImmediate (sim_cpu *cpu)
10417 {
10418   /* assert instr[28:23] = 100111
10419      instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
10420      instr[30,29] = op21 : 0 ==> EXTR, 1,2,3 ==> UNALLOC
10421      instr[22]    = N : must be 0 for 32 bit, 1 for 64 bit or UNALLOC
10422      instr[21]    = op0 : must be 0 or UNALLOC
10423      instr[20,16] = Rm
10424      instr[15,10] = imms :  0xxxxx for 32 bit, xxxxxx for 64 bit
10425      instr[9,5]   = Rn
10426      instr[4,0]   = Rd  */
10427 
10428   /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
10429   /* 64 bit operations must have N = 1 or else we have an UNALLOC.  */
10430   uint32_t dispatch;
10431   uint32_t size = INSTR (31, 31);
10432   uint32_t n = INSTR (22, 22);
10433   /* 32 bit operations must have imms[5] = 0
10434      or else we have an UNALLOC.  */
10435   uint32_t imms = INSTR (15, 10);
10436 
10437   if (size ^ n)
10438     HALT_UNALLOC;
10439 
10440   if (!size && uimm (imms, 5, 5))
10441     HALT_UNALLOC;
10442 
10443   /* Switch on combined size and op.  */
10444   dispatch = INSTR (31, 29);
10445 
10446   if (dispatch == 0)
10447     do_EXTR_32 (cpu);
10448 
10449   else if (dispatch == 4)
10450     do_EXTR_64 (cpu);
10451 
10452   else if (dispatch == 1)
10453     HALT_NYI;
10454   else
10455     HALT_UNALLOC;
10456 }
10457 
10458 static void
10459 dexDPImm (sim_cpu *cpu)
10460 {
10461   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
10462      assert  group == GROUP_DPIMM_1000 || group == GROUP_DPIMM_1001
10463      bits [25,23] of a DPImm are the secondary dispatch vector.  */
10464   uint32_t group2 = dispatchDPImm (aarch64_get_instr (cpu));
10465 
10466   switch (group2)
10467     {
10468     case DPIMM_PCADR_000:
10469     case DPIMM_PCADR_001:
10470       dexPCRelAddressing (cpu);
10471       return;
10472 
10473     case DPIMM_ADDSUB_010:
10474     case DPIMM_ADDSUB_011:
10475       dexAddSubtractImmediate (cpu);
10476       return;
10477 
10478     case DPIMM_LOG_100:
10479       dexLogicalImmediate (cpu);
10480       return;
10481 
10482     case DPIMM_MOV_101:
10483       dexMoveWideImmediate (cpu);
10484       return;
10485 
10486     case DPIMM_BITF_110:
10487       dexBitfieldImmediate (cpu);
10488       return;
10489 
10490     case DPIMM_EXTR_111:
10491       dexExtractImmediate (cpu);
10492       return;
10493 
10494     default:
10495       /* Should never reach here.  */
10496       HALT_NYI;
10497     }
10498 }
10499 
10500 static void
10501 dexLoadUnscaledImmediate (sim_cpu *cpu)
10502 {
10503   /* instr[29,24] == 111_00
10504      instr[21] == 0
10505      instr[11,10] == 00
10506      instr[31,30] = size
10507      instr[26] = V
10508      instr[23,22] = opc
10509      instr[20,12] = simm9
10510      instr[9,5] = rn may be SP.  */
10511   /* unsigned rt = INSTR (4, 0);  */
10512   uint32_t v = INSTR (26, 26);
10513   uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10514   int32_t imm = simm32 (aarch64_get_instr (cpu), 20, 12);
10515 
10516   if (!v)
10517     {
10518       /* GReg operations.  */
10519       switch (dispatch)
10520 	{
10521 	case 0:	 sturb (cpu, imm); return;
10522 	case 1:	 ldurb32 (cpu, imm); return;
10523 	case 2:	 ldursb64 (cpu, imm); return;
10524 	case 3:	 ldursb32 (cpu, imm); return;
10525 	case 4:	 sturh (cpu, imm); return;
10526 	case 5:	 ldurh32 (cpu, imm); return;
10527 	case 6:	 ldursh64 (cpu, imm); return;
10528 	case 7:	 ldursh32 (cpu, imm); return;
10529 	case 8:	 stur32 (cpu, imm); return;
10530 	case 9:	 ldur32 (cpu, imm); return;
10531 	case 10: ldursw (cpu, imm); return;
10532 	case 12: stur64 (cpu, imm); return;
10533 	case 13: ldur64 (cpu, imm); return;
10534 
10535 	case 14:
10536 	  /* PRFUM NYI.  */
10537 	  HALT_NYI;
10538 
10539 	default:
10540 	case 11:
10541 	case 15:
10542 	  HALT_UNALLOC;
10543 	}
10544     }
10545 
10546   /* FReg operations.  */
10547   switch (dispatch)
10548     {
10549     case 2:  fsturq (cpu, imm); return;
10550     case 3:  fldurq (cpu, imm); return;
10551     case 8:  fsturs (cpu, imm); return;
10552     case 9:  fldurs (cpu, imm); return;
10553     case 12: fsturd (cpu, imm); return;
10554     case 13: fldurd (cpu, imm); return;
10555 
10556     case 0: /* STUR 8 bit FP.  */
10557     case 1: /* LDUR 8 bit FP.  */
10558     case 4: /* STUR 16 bit FP.  */
10559     case 5: /* LDUR 16 bit FP.  */
10560       HALT_NYI;
10561 
10562     default:
10563     case 6:
10564     case 7:
10565     case 10:
10566     case 11:
10567     case 14:
10568     case 15:
10569       HALT_UNALLOC;
10570     }
10571 }
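
/* In the dispatch value the two size bits sit above the two opc bits,
   so, for example, a 64 bit LDUR (size == 3, opc == 1) yields
   dispatch == 13 and is routed to ldur64.  */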
10572 
10573 /*  N.B. A preliminary note regarding all the ldrs<x>32
10574     instructions.
10575 
10576    The signed value loaded by these instructions is cast to a 32 bit
10577    unsigned value before being assigned to the 64 bit element of the
10578    GReg union, i.e. the storage read by aarch64_get_reg_u64 (cpu, N).
10579    This performs a 32 bit sign extension (as required) but avoids 64
10580    bit sign extension, ensuring that the top half of the register
10581    word is zero, as the spec demands for a 32 bit load.  */
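
/* For example, loading the byte 0x80 returns -128 from
   aarch64_get_mem_s8; the uint32_t cast turns that into 0xffffff80
   and widening to uint64_t then leaves 0x00000000ffffff80 in the
   register, with the top half clear.  */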
10582 
10583 /* 32 bit load sign-extended byte scaled unsigned 12 bit.  */
10584 static void
10585 ldrsb32_abs (sim_cpu *cpu, uint32_t offset)
10586 {
10587   unsigned int rn = INSTR (9, 5);
10588   unsigned int rt = INSTR (4, 0);
10589 
10590   /* The target register may not be SP but the source may be.
10591      There is no scaling required for a byte load.  */
10592   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
10593   aarch64_set_reg_u64 (cpu, rt, NO_SP,
10594 		       (uint32_t) aarch64_get_mem_s8 (cpu, address));
10595 }
10596 
10597 /* 32 bit load sign-extended byte scaled or unscaled zero-
10598    or sign-extended 32-bit register offset.  */
10599 static void
10600 ldrsb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10601 {
10602   unsigned int rm = INSTR (20, 16);
10603   unsigned int rn = INSTR (9, 5);
10604   unsigned int rt = INSTR (4, 0);
10605 
10606   /* rn may reference SP, rm and rt must reference ZR.  */
10607 
10608   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10609   int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10610 				 extension);
10611 
10612   /* There is no scaling required for a byte load.  */
10613   aarch64_set_reg_u64
10614     (cpu, rt, NO_SP, (uint32_t) aarch64_get_mem_s8 (cpu, address
10615 						   + displacement));
10616 }
10617 
10618 /* 32 bit load sign-extended byte unscaled signed 9 bit with
10619    pre- or post-writeback.  */
10620 static void
10621 ldrsb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10622 {
10623   uint64_t address;
10624   unsigned int rn = INSTR (9, 5);
10625   unsigned int rt = INSTR (4, 0);
10626 
10627   if (rn == rt && wb != NoWriteBack)
10628     HALT_UNALLOC;
10629 
10630   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10631 
10632   if (wb == Pre)
10633       address += offset;
10634 
10635   aarch64_set_reg_u64 (cpu, rt, NO_SP,
10636 		       (uint32_t) aarch64_get_mem_s8 (cpu, address));
10637 
10638   if (wb == Post)
10639     address += offset;
10640 
10641   if (wb != NoWriteBack)
10642     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10643 }
10644 
10645 /* 8 bit store scaled.  */
10646 static void
10647 fstrb_abs (sim_cpu *cpu, uint32_t offset)
10648 {
10649   unsigned st = INSTR (4, 0);
10650   unsigned rn = INSTR (9, 5);
10651 
10652   aarch64_set_mem_u8 (cpu,
10653 		      aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
10654 		      aarch64_get_vec_u8 (cpu, st, 0));
10655 }
10656 
10657 /* 8 bit store scaled or unscaled zero- or
10658    sign-extended 8-bit register offset.  */
10659 static void
10660 fstrb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10661 {
10662   unsigned rm = INSTR (20, 16);
10663   unsigned rn = INSTR (9, 5);
10664   unsigned st = INSTR (4, 0);
10665 
10666   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10667   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10668 			       extension);
10669   uint64_t  displacement = extended;	/* No scaling for a byte access.  */
10670 
10671   aarch64_set_mem_u8
10672     (cpu, address + displacement, aarch64_get_vec_u8 (cpu, st, 0));
10673 }
10674 
10675 /* 16 bit store scaled.  */
10676 static void
10677 fstrh_abs (sim_cpu *cpu, uint32_t offset)
10678 {
10679   unsigned st = INSTR (4, 0);
10680   unsigned rn = INSTR (9, 5);
10681 
10682   aarch64_set_mem_u16
10683     (cpu,
10684      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16),
10685      aarch64_get_vec_u16 (cpu, st, 0));
10686 }
10687 
10688 /* 16 bit store scaled or unscaled zero-
10689    or sign-extended 16-bit register offset.  */
10690 static void
10691 fstrh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10692 {
10693   unsigned rm = INSTR (20, 16);
10694   unsigned rn = INSTR (9, 5);
10695   unsigned st = INSTR (4, 0);
10696 
10697   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10698   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10699 			       extension);
10700   uint64_t  displacement = OPT_SCALE (extended, 16, scaling);
10701 
10702   aarch64_set_mem_u16
10703     (cpu, address + displacement, aarch64_get_vec_u16 (cpu, st, 0));
10704 }
10705 
10706 /* 32 bit store scaled unsigned 12 bit.  */
10707 static void
10708 fstrs_abs (sim_cpu *cpu, uint32_t offset)
10709 {
10710   unsigned st = INSTR (4, 0);
10711   unsigned rn = INSTR (9, 5);
10712 
10713   aarch64_set_mem_u32
10714     (cpu,
10715      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32),
10716      aarch64_get_vec_u32 (cpu, st, 0));
10717 }
10718 
10719 /* 32 bit store unscaled signed 9 bit with pre- or post-writeback.  */
10720 static void
10721 fstrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10722 {
10723   unsigned rn = INSTR (9, 5);
10724   unsigned st = INSTR (4, 0);
10725 
10726   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10727 
10728   if (wb != Post)
10729     address += offset;
10730 
10731   aarch64_set_mem_u32 (cpu, address, aarch64_get_vec_u32 (cpu, st, 0));
10732 
10733   if (wb == Post)
10734     address += offset;
10735 
10736   if (wb != NoWriteBack)
10737     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10738 }
10739 
10740 /* 32 bit store scaled or unscaled zero-
10741    or sign-extended 32-bit register offset.  */
10742 static void
10743 fstrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10744 {
10745   unsigned rm = INSTR (20, 16);
10746   unsigned rn = INSTR (9, 5);
10747   unsigned st = INSTR (4, 0);
10748 
10749   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10750   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10751 			       extension);
10752   uint64_t  displacement = OPT_SCALE (extended, 32, scaling);
10753 
10754   aarch64_set_mem_u32
10755     (cpu, address + displacement, aarch64_get_vec_u32 (cpu, st, 0));
10756 }
10757 
10758 /* 64 bit store scaled unsigned 12 bit.  */
10759 static void
10760 fstrd_abs (sim_cpu *cpu, uint32_t offset)
10761 {
10762   unsigned st = INSTR (4, 0);
10763   unsigned rn = INSTR (9, 5);
10764 
10765   aarch64_set_mem_u64
10766     (cpu,
10767      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64),
10768      aarch64_get_vec_u64 (cpu, st, 0));
10769 }
10770 
10771 /* 64 bit store unscaled signed 9 bit with pre- or post-writeback.  */
10772 static void
10773 fstrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10774 {
10775   unsigned rn = INSTR (9, 5);
10776   unsigned st = INSTR (4, 0);
10777 
10778   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10779 
10780   if (wb != Post)
10781     address += offset;
10782 
10783   aarch64_set_mem_u64 (cpu, address, aarch64_get_vec_u64 (cpu, st, 0));
10784 
10785   if (wb == Post)
10786     address += offset;
10787 
10788   if (wb != NoWriteBack)
10789     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10790 }
10791 
10792 /* 64 bit store scaled or unscaled zero-
10793    or sign-extended 32-bit register offset.  */
10794 static void
10795 fstrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10796 {
10797   unsigned rm = INSTR (20, 16);
10798   unsigned rn = INSTR (9, 5);
10799   unsigned st = INSTR (4, 0);
10800 
10801   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10802   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10803 			       extension);
10804   uint64_t  displacement = OPT_SCALE (extended, 64, scaling);
10805 
10806   aarch64_set_mem_u64
10807     (cpu, address + displacement, aarch64_get_vec_u64 (cpu, st, 0));
10808 }
10809 
10810 /* 128 bit store scaled unsigned 12 bit.  */
10811 static void
10812 fstrq_abs (sim_cpu *cpu, uint32_t offset)
10813 {
10814   FRegister a;
10815   unsigned st = INSTR (4, 0);
10816   unsigned rn = INSTR (9, 5);
10817   uint64_t addr;
10818 
10819   aarch64_get_FP_long_double (cpu, st, & a);
10820 
10821   addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128);
10822   aarch64_set_mem_long_double (cpu, addr, a);
10823 }
10824 
10825 /* 128 bit store unscaled signed 9 bit with pre- or post-writeback.  */
10826 static void
10827 fstrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10828 {
10829   FRegister a;
10830   unsigned rn = INSTR (9, 5);
10831   unsigned st = INSTR (4, 0);
10832   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10833 
10834   if (wb != Post)
10835     address += offset;
10836 
10837   aarch64_get_FP_long_double (cpu, st, & a);
10838   aarch64_set_mem_long_double (cpu, address, a);
10839 
10840   if (wb == Post)
10841     address += offset;
10842 
10843   if (wb != NoWriteBack)
10844     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10845 }
10846 
10847 /* 128 bit store scaled or unscaled zero-
10848    or sign-extended 32-bit register offset.  */
10849 static void
10850 fstrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10851 {
10852   unsigned rm = INSTR (20, 16);
10853   unsigned rn = INSTR (9, 5);
10854   unsigned st = INSTR (4, 0);
10855 
10856   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10857   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10858 			       extension);
10859   uint64_t  displacement = OPT_SCALE (extended, 128, scaling);
10860 
10861   FRegister a;
10862 
10863   aarch64_get_FP_long_double (cpu, st, & a);
10864   aarch64_set_mem_long_double (cpu, address + displacement, a);
10865 }
10866 
10867 static void
10868 dexLoadImmediatePrePost (sim_cpu *cpu)
10869 {
10870   /* instr[31,30] = size
10871      instr[29,27] = 111
10872      instr[26]    = V
10873      instr[25,24] = 00
10874      instr[23,22] = opc
10875      instr[21]    = 0
10876      instr[20,12] = simm9
10877      instr[11]    = wb : 0 ==> Post, 1 ==> Pre
10878      instr[10]    = 0
10879      instr[9,5]   = Rn may be SP.
10880      instr[4,0]   = Rt */
10881 
10882   uint32_t  v        = INSTR (26, 26);
10883   uint32_t  dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10884   int32_t   imm      = simm32 (aarch64_get_instr (cpu), 20, 12);
10885   WriteBack wb       = INSTR (11, 11);
10886 
10887   if (!v)
10888     {
10889       /* GReg operations.  */
10890       switch (dispatch)
10891 	{
10892 	case 0:	 strb_wb (cpu, imm, wb); return;
10893 	case 1:	 ldrb32_wb (cpu, imm, wb); return;
10894 	case 2:	 ldrsb_wb (cpu, imm, wb); return;
10895 	case 3:	 ldrsb32_wb (cpu, imm, wb); return;
10896 	case 4:	 strh_wb (cpu, imm, wb); return;
10897 	case 5:	 ldrh32_wb (cpu, imm, wb); return;
10898 	case 6:	 ldrsh64_wb (cpu, imm, wb); return;
10899 	case 7:	 ldrsh32_wb (cpu, imm, wb); return;
10900 	case 8:	 str32_wb (cpu, imm, wb); return;
10901 	case 9:	 ldr32_wb (cpu, imm, wb); return;
10902 	case 10: ldrsw_wb (cpu, imm, wb); return;
10903 	case 12: str_wb (cpu, imm, wb); return;
10904 	case 13: ldr_wb (cpu, imm, wb); return;
10905 
10906 	default:
10907 	case 11:
10908 	case 14:
10909 	case 15:
10910 	  HALT_UNALLOC;
10911 	}
10912     }
10913 
10914   /* FReg operations.  */
10915   switch (dispatch)
10916     {
10917     case 2:  fstrq_wb (cpu, imm, wb); return;
10918     case 3:  fldrq_wb (cpu, imm, wb); return;
10919     case 8:  fstrs_wb (cpu, imm, wb); return;
10920     case 9:  fldrs_wb (cpu, imm, wb); return;
10921     case 12: fstrd_wb (cpu, imm, wb); return;
10922     case 13: fldrd_wb (cpu, imm, wb); return;
10923 
10924     case 0:	  /* STUR 8 bit FP.  */
10925     case 1:	  /* LDUR 8 bit FP.  */
10926     case 4:	  /* STUR 16 bit FP.  */
10927     case 5:	  /* LDUR 16 bit FP.  */
10928       HALT_NYI;
10929 
10930     default:
10931     case 6:
10932     case 7:
10933     case 10:
10934     case 11:
10935     case 14:
10936     case 15:
10937       HALT_UNALLOC;
10938     }
10939 }
10940 
10941 static void
10942 dexLoadRegisterOffset (sim_cpu *cpu)
10943 {
10944   /* instr[31,30] = size
10945      instr[29,27] = 111
10946      instr[26]    = V
10947      instr[25,24] = 00
10948      instr[23,22] = opc
10949      instr[21]    = 1
10950      instr[20,16] = rm
10951      instr[15,13] = option : 010 ==> UXTW, 011 ==> UXTX/LSL,
10952                              110 ==> SXTW, 111 ==> SXTX,
10953                              ow ==> RESERVED
10954      instr[12]    = scaled
10955      instr[11,10] = 10
10956      instr[9,5]   = rn
10957      instr[4,0]   = rt.  */
10958 
10959   uint32_t  v = INSTR (26, 26);
10960   uint32_t  dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10961   Scaling   scale = INSTR (12, 12);
10962   Extension extensionType = INSTR (15, 13);
10963 
10964   /* Check for illegal extension types.  */
10965   if (uimm (extensionType, 1, 1) == 0)
10966     HALT_UNALLOC;
10967 
10968   if (extensionType == UXTX || extensionType == SXTX)
10969     extensionType = NoExtension;
10970 
10971   if (!v)
10972     {
10973       /* GReg operations.  */
10974       switch (dispatch)
10975 	{
10976 	case 0:	 strb_scale_ext (cpu, scale, extensionType); return;
10977 	case 1:	 ldrb32_scale_ext (cpu, scale, extensionType); return;
10978 	case 2:	 ldrsb_scale_ext (cpu, scale, extensionType); return;
10979 	case 3:	 ldrsb32_scale_ext (cpu, scale, extensionType); return;
10980 	case 4:	 strh_scale_ext (cpu, scale, extensionType); return;
10981 	case 5:	 ldrh32_scale_ext (cpu, scale, extensionType); return;
10982 	case 6:	 ldrsh_scale_ext (cpu, scale, extensionType); return;
10983 	case 7:	 ldrsh32_scale_ext (cpu, scale, extensionType); return;
10984 	case 8:	 str32_scale_ext (cpu, scale, extensionType); return;
10985 	case 9:	 ldr32_scale_ext (cpu, scale, extensionType); return;
10986 	case 10: ldrsw_scale_ext (cpu, scale, extensionType); return;
10987 	case 12: str_scale_ext (cpu, scale, extensionType); return;
10988 	case 13: ldr_scale_ext (cpu, scale, extensionType); return;
10989 	case 14: prfm_scale_ext (cpu, scale, extensionType); return;
10990 
10991 	default:
10992 	case 11:
10993 	case 15:
10994 	  HALT_UNALLOC;
10995 	}
10996     }
10997 
10998   /* FReg operations.  */
10999   switch (dispatch)
11000     {
11001     case 1: /* LDUR 8 bit FP.  */
11002       HALT_NYI;
11003     case 3:  fldrq_scale_ext (cpu, scale, extensionType); return;
11004     case 5: /* LDUR 16 bit FP.  */
11005       HALT_NYI;
11006     case 9:  fldrs_scale_ext (cpu, scale, extensionType); return;
11007     case 13: fldrd_scale_ext (cpu, scale, extensionType); return;
11008 
11009     case 0:  fstrb_scale_ext (cpu, scale, extensionType); return;
11010     case 2:  fstrq_scale_ext (cpu, scale, extensionType); return;
11011     case 4:  fstrh_scale_ext (cpu, scale, extensionType); return;
11012     case 8:  fstrs_scale_ext (cpu, scale, extensionType); return;
11013     case 12: fstrd_scale_ext (cpu, scale, extensionType); return;
11014 
11015     default:
11016     case 6:
11017     case 7:
11018     case 10:
11019     case 11:
11020     case 14:
11021     case 15:
11022       HALT_UNALLOC;
11023     }
11024 }
11025 
11026 static void
11027 dexLoadUnsignedImmediate (sim_cpu *cpu)
11028 {
11029   /* instr[29,24] == 111_01
11030      instr[31,30] = size
11031      instr[26]    = V
11032      instr[23,22] = opc
11033      instr[21,10] = uimm12 : unsigned immediate offset
11034      instr[9,5]   = rn may be SP.
11035      instr[4,0]   = rt.  */
11036 
11037   uint32_t v = INSTR (26,26);
11038   uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
11039   uint32_t imm = INSTR (21, 10);
11040 
11041   if (!v)
11042     {
11043       /* GReg operations.  */
11044       switch (dispatch)
11045 	{
11046 	case 0:  strb_abs (cpu, imm); return;
11047 	case 1:  ldrb32_abs (cpu, imm); return;
11048 	case 2:  ldrsb_abs (cpu, imm); return;
11049 	case 3:  ldrsb32_abs (cpu, imm); return;
11050 	case 4:  strh_abs (cpu, imm); return;
11051 	case 5:  ldrh32_abs (cpu, imm); return;
11052 	case 6:  ldrsh_abs (cpu, imm); return;
11053 	case 7:  ldrsh32_abs (cpu, imm); return;
11054 	case 8:  str32_abs (cpu, imm); return;
11055 	case 9:  ldr32_abs (cpu, imm); return;
11056 	case 10: ldrsw_abs (cpu, imm); return;
11057 	case 12: str_abs (cpu, imm); return;
11058 	case 13: ldr_abs (cpu, imm); return;
11059 	case 14: prfm_abs (cpu, imm); return;
11060 
11061 	default:
11062 	case 11:
11063 	case 15:
11064 	  HALT_UNALLOC;
11065 	}
11066     }
11067 
11068   /* FReg operations.  */
11069   switch (dispatch)
11070     {
11071     case 0:  fstrb_abs (cpu, imm); return;
11072     case 4:  fstrh_abs (cpu, imm); return;
11073     case 8:  fstrs_abs (cpu, imm); return;
11074     case 12: fstrd_abs (cpu, imm); return;
11075     case 2:  fstrq_abs (cpu, imm); return;
11076 
11077     case 1:  fldrb_abs (cpu, imm); return;
11078     case 5:  fldrh_abs (cpu, imm); return;
11079     case 9:  fldrs_abs (cpu, imm); return;
11080     case 13: fldrd_abs (cpu, imm); return;
11081     case 3:  fldrq_abs (cpu, imm); return;
11082 
11083     default:
11084     case 6:
11085     case 7:
11086     case 10:
11087     case 11:
11088     case 14:
11089     case 15:
11090       HALT_UNALLOC;
11091     }
11092 }
11093 
11094 static void
11095 dexLoadExclusive (sim_cpu *cpu)
11096 {
11097   /* assert instr[29:24] = 001000;
11098      instr[31,30] = size
11099      instr[23] = 0 if exclusive
11100      instr[22] = L : 1 if load, 0 if store
11101      instr[21] = 1 if pair
11102      instr[20,16] = Rs
11103      instr[15] = o0 : 1 if ordered
11104      instr[14,10] = Rt2
11105      instr[9,5] = Rn
11106      instr[4,0] = Rt.  */
11107 
11108   switch (INSTR (22, 21))
11109     {
11110     case 2:   ldxr (cpu); return;
11111     case 0:   stxr (cpu); return;
11112     default:  HALT_NYI;
11113     }
11114 }
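
/* Only the non-pair exclusives are modelled: instr[22,21] == 10
   (load) is dispatched to ldxr and 00 (store) to stxr; the pair
   forms halt as not-yet-implemented.  */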
11115 
11116 static void
11117 dexLoadOther (sim_cpu *cpu)
11118 {
11119   uint32_t dispatch;
11120 
11121   /* instr[29,25] = 111_0
11122      instr[24] == 0 ==> dispatch, 1 ==> ldst reg unsigned immediate
11123      instr[21] and instr[11,10] form the secondary dispatch.  */
11124   if (INSTR (24, 24))
11125     {
11126       dexLoadUnsignedImmediate (cpu);
11127       return;
11128     }
11129 
11130   dispatch = ((INSTR (21, 21) << 2) | INSTR (11, 10));
11131   switch (dispatch)
11132     {
11133     case 0: dexLoadUnscaledImmediate (cpu); return;
11134     case 1: dexLoadImmediatePrePost (cpu); return;
11135     case 3: dexLoadImmediatePrePost (cpu); return;
11136     case 6: dexLoadRegisterOffset (cpu); return;
11137 
11138     default:
11139     case 2:
11140     case 4:
11141     case 5:
11142     case 7:
11143       HALT_NYI;
11144     }
11145 }
11146 
11147 static void
11148 store_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
11149 {
11150   unsigned rn = INSTR (14, 10);
11151   unsigned rd = INSTR (9, 5);
11152   unsigned rm = INSTR (4, 0);
11153   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11154 
11155   if ((rn == rd || rm == rd) && wb != NoWriteBack)
11156     HALT_UNALLOC; /* ??? */
11157 
11158   offset <<= 2;
11159 
11160   if (wb != Post)
11161     address += offset;
11162 
11163   aarch64_set_mem_u32 (cpu, address,
11164 		       aarch64_get_reg_u32 (cpu, rm, NO_SP));
11165   aarch64_set_mem_u32 (cpu, address + 4,
11166 		       aarch64_get_reg_u32 (cpu, rn, NO_SP));
11167 
11168   if (wb == Post)
11169     address += offset;
11170 
11171   if (wb != NoWriteBack)
11172     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11173 }
11174 
11175 static void
11176 store_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
11177 {
11178   unsigned rn = INSTR (14, 10);
11179   unsigned rd = INSTR (9, 5);
11180   unsigned rm = INSTR (4, 0);
11181   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11182 
11183   if ((rn == rd || rm == rd) && wb != NoWriteBack)
11184     HALT_UNALLOC; /* ??? */
11185 
11186   offset <<= 3;
11187 
11188   if (wb != Post)
11189     address += offset;
11190 
11191   aarch64_set_mem_u64 (cpu, address,
11192 		       aarch64_get_reg_u64 (cpu, rm, NO_SP));
11193   aarch64_set_mem_u64 (cpu, address + 8,
11194 		       aarch64_get_reg_u64 (cpu, rn, NO_SP));
11195 
11196   if (wb == Post)
11197     address += offset;
11198 
11199   if (wb != NoWriteBack)
11200     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11201 }
11202 
11203 static void
11204 load_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
11205 {
11206   unsigned rn = INSTR (14, 10);
11207   unsigned rd = INSTR (9, 5);
11208   unsigned rm = INSTR (4, 0);
11209   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11210 
  /* Loading both values into the same register is UNPREDICTABLE;
     treat it as UNALLOC so we never execute it.  */
11212   if (rn == rm)
11213     HALT_UNALLOC;
11214 
11215   offset <<= 2;
11216 
11217   if (wb != Post)
11218     address += offset;
11219 
11220   aarch64_set_reg_u64 (cpu, rm, SP_OK, aarch64_get_mem_u32 (cpu, address));
11221   aarch64_set_reg_u64 (cpu, rn, SP_OK, aarch64_get_mem_u32 (cpu, address + 4));
11222 
11223   if (wb == Post)
11224     address += offset;
11225 
11226   if (wb != NoWriteBack)
11227     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11228 }
11229 
11230 static void
11231 load_pair_s32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
11232 {
11233   unsigned rn = INSTR (14, 10);
11234   unsigned rd = INSTR (9, 5);
11235   unsigned rm = INSTR (4, 0);
11236   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11237 
  /* Loading both values into the same register is UNPREDICTABLE;
     treat it as UNALLOC so we never execute it.  */
11239   if (rn == rm)
11240     HALT_UNALLOC;
11241 
11242   offset <<= 2;
11243 
11244   if (wb != Post)
11245     address += offset;
11246 
11247   aarch64_set_reg_s64 (cpu, rm, SP_OK, aarch64_get_mem_s32 (cpu, address));
11248   aarch64_set_reg_s64 (cpu, rn, SP_OK, aarch64_get_mem_s32 (cpu, address + 4));
11249 
11250   if (wb == Post)
11251     address += offset;
11252 
11253   if (wb != NoWriteBack)
11254     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11255 }
11256 
11257 static void
11258 load_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
11259 {
11260   unsigned rn = INSTR (14, 10);
11261   unsigned rd = INSTR (9, 5);
11262   unsigned rm = INSTR (4, 0);
11263   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11264 
  /* Loading both values into the same register is UNPREDICTABLE;
     treat it as UNALLOC so we never execute it.  */
11266   if (rn == rm)
11267     HALT_UNALLOC;
11268 
11269   offset <<= 3;
11270 
11271   if (wb != Post)
11272     address += offset;
11273 
11274   aarch64_set_reg_u64 (cpu, rm, SP_OK, aarch64_get_mem_u64 (cpu, address));
11275   aarch64_set_reg_u64 (cpu, rn, SP_OK, aarch64_get_mem_u64 (cpu, address + 8));
11276 
11277   if (wb == Post)
11278     address += offset;
11279 
11280   if (wb != NoWriteBack)
11281     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11282 }
11283 
11284 static void
11285 dex_load_store_pair_gr (sim_cpu *cpu)
11286 {
11287   /* instr[31,30] = size (10=> 64-bit, 01=> signed 32-bit, 00=> 32-bit)
11288      instr[29,25] = instruction encoding: 101_0
11289      instr[26]    = V : 1 if fp 0 if gp
11290      instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
11291      instr[22]    = load/store (1=> load)
11292      instr[21,15] = signed, scaled, offset
11293      instr[14,10] = Rn
11294      instr[ 9, 5] = Rd
11295      instr[ 4, 0] = Rm.  */
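
  /* For example, STP x1, x2, [sp, #16] has size 10, addressing mode 10
     (offset) and L = 0, giving dispatch (2 << 3) | 4 == 20, which
     selects store_pair_u64 with NoWriteBack.  */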
11296 
11297   uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22));
11298   int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
11299 
11300   switch (dispatch)
11301     {
11302     case 2: store_pair_u32 (cpu, offset, Post); return;
11303     case 3: load_pair_u32  (cpu, offset, Post); return;
11304     case 4: store_pair_u32 (cpu, offset, NoWriteBack); return;
11305     case 5: load_pair_u32  (cpu, offset, NoWriteBack); return;
11306     case 6: store_pair_u32 (cpu, offset, Pre); return;
11307     case 7: load_pair_u32  (cpu, offset, Pre); return;
11308 
11309     case 11: load_pair_s32  (cpu, offset, Post); return;
11310     case 13: load_pair_s32  (cpu, offset, NoWriteBack); return;
11311     case 15: load_pair_s32  (cpu, offset, Pre); return;
11312 
11313     case 18: store_pair_u64 (cpu, offset, Post); return;
11314     case 19: load_pair_u64  (cpu, offset, Post); return;
11315     case 20: store_pair_u64 (cpu, offset, NoWriteBack); return;
11316     case 21: load_pair_u64  (cpu, offset, NoWriteBack); return;
11317     case 22: store_pair_u64 (cpu, offset, Pre); return;
11318     case 23: load_pair_u64  (cpu, offset, Pre); return;
11319 
11320     default:
11321       HALT_UNALLOC;
11322     }
11323 }
11324 
11325 static void
11326 store_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
11327 {
11328   unsigned rn = INSTR (14, 10);
11329   unsigned rd = INSTR (9, 5);
11330   unsigned rm = INSTR (4, 0);
11331   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11332 
11333   offset <<= 2;
11334 
11335   if (wb != Post)
11336     address += offset;
11337 
11338   aarch64_set_mem_u32 (cpu, address,     aarch64_get_vec_u32 (cpu, rm, 0));
11339   aarch64_set_mem_u32 (cpu, address + 4, aarch64_get_vec_u32 (cpu, rn, 0));
11340 
11341   if (wb == Post)
11342     address += offset;
11343 
11344   if (wb != NoWriteBack)
11345     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11346 }
11347 
11348 static void
11349 store_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11350 {
11351   unsigned rn = INSTR (14, 10);
11352   unsigned rd = INSTR (9, 5);
11353   unsigned rm = INSTR (4, 0);
11354   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11355 
11356   offset <<= 3;
11357 
11358   if (wb != Post)
11359     address += offset;
11360 
11361   aarch64_set_mem_u64 (cpu, address,     aarch64_get_vec_u64 (cpu, rm, 0));
11362   aarch64_set_mem_u64 (cpu, address + 8, aarch64_get_vec_u64 (cpu, rn, 0));
11363 
11364   if (wb == Post)
11365     address += offset;
11366 
11367   if (wb != NoWriteBack)
11368     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11369 }
11370 
11371 static void
11372 store_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11373 {
11374   FRegister a;
11375   unsigned rn = INSTR (14, 10);
11376   unsigned rd = INSTR (9, 5);
11377   unsigned rm = INSTR (4, 0);
11378   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11379 
11380   offset <<= 4;
11381 
11382   if (wb != Post)
11383     address += offset;
11384 
11385   aarch64_get_FP_long_double (cpu, rm, & a);
11386   aarch64_set_mem_long_double (cpu, address, a);
11387   aarch64_get_FP_long_double (cpu, rn, & a);
11388   aarch64_set_mem_long_double (cpu, address + 16, a);
11389 
11390   if (wb == Post)
11391     address += offset;
11392 
11393   if (wb != NoWriteBack)
11394     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11395 }
11396 
11397 static void
11398 load_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
11399 {
11400   unsigned rn = INSTR (14, 10);
11401   unsigned rd = INSTR (9, 5);
11402   unsigned rm = INSTR (4, 0);
11403   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11404 
11405   if (rm == rn)
11406     HALT_UNALLOC;
11407 
11408   offset <<= 2;
11409 
11410   if (wb != Post)
11411     address += offset;
11412 
11413   aarch64_set_vec_u32 (cpu, rm, 0, aarch64_get_mem_u32 (cpu, address));
11414   aarch64_set_vec_u32 (cpu, rn, 0, aarch64_get_mem_u32 (cpu, address + 4));
11415 
11416   if (wb == Post)
11417     address += offset;
11418 
11419   if (wb != NoWriteBack)
11420     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11421 }
11422 
11423 static void
11424 load_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11425 {
11426   unsigned rn = INSTR (14, 10);
11427   unsigned rd = INSTR (9, 5);
11428   unsigned rm = INSTR (4, 0);
11429   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11430 
11431   if (rm == rn)
11432     HALT_UNALLOC;
11433 
11434   offset <<= 3;
11435 
11436   if (wb != Post)
11437     address += offset;
11438 
11439   aarch64_set_vec_u64 (cpu, rm, 0, aarch64_get_mem_u64 (cpu, address));
11440   aarch64_set_vec_u64 (cpu, rn, 0, aarch64_get_mem_u64 (cpu, address + 8));
11441 
11442   if (wb == Post)
11443     address += offset;
11444 
11445   if (wb != NoWriteBack)
11446     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11447 }
11448 
11449 static void
11450 load_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11451 {
11452   FRegister a;
11453   unsigned rn = INSTR (14, 10);
11454   unsigned rd = INSTR (9, 5);
11455   unsigned rm = INSTR (4, 0);
11456   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11457 
11458   if (rm == rn)
11459     HALT_UNALLOC;
11460 
11461   offset <<= 4;
11462 
11463   if (wb != Post)
11464     address += offset;
11465 
11466   aarch64_get_mem_long_double (cpu, address, & a);
11467   aarch64_set_FP_long_double (cpu, rm, a);
11468   aarch64_get_mem_long_double (cpu, address + 16, & a);
11469   aarch64_set_FP_long_double (cpu, rn, a);
11470 
11471   if (wb == Post)
11472     address += offset;
11473 
11474   if (wb != NoWriteBack)
11475     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11476 }
11477 
11478 static void
11479 dex_load_store_pair_fp (sim_cpu *cpu)
11480 {
11481   /* instr[31,30] = size (10=> 128-bit, 01=> 64-bit, 00=> 32-bit)
11482      instr[29,25] = instruction encoding
11483      instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
11484      instr[22]    = load/store (1=> load)
11485      instr[21,15] = signed, scaled, offset
11486      instr[14,10] = Rn
11487      instr[ 9, 5] = Rd
11488      instr[ 4, 0] = Rm  */
11489 
11490   uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22));
11491   int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
11492 
11493   switch (dispatch)
11494     {
11495     case 2: store_pair_float (cpu, offset, Post); return;
11496     case 3: load_pair_float  (cpu, offset, Post); return;
11497     case 4: store_pair_float (cpu, offset, NoWriteBack); return;
11498     case 5: load_pair_float  (cpu, offset, NoWriteBack); return;
11499     case 6: store_pair_float (cpu, offset, Pre); return;
11500     case 7: load_pair_float  (cpu, offset, Pre); return;
11501 
11502     case 10: store_pair_double (cpu, offset, Post); return;
11503     case 11: load_pair_double  (cpu, offset, Post); return;
11504     case 12: store_pair_double (cpu, offset, NoWriteBack); return;
11505     case 13: load_pair_double  (cpu, offset, NoWriteBack); return;
11506     case 14: store_pair_double (cpu, offset, Pre); return;
11507     case 15: load_pair_double  (cpu, offset, Pre); return;
11508 
11509     case 18: store_pair_long_double (cpu, offset, Post); return;
11510     case 19: load_pair_long_double  (cpu, offset, Post); return;
11511     case 20: store_pair_long_double (cpu, offset, NoWriteBack); return;
11512     case 21: load_pair_long_double  (cpu, offset, NoWriteBack); return;
11513     case 22: store_pair_long_double (cpu, offset, Pre); return;
11514     case 23: load_pair_long_double  (cpu, offset, Pre); return;
11515 
11516     default:
11517       HALT_UNALLOC;
11518     }
11519 }
11520 
11521 static inline unsigned
11522 vec_reg (unsigned v, unsigned o)
11523 {
  /* The vector register file wraps modulo 32 registers.  */
  return (v + o) & 0x1F;
11525 }
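
/* For example, an LD4 whose first destination is V30 uses
   vec_reg (30, 0..3) and so wraps around to V30, V31, V0 and V1.  */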
11526 
11527 /* Load multiple N-element structures to M consecutive registers.  */
11528 static void
11529 vec_load (sim_cpu *cpu, uint64_t address, unsigned n, unsigned m)
11530 {
11531   int      all  = INSTR (30, 30);
11532   unsigned size = INSTR (11, 10);
11533   unsigned vd   = INSTR (4, 0);
11534   unsigned rpt = (n == m) ? 1 : m;
11535   unsigned selem = n;
11536   unsigned i, j, k;
11537 
11538   switch (size)
11539     {
11540     case 0: /* 8-bit operations.  */
11541       for (i = 0; i < rpt; i++)
11542 	for (j = 0; j < (8 + (8 * all)); j++)
11543 	  for (k = 0; k < selem; k++)
11544 	    {
11545 	      aarch64_set_vec_u8 (cpu, vec_reg (vd, i + k), j,
11546 				  aarch64_get_mem_u8 (cpu, address));
11547 	      address += 1;
11548 	    }
11549       return;
11550 
11551     case 1: /* 16-bit operations.  */
11552       for (i = 0; i < rpt; i++)
11553 	for (j = 0; j < (4 + (4 * all)); j++)
11554 	  for (k = 0; k < selem; k++)
11555 	    {
11556 	      aarch64_set_vec_u16 (cpu, vec_reg (vd, i + k), j,
11557 				   aarch64_get_mem_u16 (cpu, address));
11558 	      address += 2;
11559 	    }
11560       return;
11561 
11562     case 2: /* 32-bit operations.  */
11563       for (i = 0; i < rpt; i++)
11564 	for (j = 0; j < (2 + (2 * all)); j++)
11565 	  for (k = 0; k < selem; k++)
11566 	    {
11567 	      aarch64_set_vec_u32 (cpu, vec_reg (vd, i + k), j,
11568 				   aarch64_get_mem_u32 (cpu, address));
11569 	      address += 4;
11570 	    }
11571       return;
11572 
11573     case 3: /* 64-bit operations.  */
11574       for (i = 0; i < rpt; i++)
11575 	for (j = 0; j < (1 + all); j++)
11576 	  for (k = 0; k < selem; k++)
11577 	    {
11578 	      aarch64_set_vec_u64 (cpu, vec_reg (vd, i + k), j,
11579 				   aarch64_get_mem_u64 (cpu, address));
11580 	      address += 8;
11581 	    }
11582       return;
11583     }
11584 }
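
/* For example, an LD2 with 16 bit elements de-interleaves memory: lane j
   of Vd receives halfword 2*j and lane j of Vd+1 receives halfword
   2*j + 1.  */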
11585 
11586 /* Load multiple 4-element structures into four consecutive registers.  */
11587 static void
11588 LD4 (sim_cpu *cpu, uint64_t address)
11589 {
11590   vec_load (cpu, address, 4, 4);
11591 }
11592 
11593 /* Load multiple 3-element structures into three consecutive registers.  */
11594 static void
11595 LD3 (sim_cpu *cpu, uint64_t address)
11596 {
11597   vec_load (cpu, address, 3, 3);
11598 }
11599 
11600 /* Load multiple 2-element structures into two consecutive registers.  */
11601 static void
11602 LD2 (sim_cpu *cpu, uint64_t address)
11603 {
11604   vec_load (cpu, address, 2, 2);
11605 }
11606 
11607 /* Load multiple 1-element structures into one register.  */
11608 static void
11609 LD1_1 (sim_cpu *cpu, uint64_t address)
11610 {
11611   vec_load (cpu, address, 1, 1);
11612 }
11613 
11614 /* Load multiple 1-element structures into two registers.  */
11615 static void
11616 LD1_2 (sim_cpu *cpu, uint64_t address)
11617 {
11618   vec_load (cpu, address, 1, 2);
11619 }
11620 
11621 /* Load multiple 1-element structures into three registers.  */
11622 static void
11623 LD1_3 (sim_cpu *cpu, uint64_t address)
11624 {
11625   vec_load (cpu, address, 1, 3);
11626 }
11627 
11628 /* Load multiple 1-element structures into four registers.  */
11629 static void
11630 LD1_4 (sim_cpu *cpu, uint64_t address)
11631 {
11632   vec_load (cpu, address, 1, 4);
11633 }
11634 
11635 /* Store multiple N-element structures from M consecutive registers.  */
11636 static void
11637 vec_store (sim_cpu *cpu, uint64_t address, unsigned n, unsigned m)
11638 {
11639   int      all  = INSTR (30, 30);
11640   unsigned size = INSTR (11, 10);
11641   unsigned vd   = INSTR (4, 0);
11642   unsigned rpt = (n == m) ? 1 : m;
11643   unsigned selem = n;
11644   unsigned i, j, k;
11645 
11646   switch (size)
11647     {
11648     case 0: /* 8-bit operations.  */
11649       for (i = 0; i < rpt; i++)
11650 	for (j = 0; j < (8 + (8 * all)); j++)
11651 	  for (k = 0; k < selem; k++)
11652 	    {
11653 	      aarch64_set_mem_u8
11654 		(cpu, address,
11655 		 aarch64_get_vec_u8 (cpu, vec_reg (vd, i + k), j));
11656 	      address += 1;
11657 	    }
11658       return;
11659 
11660     case 1: /* 16-bit operations.  */
11661       for (i = 0; i < rpt; i++)
11662 	for (j = 0; j < (4 + (4 * all)); j++)
11663 	  for (k = 0; k < selem; k++)
11664 	    {
11665 	      aarch64_set_mem_u16
11666 		(cpu, address,
11667 		 aarch64_get_vec_u16 (cpu, vec_reg (vd, i + k), j));
11668 	      address += 2;
11669 	    }
11670       return;
11671 
11672     case 2: /* 32-bit operations.  */
11673       for (i = 0; i < rpt; i++)
11674 	for (j = 0; j < (2 + (2 * all)); j++)
11675 	  for (k = 0; k < selem; k++)
11676 	    {
11677 	      aarch64_set_mem_u32
11678 		(cpu, address,
11679 		 aarch64_get_vec_u32 (cpu, vec_reg (vd, i + k), j));
11680 	      address += 4;
11681 	    }
11682       return;
11683 
11684     case 3: /* 64-bit operations.  */
11685       for (i = 0; i < rpt; i++)
11686 	for (j = 0; j < (1 + all); j++)
11687 	  for (k = 0; k < selem; k++)
11688 	    {
11689 	      aarch64_set_mem_u64
11690 		(cpu, address,
11691 		 aarch64_get_vec_u64 (cpu, vec_reg (vd, i + k), j));
11692 	      address += 8;
11693 	    }
11694       return;
11695     }
11696 }
11697 
11698 /* Store multiple 4-element structure from four consecutive registers.  */
11699 static void
11700 ST4 (sim_cpu *cpu, uint64_t address)
11701 {
11702   vec_store (cpu, address, 4, 4);
11703 }
11704 
11705 /* Store multiple 3-element structures from three consecutive registers.  */
11706 static void
11707 ST3 (sim_cpu *cpu, uint64_t address)
11708 {
11709   vec_store (cpu, address, 3, 3);
11710 }
11711 
11712 /* Store multiple 2-element structures from two consecutive registers.  */
11713 static void
11714 ST2 (sim_cpu *cpu, uint64_t address)
11715 {
11716   vec_store (cpu, address, 2, 2);
11717 }
11718 
11719 /* Store multiple 1-element structures from one register.  */
11720 static void
11721 ST1_1 (sim_cpu *cpu, uint64_t address)
11722 {
11723   vec_store (cpu, address, 1, 1);
11724 }
11725 
11726 /* Store multiple 1-element structures from two registers.  */
11727 static void
11728 ST1_2 (sim_cpu *cpu, uint64_t address)
11729 {
11730   vec_store (cpu, address, 1, 2);
11731 }
11732 
11733 /* Store multiple 1-element structures from three registers.  */
11734 static void
11735 ST1_3 (sim_cpu *cpu, uint64_t address)
11736 {
11737   vec_store (cpu, address, 1, 3);
11738 }
11739 
11740 /* Store multiple 1-element structures from four registers.  */
11741 static void
11742 ST1_4 (sim_cpu *cpu, uint64_t address)
11743 {
11744   vec_store (cpu, address, 1, 4);
11745 }
11746 
11747 #define LDn_STn_SINGLE_LANE_AND_SIZE()				\
11748   do								\
11749     {								\
11750       switch (INSTR (15, 14))					\
11751 	{							\
11752 	case 0:							\
11753 	  lane = (full << 3) | (s << 2) | size;			\
11754 	  size = 0;						\
11755 	  break;						\
11756 								\
11757 	case 1:							\
11758 	  if ((size & 1) == 1)					\
11759 	    HALT_UNALLOC;					\
11760 	  lane = (full << 2) | (s << 1) | (size >> 1);		\
11761 	  size = 1;						\
11762 	  break;						\
11763 								\
11764 	case 2:							\
11765 	  if ((size & 2) == 2)					\
11766 	    HALT_UNALLOC;					\
11767 								\
11768 	  if ((size & 1) == 0)					\
11769 	    {							\
11770 	      lane = (full << 1) | s;				\
11771 	      size = 2;						\
11772 	    }							\
11773 	  else							\
11774 	    {							\
11775 	      if (s)						\
11776 		HALT_UNALLOC;					\
11777 	      lane = full;					\
11778 	      size = 3;						\
11779 	    }							\
11780 	  break;						\
11781 								\
11782 	default:						\
11783 	  HALT_UNALLOC;						\
11784 	}							\
11785     }								\
11786   while (0)
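
/* The macro above recovers the lane number from Q (full), S and the
   spare bits of the size field.  For the byte form (instr[15,14] == 0)
   the lane is Q:S:size, so Q = 1, S = 1, size = 3 selects lane 15 and
   the access width becomes one byte.  */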
11787 
11788 /* Load single structure into one lane of N registers.  */
11789 static void
11790 do_vec_LDn_single (sim_cpu *cpu, uint64_t address)
11791 {
11792   /* instr[31]    = 0
11793      instr[30]    = element selector 0=>half, 1=>all elements
11794      instr[29,24] = 00 1101
11795      instr[23]    = 0=>simple, 1=>post
11796      instr[22]    = 1
11797      instr[21]    = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
11798      instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11799                       11111 (immediate post inc)
11800      instr[15,13] = opcode
11801      instr[12]    = S, used for lane number
11802      instr[11,10] = size, also used for lane number
11803      instr[9,5]   = address
11804      instr[4,0]   = Vd  */
11805 
11806   unsigned full = INSTR (30, 30);
11807   unsigned vd = INSTR (4, 0);
11808   unsigned size = INSTR (11, 10);
11809   unsigned s = INSTR (12, 12);
11810   int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11811   int lane = 0;
11812   int i;
11813 
11814   NYI_assert (29, 24, 0x0D);
11815   NYI_assert (22, 22, 1);
11816 
11817   /* Compute the lane number first (using size), and then compute size.  */
11818   LDn_STn_SINGLE_LANE_AND_SIZE ();
11819 
11820   for (i = 0; i < nregs; i++)
11821     switch (size)
11822       {
11823       case 0:
11824 	{
11825 	  uint8_t val = aarch64_get_mem_u8 (cpu, address + i);
11826 	  aarch64_set_vec_u8 (cpu, vd + i, lane, val);
11827 	  break;
11828 	}
11829 
11830       case 1:
11831 	{
11832 	  uint16_t val = aarch64_get_mem_u16 (cpu, address + (i * 2));
11833 	  aarch64_set_vec_u16 (cpu, vd + i, lane, val);
11834 	  break;
11835 	}
11836 
11837       case 2:
11838 	{
11839 	  uint32_t val = aarch64_get_mem_u32 (cpu, address + (i * 4));
11840 	  aarch64_set_vec_u32 (cpu, vd + i, lane, val);
11841 	  break;
11842 	}
11843 
11844       case 3:
11845 	{
11846 	  uint64_t val = aarch64_get_mem_u64 (cpu, address + (i * 8));
11847 	  aarch64_set_vec_u64 (cpu, vd + i, lane, val);
11848 	  break;
11849 	}
11850       }
11851 }
11852 
11853 /* Store single structure from one lane from N registers.  */
11854 static void
11855 do_vec_STn_single (sim_cpu *cpu, uint64_t address)
11856 {
11857   /* instr[31]    = 0
11858      instr[30]    = element selector 0=>half, 1=>all elements
11859      instr[29,24] = 00 1101
11860      instr[23]    = 0=>simple, 1=>post
11861      instr[22]    = 0
     instr[21]    = width: ST1-or-ST3 (0) / ST2-or-ST4 (1)
11863      instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11864                       11111 (immediate post inc)
11865      instr[15,13] = opcode
11866      instr[12]    = S, used for lane number
11867      instr[11,10] = size, also used for lane number
11868      instr[9,5]   = address
11869      instr[4,0]   = Vd  */
11870 
11871   unsigned full = INSTR (30, 30);
11872   unsigned vd = INSTR (4, 0);
11873   unsigned size = INSTR (11, 10);
11874   unsigned s = INSTR (12, 12);
11875   int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11876   int lane = 0;
11877   int i;
11878 
11879   NYI_assert (29, 24, 0x0D);
11880   NYI_assert (22, 22, 0);
11881 
11882   /* Compute the lane number first (using size), and then compute size.  */
11883   LDn_STn_SINGLE_LANE_AND_SIZE ();
11884 
11885   for (i = 0; i < nregs; i++)
11886     switch (size)
11887       {
11888       case 0:
11889 	{
11890 	  uint8_t val = aarch64_get_vec_u8 (cpu, vd + i, lane);
11891 	  aarch64_set_mem_u8 (cpu, address + i, val);
11892 	  break;
11893 	}
11894 
11895       case 1:
11896 	{
11897 	  uint16_t val = aarch64_get_vec_u16 (cpu, vd + i, lane);
11898 	  aarch64_set_mem_u16 (cpu, address + (i * 2), val);
11899 	  break;
11900 	}
11901 
11902       case 2:
11903 	{
11904 	  uint32_t val = aarch64_get_vec_u32 (cpu, vd + i, lane);
11905 	  aarch64_set_mem_u32 (cpu, address + (i * 4), val);
11906 	  break;
11907 	}
11908 
11909       case 3:
11910 	{
11911 	  uint64_t val = aarch64_get_vec_u64 (cpu, vd + i, lane);
11912 	  aarch64_set_mem_u64 (cpu, address + (i * 8), val);
11913 	  break;
11914 	}
11915       }
11916 }
11917 
11918 /* Load single structure into all lanes of N registers.  */
11919 static void
11920 do_vec_LDnR (sim_cpu *cpu, uint64_t address)
11921 {
11922   /* instr[31]    = 0
11923      instr[30]    = element selector 0=>half, 1=>all elements
11924      instr[29,24] = 00 1101
11925      instr[23]    = 0=>simple, 1=>post
11926      instr[22]    = 1
11927      instr[21]    = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1)
11928      instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11929                       11111 (immediate post inc)
11930      instr[15,14] = 11
11931      instr[13]    = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1)
11932      instr[12]    = 0
11933      instr[11,10] = element size 00=> byte(b), 01=> half(h),
11934                                  10=> word(s), 11=> double(d)
11935      instr[9,5]   = address
11936      instr[4,0]   = Vd  */
11937 
11938   unsigned full = INSTR (30, 30);
11939   unsigned vd = INSTR (4, 0);
11940   unsigned size = INSTR (11, 10);
11941   int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11942   int i, n;
11943 
11944   NYI_assert (29, 24, 0x0D);
11945   NYI_assert (22, 22, 1);
11946   NYI_assert (15, 14, 3);
11947   NYI_assert (12, 12, 0);
11948 
11949   for (n = 0; n < nregs; n++)
11950     switch (size)
11951       {
11952       case 0:
11953 	{
11954 	  uint8_t val = aarch64_get_mem_u8 (cpu, address + n);
11955 	  for (i = 0; i < (full ? 16 : 8); i++)
11956 	    aarch64_set_vec_u8 (cpu, vd + n, i, val);
11957 	  break;
11958 	}
11959 
11960       case 1:
11961 	{
11962 	  uint16_t val = aarch64_get_mem_u16 (cpu, address + (n * 2));
11963 	  for (i = 0; i < (full ? 8 : 4); i++)
11964 	    aarch64_set_vec_u16 (cpu, vd + n, i, val);
11965 	  break;
11966 	}
11967 
11968       case 2:
11969 	{
11970 	  uint32_t val = aarch64_get_mem_u32 (cpu, address + (n * 4));
11971 	  for (i = 0; i < (full ? 4 : 2); i++)
11972 	    aarch64_set_vec_u32 (cpu, vd + n, i, val);
11973 	  break;
11974 	}
11975 
11976       case 3:
11977 	{
11978 	  uint64_t val = aarch64_get_mem_u64 (cpu, address + (n * 8));
11979 	  for (i = 0; i < (full ? 2 : 1); i++)
11980 	    aarch64_set_vec_u64 (cpu, vd + n, i, val);
11981 	  break;
11982 	}
11983 
11984       default:
11985 	HALT_UNALLOC;
11986       }
11987 }
11988 
11989 static void
11990 do_vec_load_store (sim_cpu *cpu)
11991 {
11992   /* {LD|ST}<N>   {Vd..Vd+N}, vaddr
11993 
11994      instr[31]    = 0
11995      instr[30]    = element selector 0=>half, 1=>all elements
11996      instr[29,25] = 00110
11997      instr[24]    = 0=>multiple struct, 1=>single struct
11998      instr[23]    = 0=>simple, 1=>post
11999      instr[22]    = 0=>store, 1=>load
12000      instr[21]    = 0 (LDn) / small(0)-large(1) selector (LDnR)
12001      instr[20,16] = 00000 (simple), Vinc (reg-post-inc, no SP),
12002                     11111 (immediate post inc)
     instr[15,12] = elements and destinations.  E.g. for load:
12004                      0000=>LD4 => load multiple 4-element to
12005 		     four consecutive registers
12006                      0100=>LD3 => load multiple 3-element to
12007 		     three consecutive registers
12008                      1000=>LD2 => load multiple 2-element to
12009 		     two consecutive registers
12010                      0010=>LD1 => load multiple 1-element to
12011 		     four consecutive registers
12012                      0110=>LD1 => load multiple 1-element to
12013 		     three consecutive registers
12014                      1010=>LD1 => load multiple 1-element to
12015 		     two consecutive registers
12016                      0111=>LD1 => load multiple 1-element to
12017 		     one register
                     1100=>LD1R,LD2R
                     1110=>LD3R,LD4R
12020      instr[11,10] = element size 00=> byte(b), 01=> half(h),
12021                                  10=> word(s), 11=> double(d)
12022      instr[9,5]   = Vn, can be SP
12023      instr[4,0]   = Vd  */
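
  /* For example, LD4 {v0.16b-v3.16b}, [x1], #64 has single = 0, post = 1,
     load = 1, type = 0 and Vm = 31, so the base register is advanced by
     sizeof_operation = 32 * 2 == 64 bytes after the transfer.  */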
12024 
12025   int single;
12026   int post;
12027   int load;
12028   unsigned vn;
12029   uint64_t address;
12030   int type;
12031 
12032   if (INSTR (31, 31) != 0 || INSTR (29, 25) != 0x06)
12033     HALT_NYI;
12034 
12035   single = INSTR (24, 24);
12036   post = INSTR (23, 23);
12037   load = INSTR (22, 22);
12038   type = INSTR (15, 12);
12039   vn = INSTR (9, 5);
12040   address = aarch64_get_reg_u64 (cpu, vn, SP_OK);
12041 
12042   if (! single && INSTR (21, 21) != 0)
12043     HALT_UNALLOC;
12044 
12045   if (post)
12046     {
12047       unsigned vm = INSTR (20, 16);
12048 
12049       if (vm == R31)
12050 	{
12051 	  unsigned sizeof_operation;
12052 
12053 	  if (single)
12054 	    {
12055 	      if ((type >= 0) && (type <= 11))
12056 		{
12057 		  int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
12058 		  switch (INSTR (15, 14))
12059 		    {
12060 		    case 0:
12061 		      sizeof_operation = nregs * 1;
12062 		      break;
12063 		    case 1:
12064 		      sizeof_operation = nregs * 2;
12065 		      break;
12066 		    case 2:
12067 		      if (INSTR (10, 10) == 0)
12068 			sizeof_operation = nregs * 4;
12069 		      else
12070 			sizeof_operation = nregs * 8;
12071 		      break;
12072 		    default:
12073 		      HALT_UNALLOC;
12074 		    }
12075 		}
12076 	      else if (type == 0xC)
12077 		{
12078 		  sizeof_operation = INSTR (21, 21) ? 2 : 1;
12079 		  sizeof_operation <<= INSTR (11, 10);
12080 		}
12081 	      else if (type == 0xE)
12082 		{
12083 		  sizeof_operation = INSTR (21, 21) ? 4 : 3;
12084 		  sizeof_operation <<= INSTR (11, 10);
12085 		}
12086 	      else
12087 		HALT_UNALLOC;
12088 	    }
12089 	  else
12090 	    {
12091 	      switch (type)
12092 		{
12093 		case 0: sizeof_operation = 32; break;
12094 		case 4: sizeof_operation = 24; break;
12095 		case 8: sizeof_operation = 16; break;
12096 
12097 		case 7:
12098 		  /* One register, immediate offset variant.  */
12099 		  sizeof_operation = 8;
12100 		  break;
12101 
12102 		case 10:
12103 		  /* Two registers, immediate offset variant.  */
12104 		  sizeof_operation = 16;
12105 		  break;
12106 
12107 		case 6:
12108 		  /* Three registers, immediate offset variant.  */
12109 		  sizeof_operation = 24;
12110 		  break;
12111 
12112 		case 2:
12113 		  /* Four registers, immediate offset variant.  */
12114 		  sizeof_operation = 32;
12115 		  break;
12116 
12117 		default:
12118 		  HALT_UNALLOC;
12119 		}
12120 
12121 	      if (INSTR (30, 30))
12122 		sizeof_operation *= 2;
12123 	    }
12124 
12125 	  aarch64_set_reg_u64 (cpu, vn, SP_OK, address + sizeof_operation);
12126 	}
12127       else
12128 	aarch64_set_reg_u64 (cpu, vn, SP_OK,
12129 			     address + aarch64_get_reg_u64 (cpu, vm, NO_SP));
12130     }
12131   else
12132     {
12133       NYI_assert (20, 16, 0);
12134     }
12135 
12136   if (single)
12137     {
12138       if (load)
12139 	{
12140 	  if ((type >= 0) && (type <= 11))
12141 	    do_vec_LDn_single (cpu, address);
12142 	  else if ((type == 0xC) || (type == 0xE))
12143 	    do_vec_LDnR (cpu, address);
12144 	  else
12145 	    HALT_UNALLOC;
12146 	  return;
12147 	}
12148 
12149       /* Stores.  */
12150       if ((type >= 0) && (type <= 11))
12151 	{
12152 	  do_vec_STn_single (cpu, address);
12153 	  return;
12154 	}
12155 
12156       HALT_UNALLOC;
12157     }
12158 
12159   if (load)
12160     {
12161       switch (type)
12162 	{
12163 	case 0:  LD4 (cpu, address); return;
12164 	case 4:  LD3 (cpu, address); return;
12165 	case 8:  LD2 (cpu, address); return;
12166 	case 2:  LD1_4 (cpu, address); return;
12167 	case 6:  LD1_3 (cpu, address); return;
12168 	case 10: LD1_2 (cpu, address); return;
12169 	case 7:  LD1_1 (cpu, address); return;
12170 
12171 	default:
12172 	  HALT_UNALLOC;
12173 	}
12174     }
12175 
12176   /* Stores.  */
12177   switch (type)
12178     {
12179     case 0:  ST4 (cpu, address); return;
12180     case 4:  ST3 (cpu, address); return;
12181     case 8:  ST2 (cpu, address); return;
12182     case 2:  ST1_4 (cpu, address); return;
12183     case 6:  ST1_3 (cpu, address); return;
12184     case 10: ST1_2 (cpu, address); return;
12185     case 7:  ST1_1 (cpu, address); return;
12186     default:
12187       HALT_UNALLOC;
12188     }
12189 }
12190 
12191 static void
12192 dexLdSt (sim_cpu *cpu)
12193 {
12194   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
12195      assert  group == GROUP_LDST_0100 || group == GROUP_LDST_0110 ||
12196              group == GROUP_LDST_1100 || group == GROUP_LDST_1110
     bits [29,28] and [26] of a LS form the secondary dispatch vector.  */
12198   uint32_t group2 = dispatchLS (aarch64_get_instr (cpu));
12199 
12200   switch (group2)
12201     {
12202     case LS_EXCL_000:
12203       dexLoadExclusive (cpu); return;
12204 
12205     case LS_LIT_010:
12206     case LS_LIT_011:
12207       dexLoadLiteral (cpu); return;
12208 
12209     case LS_OTHER_110:
12210     case LS_OTHER_111:
12211       dexLoadOther (cpu); return;
12212 
12213     case LS_ADVSIMD_001:
12214       do_vec_load_store (cpu); return;
12215 
12216     case LS_PAIR_100:
12217       dex_load_store_pair_gr (cpu); return;
12218 
12219     case LS_PAIR_101:
12220       dex_load_store_pair_fp (cpu); return;
12221 
12222     default:
12223       /* Should never reach here.  */
12224       HALT_NYI;
12225     }
12226 }
12227 
12228 /* Specific decode and execute for group Data Processing Register.  */
12229 
12230 static void
12231 dexLogicalShiftedRegister (sim_cpu *cpu)
12232 {
12233   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
12234      instr[30,29] = op
     instr[28,24] = 01010
12236      instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> ROR
12237      instr[21]    = N
12238      instr[20,16] = Rm
12239      instr[15,10] = count : must be 0xxxxx for 32 bit
12240      instr[9,5]   = Rn
12241      instr[4,0]   = Rd  */
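
  /* For example, BIC x0, x1, x2 has size:op = 100 and N = 1, so the
     dispatch below is (4 << 1) | 1 == 9, selecting bic64_shift.  */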
12242 
12243   uint32_t size      = INSTR (31, 31);
12244   Shift    shiftType = INSTR (23, 22);
12245   uint32_t count     = INSTR (15, 10);
12246 
  /* 32 bit operations must have count[5] = 0, otherwise UNALLOC.  */
12249   if (size == 0 && uimm (count, 5, 5))
12250     HALT_UNALLOC;
12251 
12252   /* Dispatch on size:op:N.  */
12253   switch ((INSTR (31, 29) << 1) | INSTR (21, 21))
12254     {
12255     case 0: and32_shift  (cpu, shiftType, count); return;
12256     case 1: bic32_shift  (cpu, shiftType, count); return;
12257     case 2: orr32_shift  (cpu, shiftType, count); return;
12258     case 3: orn32_shift  (cpu, shiftType, count); return;
12259     case 4: eor32_shift  (cpu, shiftType, count); return;
12260     case 5: eon32_shift  (cpu, shiftType, count); return;
12261     case 6: ands32_shift (cpu, shiftType, count); return;
12262     case 7: bics32_shift (cpu, shiftType, count); return;
12263     case 8: and64_shift  (cpu, shiftType, count); return;
12264     case 9: bic64_shift  (cpu, shiftType, count); return;
12265     case 10:orr64_shift  (cpu, shiftType, count); return;
12266     case 11:orn64_shift  (cpu, shiftType, count); return;
12267     case 12:eor64_shift  (cpu, shiftType, count); return;
12268     case 13:eon64_shift  (cpu, shiftType, count); return;
12269     case 14:ands64_shift (cpu, shiftType, count); return;
12270     case 15:bics64_shift (cpu, shiftType, count); return;
12271     }
12272 }
12273 
12274 /* 32 bit conditional select.  */
12275 static void
12276 csel32 (sim_cpu *cpu, CondCode cc)
12277 {
12278   unsigned rm = INSTR (20, 16);
12279   unsigned rn = INSTR (9, 5);
12280   unsigned rd = INSTR (4, 0);
12281 
12282   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12283 		       testConditionCode (cpu, cc)
12284 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12285 		       : aarch64_get_reg_u32 (cpu, rm, NO_SP));
12286 }
12287 
12288 /* 64 bit conditional select.  */
12289 static void
12290 csel64 (sim_cpu *cpu, CondCode cc)
12291 {
12292   unsigned rm = INSTR (20, 16);
12293   unsigned rn = INSTR (9, 5);
12294   unsigned rd = INSTR (4, 0);
12295 
12296   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12297 		       testConditionCode (cpu, cc)
12298 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12299 		       : aarch64_get_reg_u64 (cpu, rm, NO_SP));
12300 }
12301 
12302 /* 32 bit conditional increment.  */
12303 static void
12304 csinc32 (sim_cpu *cpu, CondCode cc)
12305 {
12306   unsigned rm = INSTR (20, 16);
12307   unsigned rn = INSTR (9, 5);
12308   unsigned rd = INSTR (4, 0);
12309 
12310   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12311 		       testConditionCode (cpu, cc)
12312 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12313 		       : aarch64_get_reg_u32 (cpu, rm, NO_SP) + 1);
12314 }
12315 
12316 /* 64 bit conditional increment.  */
12317 static void
12318 csinc64 (sim_cpu *cpu, CondCode cc)
12319 {
12320   unsigned rm = INSTR (20, 16);
12321   unsigned rn = INSTR (9, 5);
12322   unsigned rd = INSTR (4, 0);
12323 
12324   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12325 		       testConditionCode (cpu, cc)
12326 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12327 		       : aarch64_get_reg_u64 (cpu, rm, NO_SP) + 1);
12328 }
12329 
12330 /* 32 bit conditional invert.  */
12331 static void
12332 csinv32 (sim_cpu *cpu, CondCode cc)
12333 {
12334   unsigned rm = INSTR (20, 16);
12335   unsigned rn = INSTR (9, 5);
12336   unsigned rd = INSTR (4, 0);
12337 
12338   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12339 		       testConditionCode (cpu, cc)
12340 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12341 		       : ~ aarch64_get_reg_u32 (cpu, rm, NO_SP));
12342 }
12343 
12344 /* 64 bit conditional invert.  */
12345 static void
12346 csinv64 (sim_cpu *cpu, CondCode cc)
12347 {
12348   unsigned rm = INSTR (20, 16);
12349   unsigned rn = INSTR (9, 5);
12350   unsigned rd = INSTR (4, 0);
12351 
12352   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12353 		       testConditionCode (cpu, cc)
12354 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12355 		       : ~ aarch64_get_reg_u64 (cpu, rm, NO_SP));
12356 }
12357 
12358 /* 32 bit conditional negate.  */
12359 static void
12360 csneg32 (sim_cpu *cpu, CondCode cc)
12361 {
12362   unsigned rm = INSTR (20, 16);
12363   unsigned rn = INSTR (9, 5);
12364   unsigned rd = INSTR (4, 0);
12365 
12366   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12367 		       testConditionCode (cpu, cc)
12368 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12369 		       : - aarch64_get_reg_u32 (cpu, rm, NO_SP));
12370 }
12371 
12372 /* 64 bit conditional negate.  */
12373 static void
12374 csneg64 (sim_cpu *cpu, CondCode cc)
12375 {
12376   unsigned rm = INSTR (20, 16);
12377   unsigned rn = INSTR (9, 5);
12378   unsigned rd = INSTR (4, 0);
12379 
12380   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12381 		       testConditionCode (cpu, cc)
12382 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12383 		       : - aarch64_get_reg_u64 (cpu, rm, NO_SP));
12384 }
12385 
12386 static void
12387 dexCondSelect (sim_cpu *cpu)
12388 {
  /* instr[28,21] = 11011011
     instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30],instr[11,10] = op : 000 ==> CSEL, 001 ==> CSINC,
                                   100 ==> CSINV, 101 ==> CSNEG,
                                   _1_ ==> UNALLOC
     instr[29]    = S : 0 ==> ok, 1 ==> UNALLOC
     instr[15,12] = cond.  */
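
  /* For example, CSNEG x0, x1, x2, GE has size:op = 11 and op2 = 01,
     giving dispatch (3 << 1) | 1 == 7, which selects csneg64.  */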
12397 
12398   CondCode cc = INSTR (15, 12);
12399   uint32_t S = INSTR (29, 29);
12400   uint32_t op2 = INSTR (11, 10);
12401 
12402   if (S == 1)
12403     HALT_UNALLOC;
12404 
12405   if (op2 & 0x2)
12406     HALT_UNALLOC;
12407 
12408   switch ((INSTR (31, 30) << 1) | op2)
12409     {
12410     case 0: csel32  (cpu, cc); return;
12411     case 1: csinc32 (cpu, cc); return;
12412     case 2: csinv32 (cpu, cc); return;
12413     case 3: csneg32 (cpu, cc); return;
12414     case 4: csel64  (cpu, cc); return;
12415     case 5: csinc64 (cpu, cc); return;
12416     case 6: csinv64 (cpu, cc); return;
12417     case 7: csneg64 (cpu, cc); return;
12418     }
12419 }
12420 
12421 /* Some helpers for counting leading 1 or 0 bits.  */
12422 
12423 /* Counts the number of leading bits which are the same
12424    in a 32 bit value in the range 1 to 32.  */
12425 static uint32_t
12426 leading32 (uint32_t value)
12427 {
  int32_t mask = 0xffff0000;
  uint32_t count = 16; /* Counts number of bits set in mask.  */
12430   uint32_t lo = 1;    /* Lower bound for number of sign bits.  */
12431   uint32_t hi = 32;   /* Upper bound for number of sign bits.  */
12432 
12433   while (lo + 1 < hi)
12434     {
12435       int32_t test = (value & mask);
12436 
12437       if (test == 0 || test == mask)
12438 	{
12439 	  lo = count;
12440 	  count = (lo + hi) / 2;
12441 	  mask >>= (count - lo);
12442 	}
12443       else
12444 	{
12445 	  hi = count;
12446 	  count = (lo + hi) / 2;
12447 	  mask <<= hi - count;
12448 	}
12449     }
12450 
12451   if (lo != hi)
12452     {
12453       int32_t test;
12454 
12455       mask >>= 1;
12456       test = (value & mask);
12457 
12458       if (test == 0 || test == mask)
12459 	count = hi;
12460       else
12461 	count = lo;
12462     }
12463 
12464   return count;
12465 }
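
/* For example, leading32 (0x00000001) returns 31 (31 leading zeros) and
   leading32 (0xfffffffe) also returns 31 (31 leading ones); cls32 below
   subtracts one so that the sign bit itself is not counted.  */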
12466 
12467 /* Counts the number of leading bits which are the same
12468    in a 64 bit value in the range 1 to 64.  */
12469 static uint64_t
12470 leading64 (uint64_t value)
12471 {
  int64_t mask = 0xffffffff00000000LL;
12473   uint64_t count = 32; /* Counts number of bits set in mask.  */
12474   uint64_t lo = 1;     /* Lower bound for number of sign bits.  */
12475   uint64_t hi = 64;    /* Upper bound for number of sign bits.  */
12476 
12477   while (lo + 1 < hi)
12478     {
12479       int64_t test = (value & mask);
12480 
12481       if (test == 0 || test == mask)
12482 	{
12483 	  lo = count;
12484 	  count = (lo + hi) / 2;
12485 	  mask >>= (count - lo);
12486 	}
12487       else
12488 	{
12489 	  hi = count;
12490 	  count = (lo + hi) / 2;
12491 	  mask <<= hi - count;
12492 	}
12493     }
12494 
12495   if (lo != hi)
12496     {
12497       int64_t test;
12498 
12499       mask >>= 1;
12500       test = (value & mask);
12501 
12502       if (test == 0 || test == mask)
12503 	count = hi;
12504       else
12505 	count = lo;
12506     }
12507 
12508   return count;
12509 }
12510 
12511 /* Bit operations.  */
12512 /* N.B register args may not be SP.  */
12513 
12514 /* 32 bit count leading sign bits.  */
12515 static void
12516 cls32 (sim_cpu *cpu)
12517 {
12518   unsigned rn = INSTR (9, 5);
12519   unsigned rd = INSTR (4, 0);
12520 
12521   /* N.B. the result needs to exclude the leading bit.  */
12522   aarch64_set_reg_u64
12523     (cpu, rd, NO_SP, leading32 (aarch64_get_reg_u32 (cpu, rn, NO_SP)) - 1);
12524 }
12525 
12526 /* 64 bit count leading sign bits.  */
12527 static void
12528 cls64 (sim_cpu *cpu)
12529 {
12530   unsigned rn = INSTR (9, 5);
12531   unsigned rd = INSTR (4, 0);
12532 
12533   /* N.B. the result needs to exclude the leading bit.  */
12534   aarch64_set_reg_u64
12535     (cpu, rd, NO_SP, leading64 (aarch64_get_reg_u64 (cpu, rn, NO_SP)) - 1);
12536 }
12537 
12538 /* 32 bit count leading zero bits.  */
12539 static void
12540 clz32 (sim_cpu *cpu)
12541 {
12542   unsigned rn = INSTR (9, 5);
12543   unsigned rd = INSTR (4, 0);
12544   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12545 
12546   /* if the sign (top) bit is set then the count is 0.  */
12547   if (pick32 (value, 31, 31))
12548     aarch64_set_reg_u64 (cpu, rd, NO_SP, 0L);
12549   else
12550     aarch64_set_reg_u64 (cpu, rd, NO_SP, leading32 (value));
12551 }
12552 
12553 /* 64 bit count leading zero bits.  */
12554 static void
12555 clz64 (sim_cpu *cpu)
12556 {
12557   unsigned rn = INSTR (9, 5);
12558   unsigned rd = INSTR (4, 0);
12559   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12560 
12561   /* if the sign (top) bit is set then the count is 0.  */
12562   if (pick64 (value, 63, 63))
12563     aarch64_set_reg_u64 (cpu, rd, NO_SP, 0L);
12564   else
12565     aarch64_set_reg_u64 (cpu, rd, NO_SP, leading64 (value));
12566 }
12567 
12568 /* 32 bit reverse bits.  */
12569 static void
12570 rbit32 (sim_cpu *cpu)
12571 {
12572   unsigned rn = INSTR (9, 5);
12573   unsigned rd = INSTR (4, 0);
12574   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12575   uint32_t result = 0;
12576   int i;
12577 
12578   for (i = 0; i < 32; i++)
12579     {
12580       result <<= 1;
12581       result |= (value & 1);
12582       value >>= 1;
12583     }
12584   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12585 }
12586 
12587 /* 64 bit reverse bits.  */
12588 static void
12589 rbit64 (sim_cpu *cpu)
12590 {
12591   unsigned rn = INSTR (9, 5);
12592   unsigned rd = INSTR (4, 0);
12593   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12594   uint64_t result = 0;
12595   int i;
12596 
12597   for (i = 0; i < 64; i++)
12598     {
12599       result <<= 1;
12600       result |= (value & 1UL);
12601       value >>= 1;
12602     }
12603   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12604 }
12605 
12606 /* 32 bit reverse bytes.  */
12607 static void
12608 rev32 (sim_cpu *cpu)
12609 {
12610   unsigned rn = INSTR (9, 5);
12611   unsigned rd = INSTR (4, 0);
12612   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12613   uint32_t result = 0;
12614   int i;
12615 
12616   for (i = 0; i < 4; i++)
12617     {
12618       result <<= 8;
12619       result |= (value & 0xff);
12620       value >>= 8;
12621     }
12622   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12623 }
12624 
12625 /* 64 bit reverse bytes.  */
12626 static void
12627 rev64 (sim_cpu *cpu)
12628 {
12629   unsigned rn = INSTR (9, 5);
12630   unsigned rd = INSTR (4, 0);
12631   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12632   uint64_t result = 0;
12633   int i;
12634 
12635   for (i = 0; i < 8; i++)
12636     {
12637       result <<= 8;
12638       result |= (value & 0xffULL);
12639       value >>= 8;
12640     }
12641   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12642 }
12643 
12644 /* 32 bit reverse shorts.  */
/* N.B. this reverses the order of the bytes in each half word.  */
12646 static void
12647 revh32 (sim_cpu *cpu)
12648 {
12649   unsigned rn = INSTR (9, 5);
12650   unsigned rd = INSTR (4, 0);
12651   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12652   uint32_t result = 0;
12653   int i;
12654 
12655   for (i = 0; i < 2; i++)
12656     {
12657       result <<= 8;
12658       result |= (value & 0x00ff00ff);
12659       value >>= 8;
12660     }
12661   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12662 }
12663 
12664 /* 64 bit reverse shorts.  */
/* N.B. this reverses the order of the bytes in each half word.  */
12666 static void
12667 revh64 (sim_cpu *cpu)
12668 {
12669   unsigned rn = INSTR (9, 5);
12670   unsigned rd = INSTR (4, 0);
12671   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12672   uint64_t result = 0;
12673   int i;
12674 
12675   for (i = 0; i < 2; i++)
12676     {
12677       result <<= 8;
12678       result |= (value & 0x00ff00ff00ff00ffULL);
12679       value >>= 8;
12680     }
12681   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12682 }
12683 
12684 static void
12685 dexDataProc1Source (sim_cpu *cpu)
12686 {
12687   /* instr[30]    = 1
12688      instr[28,21] = 111010110
12689      instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
12690      instr[29]    = S : 0 ==> ok, 1 ==> UNALLOC
12691      instr[20,16] = opcode2 : 00000 ==> ok, ow ==> UNALLOC
12692      instr[15,10] = opcode : 000000 ==> RBIT, 000001 ==> REV16,
                             000010 ==> REV (REV32 for 64 bit),
                             000011 ==> UNALLOC (REV for 64 bit),
12694                              000100 ==> CLZ, 000101 ==> CLS
12695                              ow ==> UNALLOC
12696      instr[9,5]   = rn : may not be SP
12697      instr[4,0]   = rd : may not be SP.  */
12698 
12699   uint32_t S = INSTR (29, 29);
12700   uint32_t opcode2 = INSTR (20, 16);
12701   uint32_t opcode = INSTR (15, 10);
12702   uint32_t dispatch = ((INSTR (31, 31) << 3) | opcode);
12703 
12704   if (S == 1)
12705     HALT_UNALLOC;
12706 
12707   if (opcode2 != 0)
12708     HALT_UNALLOC;
12709 
12710   if (opcode & 0x38)
12711     HALT_UNALLOC;
12712 
12713   switch (dispatch)
12714     {
12715     case 0: rbit32 (cpu); return;
12716     case 1: revh32 (cpu); return;
12717     case 2: rev32 (cpu); return;
12718     case 4: clz32 (cpu); return;
12719     case 5: cls32 (cpu); return;
12720     case 8: rbit64 (cpu); return;
12721     case 9: revh64 (cpu); return;
12722     case 10:rev32 (cpu); return;
12723     case 11:rev64 (cpu); return;
12724     case 12:clz64 (cpu); return;
12725     case 13:cls64 (cpu); return;
12726     default: HALT_UNALLOC;
12727     }
12728 }
12729 
12730 /* Variable shift.
12731    Shifts by count supplied in register.
12732    N.B register args may not be SP.
12733    These all use the shifted auxiliary function for
12734    simplicity and clarity.  Writing the actual shift
12735    inline would avoid a branch and so be faster but
12736    would also necessitate getting signs right.  */
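
/* For example, LSLV w0, w1, w2 computes w1 << (w2 & 31); the shift
   amount is taken modulo the register width, as the masks below show.  */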
12737 
12738 /* 32 bit arithmetic shift right.  */
12739 static void
12740 asrv32 (sim_cpu *cpu)
12741 {
12742   unsigned rm = INSTR (20, 16);
12743   unsigned rn = INSTR (9, 5);
12744   unsigned rd = INSTR (4, 0);
12745 
12746   aarch64_set_reg_u64
12747     (cpu, rd, NO_SP,
12748      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), ASR,
12749 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12750 }
12751 
12752 /* 64 bit arithmetic shift right.  */
12753 static void
12754 asrv64 (sim_cpu *cpu)
12755 {
12756   unsigned rm = INSTR (20, 16);
12757   unsigned rn = INSTR (9, 5);
12758   unsigned rd = INSTR (4, 0);
12759 
12760   aarch64_set_reg_u64
12761     (cpu, rd, NO_SP,
12762      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), ASR,
12763 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12764 }
12765 
12766 /* 32 bit logical shift left.  */
12767 static void
12768 lslv32 (sim_cpu *cpu)
12769 {
12770   unsigned rm = INSTR (20, 16);
12771   unsigned rn = INSTR (9, 5);
12772   unsigned rd = INSTR (4, 0);
12773 
12774   aarch64_set_reg_u64
12775     (cpu, rd, NO_SP,
12776      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), LSL,
12777 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12778 }
12779 
/* 64 bit logical shift left.  */
12781 static void
12782 lslv64 (sim_cpu *cpu)
12783 {
12784   unsigned rm = INSTR (20, 16);
12785   unsigned rn = INSTR (9, 5);
12786   unsigned rd = INSTR (4, 0);
12787 
12788   aarch64_set_reg_u64
12789     (cpu, rd, NO_SP,
12790      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), LSL,
12791 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12792 }
12793 
12794 /* 32 bit logical shift right.  */
12795 static void
12796 lsrv32 (sim_cpu *cpu)
12797 {
12798   unsigned rm = INSTR (20, 16);
12799   unsigned rn = INSTR (9, 5);
12800   unsigned rd = INSTR (4, 0);
12801 
12802   aarch64_set_reg_u64
12803     (cpu, rd, NO_SP,
12804      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), LSR,
12805 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12806 }
12807 
12808 /* 64 bit logical shift right.  */
12809 static void
12810 lsrv64 (sim_cpu *cpu)
12811 {
12812   unsigned rm = INSTR (20, 16);
12813   unsigned rn = INSTR (9, 5);
12814   unsigned rd = INSTR (4, 0);
12815 
12816   aarch64_set_reg_u64
12817     (cpu, rd, NO_SP,
12818      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), LSR,
12819 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12820 }
12821 
12822 /* 32 bit rotate right.  */
12823 static void
12824 rorv32 (sim_cpu *cpu)
12825 {
12826   unsigned rm = INSTR (20, 16);
12827   unsigned rn = INSTR (9, 5);
12828   unsigned rd = INSTR (4, 0);
12829 
12830   aarch64_set_reg_u64
12831     (cpu, rd, NO_SP,
12832      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), ROR,
12833 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12834 }
12835 
12836 /* 64 bit rotate right.  */
12837 static void
12838 rorv64 (sim_cpu *cpu)
12839 {
12840   unsigned rm = INSTR (20, 16);
12841   unsigned rn = INSTR (9, 5);
12842   unsigned rd = INSTR (4, 0);
12843 
12844   aarch64_set_reg_u64
12845     (cpu, rd, NO_SP,
12846      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), ROR,
12847 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12848 }
12849 
12850 
/* Divide.  */
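
/* AArch64 defines integer division by zero to yield zero rather than
   trap, hence the "divisor ? ... : 0" pattern in each helper below.  */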
12852 
12853 /* 32 bit signed divide.  */
12854 static void
sdiv32 (sim_cpu *cpu)
12856 {
12857   unsigned rm = INSTR (20, 16);
12858   unsigned rn = INSTR (9, 5);
12859   unsigned rd = INSTR (4, 0);
12860   /* N.B. the pseudo-code does the divide using 64 bit data.  */
12861   /* TODO : check that this rounds towards zero as required.  */
12862   int64_t dividend = aarch64_get_reg_s32 (cpu, rn, NO_SP);
12863   int64_t divisor = aarch64_get_reg_s32 (cpu, rm, NO_SP);
12864 
12865   aarch64_set_reg_s64 (cpu, rd, NO_SP,
12866 		       divisor ? ((int32_t) (dividend / divisor)) : 0);
12867 }
12868 
12869 /* 64 bit signed divide.  */
12870 static void
sdiv64 (sim_cpu *cpu)
12872 {
12873   unsigned rm = INSTR (20, 16);
12874   unsigned rn = INSTR (9, 5);
12875   unsigned rd = INSTR (4, 0);
12876 
12877   /* TODO : check that this rounds towards zero as required.  */
12878   int64_t divisor = aarch64_get_reg_s64 (cpu, rm, NO_SP);
12879 
12880   aarch64_set_reg_s64
12881     (cpu, rd, NO_SP,
12882      divisor ? (aarch64_get_reg_s64 (cpu, rn, NO_SP) / divisor) : 0);
12883 }
12884 
12885 /* 32 bit unsigned divide.  */
12886 static void
12887 udiv32 (sim_cpu *cpu)
12888 {
12889   unsigned rm = INSTR (20, 16);
12890   unsigned rn = INSTR (9, 5);
12891   unsigned rd = INSTR (4, 0);
12892 
12893   /* N.B. the pseudo-code does the divide using 64 bit data.  */
12894   uint64_t dividend = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12895   uint64_t divisor  = aarch64_get_reg_u32 (cpu, rm, NO_SP);
12896 
12897   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12898 		       divisor ? (uint32_t) (dividend / divisor) : 0);
12899 }
12900 
12901 /* 64 bit unsigned divide.  */
12902 static void
12903 udiv64 (sim_cpu *cpu)
12904 {
12905   unsigned rm = INSTR (20, 16);
12906   unsigned rn = INSTR (9, 5);
12907   unsigned rd = INSTR (4, 0);
12908 
12909   /* TODO : check that this rounds towards zero as required.  */
12910   uint64_t divisor = aarch64_get_reg_u64 (cpu, rm, NO_SP);
12911 
12912   aarch64_set_reg_u64
12913     (cpu, rd, NO_SP,
12914      divisor ? (aarch64_get_reg_u64 (cpu, rn, NO_SP) / divisor) : 0);
12915 }
12916 
12917 static void
12918 dexDataProc2Source (sim_cpu *cpu)
12919 {
12920   /* assert instr[30] == 0
12921      instr[28,21] == 11010110
12922      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12923      instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
     instr[15,10] = opcode : 000010 ==> UDIV, 000011 ==> SDIV,
12925                              001000 ==> LSLV, 001001 ==> LSRV
12926                              001010 ==> ASRV, 001011 ==> RORV
12927                              ow ==> UNALLOC.  */
12928 
12929   uint32_t dispatch;
12930   uint32_t S = INSTR (29, 29);
12931   uint32_t opcode = INSTR (15, 10);
12932 
12933   if (S == 1)
12934     HALT_UNALLOC;
12935 
12936   if (opcode & 0x34)
12937     HALT_UNALLOC;
12938 
12939   dispatch = (  (INSTR (31, 31) << 3)
12940 	      | (uimm (opcode, 3, 3) << 2)
12941 	      |  uimm (opcode, 1, 0));
12942   switch (dispatch)
12943     {
12944     case 2:  udiv32 (cpu); return;
    case 3:  sdiv32 (cpu); return;
12946     case 4:  lslv32 (cpu); return;
12947     case 5:  lsrv32 (cpu); return;
12948     case 6:  asrv32 (cpu); return;
12949     case 7:  rorv32 (cpu); return;
12950     case 10: udiv64 (cpu); return;
    case 11: sdiv64 (cpu); return;
12952     case 12: lslv64 (cpu); return;
12953     case 13: lsrv64 (cpu); return;
12954     case 14: asrv64 (cpu); return;
12955     case 15: rorv64 (cpu); return;
12956     default: HALT_UNALLOC;
12957     }
12958 }
12959 
12960 
12961 /* Multiply.  */
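
/* MADD and MSUB compute Ra + Rn * Rm and Ra - Rn * Rm respectively;
   the MUL and MNEG mnemonics are aliases for the Ra == XZR forms.  */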
12962 
12963 /* 32 bit multiply and add.  */
12964 static void
12965 madd32 (sim_cpu *cpu)
12966 {
12967   unsigned rm = INSTR (20, 16);
12968   unsigned ra = INSTR (14, 10);
12969   unsigned rn = INSTR (9, 5);
12970   unsigned rd = INSTR (4, 0);
12971 
12972   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12973   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12974 		       aarch64_get_reg_u32 (cpu, ra, NO_SP)
12975 		       + aarch64_get_reg_u32 (cpu, rn, NO_SP)
12976 		       * aarch64_get_reg_u32 (cpu, rm, NO_SP));
12977 }
12978 
12979 /* 64 bit multiply and add.  */
12980 static void
12981 madd64 (sim_cpu *cpu)
12982 {
12983   unsigned rm = INSTR (20, 16);
12984   unsigned ra = INSTR (14, 10);
12985   unsigned rn = INSTR (9, 5);
12986   unsigned rd = INSTR (4, 0);
12987 
12988   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12989   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12990 		       aarch64_get_reg_u64 (cpu, ra, NO_SP)
12991 		       + (aarch64_get_reg_u64 (cpu, rn, NO_SP)
12992 			  * aarch64_get_reg_u64 (cpu, rm, NO_SP)));
12993 }
12994 
12995 /* 32 bit multiply and sub.  */
12996 static void
12997 msub32 (sim_cpu *cpu)
12998 {
12999   unsigned rm = INSTR (20, 16);
13000   unsigned ra = INSTR (14, 10);
13001   unsigned rn = INSTR (9, 5);
13002   unsigned rd = INSTR (4, 0);
13003 
13004   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13005   aarch64_set_reg_u64 (cpu, rd, NO_SP,
13006 		       aarch64_get_reg_u32 (cpu, ra, NO_SP)
13007 		       - aarch64_get_reg_u32 (cpu, rn, NO_SP)
13008 		       * aarch64_get_reg_u32 (cpu, rm, NO_SP));
13009 }
13010 
13011 /* 64 bit multiply and sub.  */
13012 static void
13013 msub64 (sim_cpu *cpu)
13014 {
13015   unsigned rm = INSTR (20, 16);
13016   unsigned ra = INSTR (14, 10);
13017   unsigned rn = INSTR (9, 5);
13018   unsigned rd = INSTR (4, 0);
13019 
13020   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13021   aarch64_set_reg_u64 (cpu, rd, NO_SP,
13022 		       aarch64_get_reg_u64 (cpu, ra, NO_SP)
13023 		       - aarch64_get_reg_u64 (cpu, rn, NO_SP)
13024 		       * aarch64_get_reg_u64 (cpu, rm, NO_SP));
13025 }
13026 
13027 /* Signed multiply add long -- source, source2 : 32 bit, source3 : 64 bit.  */
13028 static void
13029 smaddl (sim_cpu *cpu)
13030 {
13031   unsigned rm = INSTR (20, 16);
13032   unsigned ra = INSTR (14, 10);
13033   unsigned rn = INSTR (9, 5);
13034   unsigned rd = INSTR (4, 0);
13035 
13036   /* N.B. we need to multiply the signed 32 bit values in rn, rm to
13037      obtain a 64 bit product.  */
13038   aarch64_set_reg_s64
13039     (cpu, rd, NO_SP,
13040      aarch64_get_reg_s64 (cpu, ra, NO_SP)
13041      + ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
13042      * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP)));
13043 }
13044 
13045 /* Signed multiply sub long -- source, source2 : 32 bit, source3 : 64 bit.  */
13046 static void
13047 smsubl (sim_cpu *cpu)
13048 {
13049   unsigned rm = INSTR (20, 16);
13050   unsigned ra = INSTR (14, 10);
13051   unsigned rn = INSTR (9, 5);
13052   unsigned rd = INSTR (4, 0);
13053 
13054   /* N.B. we need to multiply the signed 32 bit values in rn, rm to
13055      obtain a 64 bit product.  */
13056   aarch64_set_reg_s64
13057     (cpu, rd, NO_SP,
13058      aarch64_get_reg_s64 (cpu, ra, NO_SP)
13059      - ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
13060      * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP)));
13061 }
13062 
13063 /* Integer Multiply/Divide.  */
13064 
13065 /* First some macros and a helper function.  */
13066 /* Macros to test or access elements of 64 bit words.  */
13067 
13068 /* Mask used to access lo 32 bits of 64 bit unsigned int.  */
13069 #define LOW_WORD_MASK ((1ULL << 32) - 1)
13070 /* Return the lo 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int.  */
13071 #define lowWordToU64(_value_u64) ((_value_u64) & LOW_WORD_MASK)
13072 /* Return the hi 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int.  */
13073 #define highWordToU64(_value_u64) ((_value_u64) >> 32)
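/* Example: given 0x123456789abcdef0, lowWordToU64 yields 0x9abcdef0
   and highWordToU64 yields 0x12345678.  */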
13074 
13075 /* Offset of sign bit in 64 bit signed integer.  */
13076 #define SIGN_SHIFT_U64 63
13077 /* The sign bit itself -- also identifies the minimum negative int value.  */
13078 #define SIGN_BIT_U64 (1ULL << SIGN_SHIFT_U64)
13079 /* Return true if a 64 bit signed int presented as an unsigned int is the
13080    most negative value.  */
13081 #define isMinimumU64(_value_u64) ((_value_u64) == SIGN_BIT_U64)
13082 /* Return true (non-zero) if a 64 bit signed int presented as an unsigned
13083    int has its sign bit set.  */
13084 #define isSignSetU64(_value_u64) ((_value_u64) & SIGN_BIT_U64)
13085 /* Return 1L or -1L according to whether a 64 bit signed int presented as
13086    an unsigned int has its sign bit set or not.  */
13087 #define signOfU64(_value_u64) (1L + (((_value_u64) >> SIGN_SHIFT_U64) * -2L))
13088 /* Clear the sign bit of a 64 bit signed int presented as an unsigned int.  */
13089 #define clearSignU64(_value_u64) ((_value_u64) &= ~SIGN_BIT_U64)
13090 
13091 /* Multiply two 64 bit ints and return
13092    the hi 64 bits of the 128 bit product.  */
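/* Writing value1 = 2^32 * v1_hi + v1_lo and value2 = 2^32 * v2_hi + v2_lo,
   the 128 bit product is
     2^64 * (v1_hi * v2_hi)
   + 2^32 * (v1_hi * v2_lo + v1_lo * v2_hi)
   +         v1_lo * v2_lo
   where each 32x32 partial product fits in 64 bits, so the high half
   can be accumulated with 64 bit arithmetic plus explicit carries.  */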
13093 
13094 static uint64_t
13095 mul64hi (uint64_t value1, uint64_t value2)
13096 {
13097   uint64_t resultmid1;
13098   uint64_t result;
13099   uint64_t value1_lo = lowWordToU64 (value1);
13100   uint64_t value1_hi = highWordToU64 (value1);
13101   uint64_t value2_lo = lowWordToU64 (value2);
13102   uint64_t value2_hi = highWordToU64 (value2);
13103 
13104   /* Cross-multiply and collect results.  */
13105   uint64_t xproductlo = value1_lo * value2_lo;
13106   uint64_t xproductmid1 = value1_lo * value2_hi;
13107   uint64_t xproductmid2 = value1_hi * value2_lo;
13108   uint64_t xproducthi = value1_hi * value2_hi;
13109   uint64_t carry = 0;
13110   /* Start accumulating 64 bit results.  */
13111   /* Drop bottom half of lowest cross-product.  */
13112   uint64_t resultmid = xproductlo >> 32;
13113   /* Add in middle products.  */
13114   resultmid = resultmid + xproductmid1;
13115 
13116   /* Check for overflow.  */
13117   if (resultmid < xproductmid1)
13118     /* Carry over 1 into top cross-product.  */
13119     carry++;
13120 
13121   resultmid1  = resultmid + xproductmid2;
13122 
13123   /* Check for overflow.  */
13124   if (resultmid1 < xproductmid2)
13125     /* Carry over 1 into top cross-product.  */
13126     carry++;
13127 
13128   /* Drop lowest 32 bits of middle cross-product.  */
13129   result = resultmid1 >> 32;
13130   /* Move carry bit to just above middle cross-product highest bit.  */
13131   carry = carry << 32;
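  /* Each carry out of the 64 bit mid-level accumulators is worth 2^64
     at the 2^32 scale of the middle terms, i.e. 2^32 in the final high
     half, hence the shift.  */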
13132 
13133   /* Add in the top cross-product and any carry.  */
13134   result += xproducthi + carry;
13135 
13136   return result;
13137 }
13138 
13139 /* Signed multiply high, source, source2 :
13140    64 bit, dest <-- high 64-bit of result.  */
13141 static void
13142 smulh (sim_cpu *cpu)
13143 {
13144   uint64_t uresult;
13145   int64_t  result;
13146   unsigned rm = INSTR (20, 16);
13147   unsigned rn = INSTR (9, 5);
13148   unsigned rd = INSTR (4, 0);
13149   GReg     ra = INSTR (14, 10);
13150   int64_t  value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
13151   int64_t  value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
13152   uint64_t uvalue1;
13153   uint64_t uvalue2;
13154   int  negate = 0;
13155 
13156   if (ra != R31)
13157     HALT_UNALLOC;
13158 
13159   /* Convert to unsigned and use the unsigned mul64hi routine
13160      then fix the sign up afterwards.  */
13161   if (value1 < 0)
13162     {
13163       negate = !negate;
13164       uvalue1 = -value1;
13165     }
13166   else
13167     {
13168       uvalue1 = value1;
13169     }
13170 
13171   if (value2 < 0)
13172     {
13173       negate = !negate;
13174       uvalue2 = -value2;
13175     }
13176   else
13177     {
13178       uvalue2 = value2;
13179     }
13180 
13181   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13182 
13183   uresult = mul64hi (uvalue1, uvalue2);
13184   result = uresult;
13185 
13186   if (negate)
13187     {
13188       /* Multiplying the 128-bit result by -1 inverts the high part,
13189 	 with a carry in added only if the low part is 0.  */
13190       result = ~result;
13191       if ((uvalue1 * uvalue2) == 0)
13192 	result += 1;
13193     }
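  /* E.g. value1 == -1, value2 == 1: mul64hi (1, 1) is 0 and the low
     64 bits of the product are non-zero, so the result is ~0, i.e. -1,
     the correct high half of the 128 bit value -1.  */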
13194 
13195   aarch64_set_reg_s64 (cpu, rd, NO_SP, result);
13196 }
13197 
13198 /* Unsigned multiply add long -- source, source2 :
13199    32 bit, source3 : 64 bit.  */
13200 static void
13201 umaddl (sim_cpu *cpu)
13202 {
13203   unsigned rm = INSTR (20, 16);
13204   unsigned ra = INSTR (14, 10);
13205   unsigned rn = INSTR (9, 5);
13206   unsigned rd = INSTR (4, 0);
13207 
13208   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13209   /* N.B. we need to multiply the unsigned 32 bit values in rn, rm to
13210      obtain a 64 bit product.  */
13211   aarch64_set_reg_u64
13212     (cpu, rd, NO_SP,
13213      aarch64_get_reg_u64 (cpu, ra, NO_SP)
13214      + ((uint64_t) aarch64_get_reg_u32 (cpu, rn, NO_SP))
13215      * ((uint64_t) aarch64_get_reg_u32 (cpu, rm, NO_SP)));
13216 }
13217 
13218 /* Unsigned multiply sub long -- source, source2 : 32 bit, source3 : 64 bit.  */
13219 static void
13220 umsubl (sim_cpu *cpu)
13221 {
13222   unsigned rm = INSTR (20, 16);
13223   unsigned ra = INSTR (14, 10);
13224   unsigned rn = INSTR (9, 5);
13225   unsigned rd = INSTR (4, 0);
13226 
13227   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13228   /* N.B. we need to multiply the unsigned 32 bit values in rn, rm to
13229      obtain a 64 bit product.  */
13230   aarch64_set_reg_u64
13231     (cpu, rd, NO_SP,
13232      aarch64_get_reg_u64 (cpu, ra, NO_SP)
13233      - ((uint64_t) aarch64_get_reg_u32 (cpu, rn, NO_SP))
13234      * ((uint64_t) aarch64_get_reg_u32 (cpu, rm, NO_SP)));
13235 }
13236 
13237 /* Unsigned multiply high, source, source2 :
13238    64 bit, dest <-- high 64-bit of result.  */
13239 static void
13240 umulh (sim_cpu *cpu)
13241 {
13242   unsigned rm = INSTR (20, 16);
13243   unsigned rn = INSTR (9, 5);
13244   unsigned rd = INSTR (4, 0);
13245   GReg     ra = INSTR (14, 10);
13246 
13247   if (ra != R31)
13248     HALT_UNALLOC;
13249 
13250   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13251   aarch64_set_reg_u64 (cpu, rd, NO_SP,
13252 		       mul64hi (aarch64_get_reg_u64 (cpu, rn, NO_SP),
13253 				aarch64_get_reg_u64 (cpu, rm, NO_SP)));
13254 }
13255 
13256 static void
13257 dexDataProc3Source (sim_cpu *cpu)
13258 {
13259   /* assert instr[28,24] == 11011.  */
13260   /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit (for rd at least)
13261      instr[30,29] = op54 : 00 ==> ok, ow ==> UNALLOC
13262      instr[23,21] = op31 : 111 ==> UNALLOC, ow ==> ok
13263      instr[15] = o0 : 0/1 ==> ok
13264      instr[23,21:15] ==> op : 0000 ==> MADD, 0001 ==> MSUB,     (32/64 bit)
13265                               0010 ==> SMADDL, 0011 ==> SMSUBL, (64 bit only)
13266                               0100 ==> SMULH,                   (64 bit only)
13267                               1010 ==> UMADDL, 1011 ==> UMSUBL, (64 bit only)
13268                               1100 ==> UMULH                    (64 bit only)
13269                               ow ==> UNALLOC.  */
13270 
13271   uint32_t dispatch;
13272   uint32_t size = INSTR (31, 31);
13273   uint32_t op54 = INSTR (30, 29);
13274   uint32_t op31 = INSTR (23, 21);
13275   uint32_t o0 = INSTR (15, 15);
13276 
13277   if (op54 != 0)
13278     HALT_UNALLOC;
13279 
13280   if (size == 0)
13281     {
13282       if (op31 != 0)
13283 	HALT_UNALLOC;
13284 
13285       if (o0 == 0)
13286 	madd32 (cpu);
13287       else
13288 	msub32 (cpu);
13289       return;
13290     }
13291 
13292   dispatch = (op31 << 1) | o0;
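  /* E.g. SMADDL has op31 == 001 and o0 == 0, giving dispatch == 2;
     UMULH has op31 == 110 and o0 == 0, giving dispatch == 12.  */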
13293 
13294   switch (dispatch)
13295     {
13296     case 0:  madd64 (cpu); return;
13297     case 1:  msub64 (cpu); return;
13298     case 2:  smaddl (cpu); return;
13299     case 3:  smsubl (cpu); return;
13300     case 4:  smulh (cpu); return;
13301     case 10: umaddl (cpu); return;
13302     case 11: umsubl (cpu); return;
13303     case 12: umulh (cpu); return;
13304     default: HALT_UNALLOC;
13305     }
13306 }
13307 
13308 static void
13309 dexDPReg (sim_cpu *cpu)
13310 {
13311   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
13312      assert  group == GROUP_DPREG_0101 || group == GROUP_DPREG_1101
13313      bits [28:24:21] of a DPReg are the secondary dispatch vector.  */
13314   uint32_t group2 = dispatchDPReg (aarch64_get_instr (cpu));
13315 
13316   switch (group2)
13317     {
13318     case DPREG_LOG_000:
13319     case DPREG_LOG_001:
13320       dexLogicalShiftedRegister (cpu); return;
13321 
13322     case DPREG_ADDSHF_010:
13323       dexAddSubtractShiftedRegister (cpu); return;
13324 
13325     case DPREG_ADDEXT_011:
13326       dexAddSubtractExtendedRegister (cpu); return;
13327 
13328     case DPREG_ADDCOND_100:
13329       {
13330 	/* This set bundles a variety of different operations.  */
13331 	/* Check for:  */
13332 	/* 1) add/sub w carry.  */
13333 	uint32_t mask1 = 0x1FE00000U;
13334 	uint32_t val1  = 0x1A000000U;
13335 	/* 2) cond compare register/immediate.  */
13336 	uint32_t mask2 = 0x1FE00000U;
13337 	uint32_t val2  = 0x1A400000U;
13338 	/* 3) cond select.  */
13339 	uint32_t mask3 = 0x1FE00000U;
13340 	uint32_t val3  = 0x1A800000U;
13341 	/* 4) data proc 1/2 source.  */
13342 	uint32_t mask4 = 0x1FE00000U;
13343 	uint32_t val4  = 0x1AC00000U;
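	/* All four masks select bits [28,21]; the values differ only in
	   bits [23,22]: 00 add/sub with carry, 01 cond compare,
	   10 cond select, 11 data proc 1/2 source.  */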
13344 
13345 	if ((aarch64_get_instr (cpu) & mask1) == val1)
13346 	  dexAddSubtractWithCarry (cpu);
13347 
13348 	else if ((aarch64_get_instr (cpu) & mask2) == val2)
13349 	  CondCompare (cpu);
13350 
13351 	else if ((aarch64_get_instr (cpu) & mask3) == val3)
13352 	  dexCondSelect (cpu);
13353 
13354 	else if ((aarch64_get_instr (cpu) & mask4) == val4)
13355 	  {
13356 	    /* Bit 30 is clear for data proc 2 source
13357 	       and set for data proc 1 source.  */
13358 	    if (aarch64_get_instr (cpu)  & (1U << 30))
13359 	      dexDataProc1Source (cpu);
13360 	    else
13361 	      dexDataProc2Source (cpu);
13362 	  }
13363 
13364 	else
13365 	  /* Should not reach here.  */
13366 	  HALT_NYI;
13367 
13368 	return;
13369       }
13370 
13371     case DPREG_3SRC_110:
13372       dexDataProc3Source (cpu); return;
13373 
13374     case DPREG_UNALLOC_101:
13375       HALT_UNALLOC;
13376 
13377     case DPREG_3SRC_111:
13378       dexDataProc3Source (cpu); return;
13379 
13380     default:
13381       /* Should never reach here.  */
13382       HALT_NYI;
13383     }
13384 }
13385 
13386 /* Unconditional Branch immediate.
13387    Offset is a PC-relative byte offset in the range +/- 128MiB.
13388    The offset argument is already a byte offset: the decode routines
13389    scale the raw word offset from the instruction before calling here.  */
13390 
13391 /* Unconditional branch.  */
13392 static void
13393 buc (sim_cpu *cpu, int32_t offset)
13394 {
13395   aarch64_set_next_PC_by_offset (cpu, offset);
13396 }
13397 
13398 static unsigned stack_depth = 0; /* Call depth, used to indent branch traces.  */
13399 
13400 /* Unconditional branch and link -- writes return PC to LR.  */
13401 static void
13402 bl (sim_cpu *cpu, int32_t offset)
13403 {
13404   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13405   aarch64_save_LR (cpu);
13406   aarch64_set_next_PC_by_offset (cpu, offset);
13407 
13408   if (TRACE_BRANCH_P (cpu))
13409     {
13410       ++ stack_depth;
13411       TRACE_BRANCH (cpu,
13412 		    " %*scall %" PRIx64 " [%s]"
13413 		    " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
13414 		    stack_depth, " ", aarch64_get_next_PC (cpu),
13415 		    aarch64_get_func (CPU_STATE (cpu),
13416 				      aarch64_get_next_PC (cpu)),
13417 		    aarch64_get_reg_u64 (cpu, 0, NO_SP),
13418 		    aarch64_get_reg_u64 (cpu, 1, NO_SP),
13419 		    aarch64_get_reg_u64 (cpu, 2, NO_SP)
13420 		    );
13421     }
13422 }
13423 
13424 /* Unconditional Branch register.
13425    Branch/return address is in source register.  */
13426 
13427 /* Unconditional branch.  */
13428 static void
13429 br (sim_cpu *cpu)
13430 {
13431   unsigned rn = INSTR (9, 5);
13432   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13433   aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13434 }
13435 
13436 /* Unconditional branch and link -- writes return PC to LR.  */
13437 static void
13438 blr (sim_cpu *cpu)
13439 {
13440   /* Ensure we read the destination before we write LR.  */
13441   uint64_t target = aarch64_get_reg_u64 (cpu, INSTR (9, 5), NO_SP);
13442 
13443   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13444   aarch64_save_LR (cpu);
13445   aarch64_set_next_PC (cpu, target);
13446 
13447   if (TRACE_BRANCH_P (cpu))
13448     {
13449       ++ stack_depth;
13450       TRACE_BRANCH (cpu,
13451 		    " %*scall %" PRIx64 " [%s]"
13452 		    " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
13453 		    stack_depth, " ", aarch64_get_next_PC (cpu),
13454 		    aarch64_get_func (CPU_STATE (cpu),
13455 				      aarch64_get_next_PC (cpu)),
13456 		    aarch64_get_reg_u64 (cpu, 0, NO_SP),
13457 		    aarch64_get_reg_u64 (cpu, 1, NO_SP),
13458 		    aarch64_get_reg_u64 (cpu, 2, NO_SP)
13459 		    );
13460     }
13461 }
13462 
13463 /* Return -- the assembler will default the source to LR; this is
13464    functionally equivalent to br but, presumably, unlike br it
13465    side-effects the branch predictor.  */
13466 static void
13467 ret (sim_cpu *cpu)
13468 {
13469   unsigned rn = INSTR (9, 5);
13470   aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13471 
13472   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13473   if (TRACE_BRANCH_P (cpu))
13474     {
13475       TRACE_BRANCH (cpu,
13476 		    " %*sreturn [result: %" PRIx64 "]",
13477 		    stack_depth, " ", aarch64_get_reg_u64 (cpu, 0, NO_SP));
13478       -- stack_depth;
13479     }
13480 }
13481 
13482 /* NOP -- we implement this and call it from the decode in case we
13483    want to intercept it later.  */
13484 
13485 static void
13486 nop (sim_cpu *cpu)
13487 {
13488   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13489 }
13490 
13491 /* Data synchronization barrier.  */
13492 
13493 static void
13494 dsb (sim_cpu *cpu)
13495 {
13496   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13497 }
13498 
13499 /* Data memory barrier.  */
13500 
13501 static void
13502 dmb (sim_cpu *cpu)
13503 {
13504   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13505 }
13506 
13507 /* Instruction synchronization barrier.  */
13508 
13509 static void
13510 isb (sim_cpu *cpu)
13511 {
13512   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13513 }
13514 
13515 static void
13516 dexBranchImmediate (sim_cpu *cpu)
13517 {
13518   /* assert instr[30,26] == 00101
13519      instr[31] ==> 0 == B, 1 == BL
13520      instr[25,0] == imm26 branch offset counted in words.  */
13521 
13522   uint32_t top = INSTR (31, 31);
13523   /* We have a 26 bit signed word offset which we need to pass to the
13524      execute routine as a signed byte offset.  */
13525   int32_t offset = simm32 (aarch64_get_instr (cpu), 25, 0) << 2;
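  /* E.g. imm26 == 1 branches forward one instruction (offset 4 bytes);
     the +/- 2^25 word range gives the +/- 128MiB quoted above.  */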
13526 
13527   if (top)
13528     bl (cpu, offset);
13529   else
13530     buc (cpu, offset);
13531 }
13532 
13533 /* Control Flow.  */
13534 
13535 /* Conditional branch
13536 
13537    Offset is a PC-relative byte offset in the range +/- 1MiB; pos is
13538    a bit position in the range 0 .. 63.
13539 
13540    cc is a CondCode enum value as pulled out of the decode
13541 
13542    N.B. any offset register (source) can only be Xn or Wn.  */
13543 
13544 static void
13545 bcc (sim_cpu *cpu, int32_t offset, CondCode cc)
13546 {
13547   /* The test returns TRUE if CC is met.  */
13548   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13549   if (testConditionCode (cpu, cc))
13550     aarch64_set_next_PC_by_offset (cpu, offset);
13551 }
13552 
13553 /* 32 bit branch on register non-zero.  */
13554 static void
13555 cbnz32 (sim_cpu *cpu, int32_t offset)
13556 {
13557   unsigned rt = INSTR (4, 0);
13558 
13559   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13560   if (aarch64_get_reg_u32 (cpu, rt, NO_SP) != 0)
13561     aarch64_set_next_PC_by_offset (cpu, offset);
13562 }
13563 
13564 /* 64 bit branch on register non-zero.  */
13565 static void
13566 cbnz (sim_cpu *cpu, int32_t offset)
13567 {
13568   unsigned rt = INSTR (4, 0);
13569 
13570   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13571   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) != 0)
13572     aarch64_set_next_PC_by_offset (cpu, offset);
13573 }
13574 
13575 /* 32 bit branch on register zero.  */
13576 static void
13577 cbz32 (sim_cpu *cpu, int32_t offset)
13578 {
13579   unsigned rt = INSTR (4, 0);
13580 
13581   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13582   if (aarch64_get_reg_u32 (cpu, rt, NO_SP) == 0)
13583     aarch64_set_next_PC_by_offset (cpu, offset);
13584 }
13585 
13586 /* 64 bit branch on register zero.  */
13587 static void
13588 cbz (sim_cpu *cpu, int32_t offset)
13589 {
13590   unsigned rt = INSTR (4, 0);
13591 
13592   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13593   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) == 0)
13594     aarch64_set_next_PC_by_offset (cpu, offset);
13595 }
13596 
13597 /* Branch on register bit test non-zero -- one size fits all.  */
13598 static void
13599 tbnz (sim_cpu *cpu, uint32_t  pos, int32_t offset)
13600 {
13601   unsigned rt = INSTR (4, 0);
13602 
13603   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13604   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos))
13605     aarch64_set_next_PC_by_offset (cpu, offset);
13606 }
13607 
13608 /* Branch on register bit test zero -- one size fits all.  */
13609 static void
13610 tbz (sim_cpu *cpu, uint32_t  pos, int32_t offset)
13611 {
13612   unsigned rt = INSTR (4, 0);
13613 
13614   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13615   if (!(aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos)))
13616     aarch64_set_next_PC_by_offset (cpu, offset);
13617 }
13618 
13619 static void
13620 dexCompareBranchImmediate (sim_cpu *cpu)
13621 {
13622   /* instr[30,25] = 01 1010
13623      instr[31]    = size : 0 ==> 32, 1 ==> 64
13624      instr[24]    = op : 0 ==> CBZ, 1 ==> CBNZ
13625      instr[23,5]  = simm19 branch offset counted in words
13626      instr[4,0]   = rt  */
13627 
13628   uint32_t size = INSTR (31, 31);
13629   uint32_t op   = INSTR (24, 24);
13630   int32_t offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
13631 
13632   if (size == 0)
13633     {
13634       if (op == 0)
13635 	cbz32 (cpu, offset);
13636       else
13637 	cbnz32 (cpu, offset);
13638     }
13639   else
13640     {
13641       if (op == 0)
13642 	cbz (cpu, offset);
13643       else
13644 	cbnz (cpu, offset);
13645     }
13646 }
13647 
13648 static void
13649 dexTestBranchImmediate (sim_cpu *cpu)
13650 {
13651   /* instr[31]    = b5 : bit 5 of test bit idx
13652      instr[30,25] = 01 1011
13653      instr[24]    = op : 0 ==> TBZ, 1 == TBNZ
13654      instr[23,19] = b40 : bits 4 to 0 of test bit idx
13655      instr[18,5]  = simm14 : signed offset counted in words
13656      instr[4,0]   = uimm5  */
13657 
13658   uint32_t pos = ((INSTR (31, 31) << 5) | INSTR (23, 19));
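  /* E.g. a test of bit 37 (100101) encodes b5 == 1 and b40 == 00101,
     giving pos == 37.  */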
13659   int32_t offset = simm32 (aarch64_get_instr (cpu), 18, 5) << 2;
13660 
13661   NYI_assert (30, 25, 0x1b);
13662 
13663   if (INSTR (24, 24) == 0)
13664     tbz (cpu, pos, offset);
13665   else
13666     tbnz (cpu, pos, offset);
13667 }
13668 
13669 static void
13670 dexCondBranchImmediate (sim_cpu *cpu)
13671 {
13672   /* instr[31,25] = 010 1010
13673      instr[24]    = op1 : op1:op0 == 00 ==> B.cond, ow ==> UNALLOC
13674      instr[23,5]  = simm19 : signed offset counted in words
13675      instr[4]     = op0
13676      instr[3,0]   = cond  */
13677 
13678   int32_t offset;
13679   uint32_t op = ((INSTR (24, 24) << 1) | INSTR (4, 4));
13680 
13681   NYI_assert (31, 25, 0x2a);
13682 
13683   if (op != 0)
13684     HALT_UNALLOC;
13685 
13686   offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
13687 
13688   bcc (cpu, offset, INSTR (3, 0));
13689 }
13690 
13691 static void
13692 dexBranchRegister (sim_cpu *cpu)
13693 {
13694   /* instr[31,25] = 110 1011
13695      instr[24,21] = op : 0 ==> BR, 1 => BLR, 2 => RET, 4 => ERET, 5 => DRPS
13696      instr[20,16] = op2 : must be 11111
13697      instr[15,10] = op3 : must be 000000
13698      instr[4,0]   = op4 : must be 00000.  */
13699 
13700   uint32_t op = INSTR (24, 21);
13701   uint32_t op2 = INSTR (20, 16);
13702   uint32_t op3 = INSTR (15, 10);
13703   uint32_t op4 = INSTR (4, 0);
13704 
13705   NYI_assert (31, 25, 0x6b);
13706 
13707   if (op2 != 0x1F || op3 != 0 || op4 != 0)
13708     HALT_UNALLOC;
13709 
13710   if (op == 0)
13711     br (cpu);
13712 
13713   else if (op == 1)
13714     blr (cpu);
13715 
13716   else if (op == 2)
13717     ret (cpu);
13718 
13719   else
13720     {
13721       /* ERET and DRPS require 0b11111 for rn = instr [9,5].  */
13722       /* Anything else is unallocated.  */
13723       uint32_t rn = INSTR (9, 5);
13724 
13725       if (rn != 0x1f)
13726 	HALT_UNALLOC;
13727 
13728       if (op == 4 || op == 5)
13729 	HALT_NYI;
13730 
13731       HALT_UNALLOC;
13732     }
13733 }
13734 
13735 /* FIXME: We should get the Angel SWI values from ../../libgloss/aarch64/svc.h
13736    but this may not be available.  So instead we define the values we need
13737    here.  */
13738 #define AngelSVC_Reason_Open		0x01
13739 #define AngelSVC_Reason_Close		0x02
13740 #define AngelSVC_Reason_Write		0x05
13741 #define AngelSVC_Reason_Read		0x06
13742 #define AngelSVC_Reason_IsTTY		0x09
13743 #define AngelSVC_Reason_Seek		0x0A
13744 #define AngelSVC_Reason_FLen		0x0C
13745 #define AngelSVC_Reason_Remove		0x0E
13746 #define AngelSVC_Reason_Rename		0x0F
13747 #define AngelSVC_Reason_Clock		0x10
13748 #define AngelSVC_Reason_Time		0x11
13749 #define AngelSVC_Reason_System		0x12
13750 #define AngelSVC_Reason_Errno		0x13
13751 #define AngelSVC_Reason_GetCmdLine	0x15
13752 #define AngelSVC_Reason_HeapInfo	0x16
13753 #define AngelSVC_Reason_ReportException 0x18
13754 #define AngelSVC_Reason_Elapsed         0x30
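/* Per the ARM semihosting convention the reason code arrives in w0 and
   a pointer to the parameter block in x1; handle_halt decodes both.  */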
13755 
13756 
13757 static void
13758 handle_halt (sim_cpu *cpu, uint32_t val)
13759 {
13760   uint64_t result = 0;
13761 
13762   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13763   if (val != 0xf000)
13764     {
13765       TRACE_SYSCALL (cpu, " HLT [0x%x]", val);
13766       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13767 		       sim_stopped, SIM_SIGTRAP);
13768     }
13769 
13770   /* We have encountered an Angel SVC call.  See if we can process it.  */
13771   switch (aarch64_get_reg_u32 (cpu, 0, NO_SP))
13772     {
13773     case AngelSVC_Reason_HeapInfo:
13774       {
13775 	/* Get the values.  */
13776 	uint64_t stack_top = aarch64_get_stack_start (cpu);
13777 	uint64_t heap_base = aarch64_get_heap_start (cpu);
13778 
13779 	/* Get the pointer  */
13780 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13781 	ptr = aarch64_get_mem_u64 (cpu, ptr);
13782 
13783 	/* Fill in the memory block.  */
13784 	/* Start addr of heap.  */
13785 	aarch64_set_mem_u64 (cpu, ptr +  0, heap_base);
13786 	/* End addr of heap.  */
13787 	aarch64_set_mem_u64 (cpu, ptr +  8, stack_top);
13788 	/* Lowest stack addr.  */
13789 	aarch64_set_mem_u64 (cpu, ptr + 16, heap_base);
13790 	/* Initial stack addr.  */
13791 	aarch64_set_mem_u64 (cpu, ptr + 24, stack_top);
13792 
13793 	TRACE_SYSCALL (cpu, " AngelSVC: Get Heap Info");
13794       }
13795       break;
13796 
13797     case AngelSVC_Reason_Open:
13798       {
13799 	/* Get the pointer  */
13800 	/* uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);.  */
13801 	/* FIXME: For now we just assume that we will only be asked
13802 	   to open the standard file descriptors.  */
13803 	static int fd = 0;
13804 	result = fd ++;
13805 
13806 	TRACE_SYSCALL (cpu, " AngelSVC: Open file %d", fd - 1);
13807       }
13808       break;
13809 
13810     case AngelSVC_Reason_Close:
13811       {
13812 	uint64_t fh = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13813 	TRACE_SYSCALL (cpu, " AngelSVC: Close file %d", (int) fh);
13814 	result = 0;
13815       }
13816       break;
13817 
13818     case AngelSVC_Reason_Errno:
13819       result = 0;
13820       TRACE_SYSCALL (cpu, " AngelSVC: Get Errno");
13821       break;
13822 
13823     case AngelSVC_Reason_Clock:
13824       result =
13825 #ifdef CLOCKS_PER_SEC
13826 	(CLOCKS_PER_SEC >= 100)
13827 	? (clock () / (CLOCKS_PER_SEC / 100))
13828 	: ((clock () * 100) / CLOCKS_PER_SEC)
13829 #else
13830 	/* Presume unix... clock() returns microseconds.  */
13831 	(clock () / 10000)
13832 #endif
13833 	;
13834       TRACE_SYSCALL (cpu, " AngelSVC: Get Clock");
13835       break;
13836 
13837     case AngelSVC_Reason_GetCmdLine:
13838       {
13839 	/* Get the pointer  */
13840 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13841 	ptr = aarch64_get_mem_u64 (cpu, ptr);
13842 
13843 	/* FIXME: No command line for now.  */
13844 	aarch64_set_mem_u64 (cpu, ptr, 0);
13845 	TRACE_SYSCALL (cpu, " AngelSVC: Get Command Line");
13846       }
13847       break;
13848 
13849     case AngelSVC_Reason_IsTTY:
13850       result = 1;
13851       TRACE_SYSCALL (cpu, " AngelSVC: IsTTY ?");
13852       break;
13853 
13854     case AngelSVC_Reason_Write:
13855       {
13856 	/* Get the pointer  */
13857 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13858 	/* Get the write control block.  */
13859 	uint64_t fd  = aarch64_get_mem_u64 (cpu, ptr);
13860 	uint64_t buf = aarch64_get_mem_u64 (cpu, ptr + 8);
13861 	uint64_t len = aarch64_get_mem_u64 (cpu, ptr + 16);
13862 
13863 	TRACE_SYSCALL (cpu, "write of %" PRIx64 " bytes from %"
13864 		       PRIx64 " on descriptor %" PRIx64,
13865 		       len, buf, fd);
13866 
13867 	if (len > 1280)
13868 	  {
13869 	    TRACE_SYSCALL (cpu,
13870 			   " AngelSVC: Write: Suspiciously long write: %ld",
13871 			   (long) len);
13872 	    sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13873 			     sim_stopped, SIM_SIGBUS);
13874 	  }
13875 	else if (fd == 1)
13876 	  {
13877 	    printf ("%.*s", (int) len, aarch64_get_mem_ptr (cpu, buf));
13878 	  }
13879 	else if (fd == 2)
13880 	  {
13881 	    TRACE (cpu, 0, "\n");
13882 	    sim_io_eprintf (CPU_STATE (cpu), "%.*s",
13883 			    (int) len, aarch64_get_mem_ptr (cpu, buf));
13884 	    TRACE (cpu, 0, "\n");
13885 	  }
13886 	else
13887 	  {
13888 	    TRACE_SYSCALL (cpu,
13889 			   " AngelSVC: Write: Unexpected file handle: %d",
13890 			   (int) fd);
13891 	    sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13892 			     sim_stopped, SIM_SIGABRT);
13893 	  }
13894       }
13895       break;
13896 
13897     case AngelSVC_Reason_ReportException:
13898       {
13899 	/* Get the pointer  */
13900 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13901 	/* ptr = aarch64_get_mem_u64 (cpu, ptr);  */
13902 	uint64_t type = aarch64_get_mem_u64 (cpu, ptr);
13903 	uint64_t state = aarch64_get_mem_u64 (cpu, ptr + 8);
13904 
13905 	TRACE_SYSCALL (cpu,
13906 		       "Angel Exception: type 0x%" PRIx64 " state %" PRIx64,
13907 		       type, state);
13908 
13909 	if (type == 0x20026)
13910 	  sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13911 			   sim_exited, state);
13912 	else
13913 	  sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13914 			   sim_stopped, SIM_SIGINT);
13915       }
13916       break;
13917 
13918     case AngelSVC_Reason_Read:
13919     case AngelSVC_Reason_FLen:
13920     case AngelSVC_Reason_Seek:
13921     case AngelSVC_Reason_Remove:
13922     case AngelSVC_Reason_Time:
13923     case AngelSVC_Reason_System:
13924     case AngelSVC_Reason_Rename:
13925     case AngelSVC_Reason_Elapsed:
13926     default:
13927       TRACE_SYSCALL (cpu, " HLT [Unknown angel %x]",
13928 		     aarch64_get_reg_u32 (cpu, 0, NO_SP));
13929       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13930 		       sim_stopped, SIM_SIGTRAP);
13931     }
13932 
13933   aarch64_set_reg_u64 (cpu, 0, NO_SP, result);
13934 }
13935 
13936 static void
13937 dexExcpnGen (sim_cpu *cpu)
13938 {
13939   /* instr[31:24] = 11010100
13940      instr[23,21] = opc : 000 ==> GEN EXCPN, 001 ==> BRK
13941                           010 ==> HLT,       101 ==> DBG GEN EXCPN
13942      instr[20,5]  = imm16
13943      instr[4,2]   = opc2 000 ==> OK, ow ==> UNALLOC
13944      instr[1,0]   = LL : discriminates opc  */
13945 
13946   uint32_t opc = INSTR (23, 21);
13947   uint32_t imm16 = INSTR (20, 5);
13948   uint32_t opc2 = INSTR (4, 2);
13949   uint32_t LL;
13950 
13951   NYI_assert (31, 24, 0xd4);
13952 
13953   if (opc2 != 0)
13954     HALT_UNALLOC;
13955 
13956   LL = INSTR (1, 0);
13957 
13958   /* We only implement HLT and BRK for now.  */
13959   if (opc == 1 && LL == 0)
13960     {
13961       TRACE_EVENTS (cpu, " BRK [0x%x]", imm16);
13962       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13963 		       sim_exited, aarch64_get_reg_s32 (cpu, R0, SP_OK));
13964     }
13965 
13966   if (opc == 2 && LL == 0)
13967     handle_halt (cpu, imm16);
13968 
13969   else if (opc == 0 || opc == 5)
13970     HALT_NYI;
13971 
13972   else
13973     HALT_UNALLOC;
13974 }
13975 
13976 /* Stub for accessing system registers.  */
13977 
13978 static uint64_t
13979 system_get (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
13980 	    unsigned crm, unsigned op2)
13981 {
13982   if (crn == 0 && op1 == 3 && crm == 0 && op2 == 7)
13983     /* DCZID_EL0 - the Data Cache Zero ID register.
13984        We do not support DC ZVA at the moment, so
13985        we return a value with the disable bit set.
13986        We implement support for the DCZID register since
13987        it is used by the C library's memset function.  */
13988     return ((uint64_t) 1) << 4;
13989 
13990   if (crn == 0 && op1 == 3 && crm == 0 && op2 == 1)
13991     /* Cache Type Register.  */
13992     return 0x80008000UL;
13993 
13994   if (crn == 13 && op1 == 3 && crm == 0 && op2 == 2)
13995     /* TPIDR_EL0 - thread pointer id.  */
13996     return aarch64_get_thread_id (cpu);
13997 
13998   if (op1 == 3 && crm == 4 && op2 == 0)
13999     return aarch64_get_FPCR (cpu);
14000 
14001   if (op1 == 3 && crm == 4 && op2 == 1)
14002     return aarch64_get_FPSR (cpu);
14003 
14004   else if (op1 == 3 && crm == 2 && op2 == 0)
14005     return aarch64_get_CPSR (cpu);
14006 
14007   HALT_NYI;
14008 }
14009 
14010 static void
14011 system_set (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
14012 	    unsigned crm, unsigned op2, uint64_t val)
14013 {
14014   if (op1 == 3 && crm == 4 && op2 == 0)
14015     aarch64_set_FPCR (cpu, val);
14016 
14017   else if (op1 == 3 && crm == 4 && op2 == 1)
14018     aarch64_set_FPSR (cpu, val);
14019 
14020   else if (op1 == 3 && crm == 2 && op2 == 0)
14021     aarch64_set_CPSR (cpu, val);
14022 
14023   else
14024     HALT_NYI;
14025 }
14026 
14027 static void
14028 do_mrs (sim_cpu *cpu)
14029 {
14030   /* instr[31:20] = 1101 0101 0001 1
14031      instr[19]    = op0
14032      instr[18,16] = op1
14033      instr[15,12] = CRn
14034      instr[11,8]  = CRm
14035      instr[7,5]   = op2
14036      instr[4,0]   = Rt  */
14037   unsigned sys_op0 = INSTR (19, 19) + 2; /* op0 is 0b1x, i.e. 2 or 3.  */
14038   unsigned sys_op1 = INSTR (18, 16);
14039   unsigned sys_crn = INSTR (15, 12);
14040   unsigned sys_crm = INSTR (11, 8);
14041   unsigned sys_op2 = INSTR (7, 5);
14042   unsigned rt = INSTR (4, 0);
14043 
14044   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
14045   aarch64_set_reg_u64 (cpu, rt, NO_SP,
14046 		       system_get (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2));
14047 }
14048 
14049 static void
14050 do_MSR_immediate (sim_cpu *cpu)
14051 {
14052   /* instr[31:19] = 1101 0101 0000 0
14053      instr[18,16] = op1
14054      instr[15,12] = 0100
14055      instr[11,8]  = CRm
14056      instr[7,5]   = op2
14057      instr[4,0]   = 1 1111  */
14058 
14059   unsigned op1 = INSTR (18, 16);
14060   /*unsigned crm = INSTR (11, 8);*/
14061   unsigned op2 = INSTR (7, 5);
14062 
14063   NYI_assert (31, 19, 0x1AA0);
14064   NYI_assert (15, 12, 0x4);
14065   NYI_assert (4,  0,  0x1F);
14066 
14067   if (op1 == 0)
14068     {
14069       if (op2 == 5)
14070 	HALT_NYI; /* set SPSel.  */
14071       else
14072 	HALT_UNALLOC;
14073     }
14074   else if (op1 == 3)
14075     {
14076       if (op2 == 6)
14077 	HALT_NYI; /* set DAIFset.  */
14078       else if (op2 == 7)
14079 	HALT_NYI; /* set DAIFclr.  */
14080       else
14081 	HALT_UNALLOC;
14082     }
14083   else
14084     HALT_UNALLOC;
14085 }
14086 
14087 static void
14088 do_MSR_reg (sim_cpu *cpu)
14089 {
14090   /* instr[31:20] = 1101 0101 0001
14091      instr[19]    = op0
14092      instr[18,16] = op1
14093      instr[15,12] = CRn
14094      instr[11,8]  = CRm
14095      instr[7,5]   = op2
14096      instr[4,0]   = Rt  */
14097 
14098   unsigned sys_op0 = INSTR (19, 19) + 2; /* op0 is 0b1x, i.e. 2 or 3.  */
14099   unsigned sys_op1 = INSTR (18, 16);
14100   unsigned sys_crn = INSTR (15, 12);
14101   unsigned sys_crm = INSTR (11, 8);
14102   unsigned sys_op2 = INSTR (7, 5);
14103   unsigned rt = INSTR (4, 0);
14104 
14105   NYI_assert (31, 20, 0xD51);
14106 
14107   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
14108   system_set (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2,
14109 	      aarch64_get_reg_u64 (cpu, rt, NO_SP));
14110 }
14111 
14112 static void
14113 do_SYS (sim_cpu *cpu)
14114 {
14115   /* instr[31,19] = 1101 0101 0000 1
14116      instr[18,16] = op1
14117      instr[15,12] = CRn
14118      instr[11,8]  = CRm
14119      instr[7,5]   = op2
14120      instr[4,0]   = Rt  */
14121   NYI_assert (31, 19, 0x1AA1);
14122 
14123   /* FIXME: For now we just silently accept system ops.  */
14124 }
14125 
14126 static void
14127 dexSystem (sim_cpu *cpu)
14128 {
14129   /* instr[31:22] = 1101 01010 0
14130      instr[21]    = L
14131      instr[20,19] = op0
14132      instr[18,16] = op1
14133      instr[15,12] = CRn
14134      instr[11,8]  = CRm
14135      instr[7,5]   = op2
14136      instr[4,0]   = uimm5  */
14137 
14138   /* We are interested in HINT, DSB, DMB and ISB
14139 
14140      Hint #0 encodes NOOP (this is the only hint we care about)
14141      L == 0, op0 == 0, op1 = 011, CRn = 0010, Rt = 11111,
14142      CRm != 0000 OR (CRm == 0000 AND (op2 == 000 OR op2 > 101))
14143 
14144      DSB, DMB, ISB are data synchronization barrier, data memory
14145      barrier and instruction synchronization barrier, respectively, where
14146 
14147      L == 0, op0 == 0, op1 = 011, CRn = 0011, Rt = 11111,
14148      op2 : DSB ==> 100, DMB ==> 101, ISB ==> 110
14149      CRm<3:2> ==> domain, CRm<1:0> ==> types,
14150      domain : 00 ==> OuterShareable, 01 ==> Nonshareable,
14151               10 ==> InnerShareable, 11 ==> FullSystem
14152      types :  01 ==> Reads, 10 ==> Writes,
14153               11 ==> All, 00 ==> All (domain == FullSystem).  */
14154 
14155   unsigned rt = INSTR (4, 0);
14156 
14157   NYI_assert (31, 22, 0x354);
14158 
14159   switch (INSTR (21, 12))
14160     {
14161     case 0x032:
14162       if (rt == 0x1F)
14163 	{
14164 	  /* NOP has CRm != 0000 OR
14165 	     (CRm == 0000 AND (op2 == 000 OR op2 > 101)).  */
14166 	  uint32_t crm = INSTR (11, 8);
14167 	  uint32_t op2 = INSTR (7, 5);
14168 
14169 	  if (crm != 0 || (op2 == 0 || op2 > 5))
14170 	    {
14171 	      /* Actually call nop method so we can reimplement it later.  */
14172 	      nop (cpu);
14173 	      return;
14174 	    }
14175 	}
14176       HALT_NYI;
14177 
14178     case 0x033:
14179       {
14180 	uint32_t op2 =  INSTR (7, 5);
14181 
14182 	switch (op2)
14183 	  {
14184 	  case 2: HALT_NYI;
14185 	  case 4: dsb (cpu); return;
14186 	  case 5: dmb (cpu); return;
14187 	  case 6: isb (cpu); return;
14188 	  default: HALT_UNALLOC;
14189 	  }
14190       }
14191 
14192     case 0x3B0:
14193     case 0x3B4:
14194     case 0x3BD:
14195       do_mrs (cpu);
14196       return;
14197 
14198     case 0x0B7:
14199       do_SYS (cpu); /* DC is an alias of SYS.  */
14200       return;
14201 
14202     default:
14203       if (INSTR (21, 20) == 0x1)
14204 	do_MSR_reg (cpu);
14205       else if (INSTR (21, 19) == 0 && INSTR (15, 12) == 0x4)
14206 	do_MSR_immediate (cpu);
14207       else
14208 	HALT_NYI;
14209       return;
14210     }
14211 }
14212 
14213 static void
14214 dexBr (sim_cpu *cpu)
14215 {
14216   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
14217      assert  group == GROUP_BREXSYS_1010 || group == GROUP_BREXSYS_1011
14218      bits [31,29] of a BrExSys are the secondary dispatch vector.  */
14219   uint32_t group2 = dispatchBrExSys (aarch64_get_instr (cpu));
14220 
14221   switch (group2)
14222     {
14223     case BR_IMM_000:
14224       return dexBranchImmediate (cpu);
14225 
14226     case BR_IMMCMP_001:
14227       /* Compare has bit 25 clear while test has it set.  */
14228       if (!INSTR (25, 25))
14229 	dexCompareBranchImmediate (cpu);
14230       else
14231 	dexTestBranchImmediate (cpu);
14232       return;
14233 
14234     case BR_IMMCOND_010:
14235       /* This is a conditional branch if bit 25 is clear otherwise
14236          unallocated.  */
14237       if (!INSTR (25, 25))
14238 	dexCondBranchImmediate (cpu);
14239       else
14240 	HALT_UNALLOC;
14241       return;
14242 
14243     case BR_UNALLOC_011:
14244       HALT_UNALLOC;
14245 
14246     case BR_IMM_100:
14247       dexBranchImmediate (cpu);
14248       return;
14249 
14250     case BR_IMMCMP_101:
14251       /* Compare has bit 25 clear while test has it set.  */
14252       if (!INSTR (25, 25))
14253 	dexCompareBranchImmediate (cpu);
14254       else
14255 	dexTestBranchImmediate (cpu);
14256       return;
14257 
14258     case BR_REG_110:
14259       /* Unconditional branch reg has bit 25 set.  */
14260       if (INSTR (25, 25))
14261 	dexBranchRegister (cpu);
14262 
14263       /* This includes both Excpn Gen, System and unalloc operations.
14264          We need to decode the Excpn Gen operation BRK so we can plant
14265          debugger entry points.
14266          Excpn Gen operations have instr [24] = 0.
14267          we need to decode at least one of the System operations NOP
14268          which is an alias for HINT #0.
14269          System operations have instr [24,22] = 100.  */
14270       else if (INSTR (24, 24) == 0)
14271 	dexExcpnGen (cpu);
14272 
14273       else if (INSTR (24, 22) == 4)
14274 	dexSystem (cpu);
14275 
14276       else
14277 	HALT_UNALLOC;
14278 
14279       return;
14280 
14281     case BR_UNALLOC_111:
14282       HALT_UNALLOC;
14283 
14284     default:
14285       /* Should never reach here.  */
14286       HALT_NYI;
14287     }
14288 }
14289 
14290 static void
14291 aarch64_decode_and_execute (sim_cpu *cpu, uint64_t pc)
14292 {
14293   /* We need to check if gdb wants a break in here.  */
14294   /* checkBreak (cpu);.  */
14295 
14296   uint64_t group = dispatchGroup (aarch64_get_instr (cpu));
14297 
14298   switch (group)
14299     {
14300     case GROUP_PSEUDO_0000:   dexPseudo (cpu); break;
14301     case GROUP_LDST_0100:     dexLdSt (cpu); break;
14302     case GROUP_DPREG_0101:    dexDPReg (cpu); break;
14303     case GROUP_LDST_0110:     dexLdSt (cpu); break;
14304     case GROUP_ADVSIMD_0111:  dexAdvSIMD0 (cpu); break;
14305     case GROUP_DPIMM_1000:    dexDPImm (cpu); break;
14306     case GROUP_DPIMM_1001:    dexDPImm (cpu); break;
14307     case GROUP_BREXSYS_1010:  dexBr (cpu); break;
14308     case GROUP_BREXSYS_1011:  dexBr (cpu); break;
14309     case GROUP_LDST_1100:     dexLdSt (cpu); break;
14310     case GROUP_DPREG_1101:    dexDPReg (cpu); break;
14311     case GROUP_LDST_1110:     dexLdSt (cpu); break;
14312     case GROUP_ADVSIMD_1111:  dexAdvSIMD1 (cpu); break;
14313 
14314     case GROUP_UNALLOC_0001:
14315     case GROUP_UNALLOC_0010:
14316     case GROUP_UNALLOC_0011:
14317       HALT_UNALLOC;
14318 
14319     default:
14320       /* Should never reach here.  */
14321       HALT_NYI;
14322     }
14323 }
14324 
14325 static bfd_boolean
14326 aarch64_step (sim_cpu *cpu)
14327 {
14328   uint64_t pc = aarch64_get_PC (cpu);
14329 
14330   if (pc == TOP_LEVEL_RETURN_PC)
14331     return FALSE;
14332 
14333   aarch64_set_next_PC (cpu, pc + 4);
14334 
14335   /* Code is always little-endian.  */
14336   sim_core_read_buffer (CPU_STATE (cpu), cpu, read_map,
14337 			& aarch64_get_instr (cpu), pc, 4);
14338   aarch64_get_instr (cpu) = endian_le2h_4 (aarch64_get_instr (cpu));
14339 
14340   TRACE_INSN (cpu, " pc = %" PRIx64 " instr = %08x", pc,
14341 	      aarch64_get_instr (cpu));
14342   TRACE_DISASM (cpu, pc);
14343 
14344   aarch64_decode_and_execute (cpu, pc);
14345 
14346   return TRUE;
14347 }
14348 
14349 void
14350 aarch64_run (SIM_DESC sd)
14351 {
14352   sim_cpu *cpu = STATE_CPU (sd, 0);
14353 
14354   while (aarch64_step (cpu))
14355     {
14356       aarch64_update_PC (cpu);
14357 
14358       if (sim_events_tick (sd))
14359 	sim_events_process (sd);
14360     }
14361 
14362   sim_engine_halt (sd, cpu, NULL, aarch64_get_PC (cpu),
14363 		   sim_exited, aarch64_get_reg_s32 (cpu, R0, NO_SP));
14364 }
14365 
14366 void
14367 aarch64_init (sim_cpu *cpu, uint64_t pc)
14368 {
14369   uint64_t sp = aarch64_get_stack_start (cpu);
14370 
14371   /* Install SP, FP and PC and set LR to -20
14372      so we can detect a top-level return.  */
14373   aarch64_set_reg_u64 (cpu, SP, SP_OK, sp);
14374   aarch64_set_reg_u64 (cpu, FP, SP_OK, sp);
14375   aarch64_set_reg_u64 (cpu, LR, SP_OK, TOP_LEVEL_RETURN_PC);
14376   aarch64_set_next_PC (cpu, pc);
14377   aarch64_update_PC (cpu);
14378   aarch64_init_LIT_table ();
14379 }
14380