/* simulator.c -- Interface for the AArch64 simulator.

   Copyright (C) 2015-2017 Free Software Foundation, Inc.

   Contributed by Red Hat.

   This file is part of GDB.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

#include "config.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <math.h>
#include <time.h>
#include <limits.h>

#include "simulator.h"
#include "cpustate.h"
#include "memory.h"

#define NO_SP 0
#define SP_OK 1

#define TST(_flag)   (aarch64_test_CPSR_bit (cpu, _flag))
#define IS_SET(_X)   (TST (( _X )) ? 1 : 0)
#define IS_CLEAR(_X) (TST (( _X )) ? 0 : 1)

/* Space saver macro.  */
#define INSTR(HIGH, LOW) uimm (aarch64_get_instr (cpu), (HIGH), (LOW))

#define HALT_UNALLOC							\
  do									\
    {									\
      TRACE_DISASM (cpu, aarch64_get_PC (cpu));				\
      TRACE_INSN (cpu,							\
		  "Unallocated instruction detected at sim line %d,"	\
		  " exe addr %" PRIx64,					\
		  __LINE__, aarch64_get_PC (cpu));			\
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
		       sim_stopped, SIM_SIGILL);			\
    }									\
  while (0)

#define HALT_NYI							\
  do									\
    {									\
      TRACE_DISASM (cpu, aarch64_get_PC (cpu));				\
      TRACE_INSN (cpu,							\
		  "Unimplemented instruction detected at sim line %d,"	\
		  " exe addr %" PRIx64,					\
		  __LINE__, aarch64_get_PC (cpu));			\
      if (! TRACE_ANY_P (cpu))						\
        sim_io_eprintf (CPU_STATE (cpu), "SIM Error: Unimplemented instruction: %#08x\n", \
                        aarch64_get_instr (cpu));			\
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
		       sim_stopped, SIM_SIGABRT);			\
    }									\
  while (0)

#define NYI_assert(HI, LO, EXPECTED)					\
  do									\
    {									\
      if (INSTR ((HI), (LO)) != (EXPECTED))				\
	HALT_NYI;							\
    }									\
  while (0)

/* Helper functions used by expandLogicalImmediate.  */

/* For i = 1, ..., N set result<i-1> = 1; all other bits are zero,
   e.g. ones (4) == 0xf.  */
static inline uint64_t
ones (int N)
{
  return (N == 64 ? (uint64_t)-1UL : (((uint64_t) 1 << N) - 1));
}

/* Pick bit N of val and return it in bit 0 of the result.  */
static inline uint64_t
pickbit (uint64_t val, int N)
{
  return pickbits64 (val, N, N);
}

static uint64_t
expand_logical_immediate (uint32_t S, uint32_t R, uint32_t N)
{
  uint64_t mask;
  uint64_t imm;
  unsigned simd_size;

  /* The immediate value is S+1 bits set to 1, left rotated by
     simd_size - R (in other words, right rotated by R), then
     replicated.  */
  if (N != 0)
    {
      simd_size = 64;
      mask = 0xffffffffffffffffull;
    }
  else
    {
      switch (S)
	{
	case 0x00 ... 0x1f: /* 0xxxxx */ simd_size = 32;           break;
	case 0x20 ... 0x2f: /* 10xxxx */ simd_size = 16; S &= 0xf; break;
	case 0x30 ... 0x37: /* 110xxx */ simd_size =  8; S &= 0x7; break;
	case 0x38 ... 0x3b: /* 1110xx */ simd_size =  4; S &= 0x3; break;
	case 0x3c ... 0x3d: /* 11110x */ simd_size =  2; S &= 0x1; break;
	default: return 0;
	}
      mask = (1ull << simd_size) - 1;
      /* Top bits are IGNORED.  */
      R &= simd_size - 1;
    }

  /* NOTE: if S = simd_size - 1 we get 0xf..f which is rejected.  */
  if (S == simd_size - 1)
    return 0;

  /* S+1 consecutive bits set to 1.  */
  /* NOTE: S can't be 63 due to detection above.  */
  imm = (1ull << (S + 1)) - 1;

  /* Rotate to the left by simd_size - R.  */
  if (R != 0)
    imm = ((imm << (simd_size - R)) & mask) | (imm >> R);

  /* Replicate the value according to SIMD size.  */
  switch (simd_size)
    {
    case  2: imm = (imm <<  2) | imm; /* Fall through.  */
    case  4: imm = (imm <<  4) | imm; /* Fall through.  */
    case  8: imm = (imm <<  8) | imm; /* Fall through.  */
    case 16: imm = (imm << 16) | imm; /* Fall through.  */
    case 32: imm = (imm << 32) | imm; /* Fall through.  */
    case 64: break;
    default: return 0;
    }

  return imm;
}
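
/* Worked example (hand-computed from the algorithm above, not taken
   from an ARM ARM table): with N = 0, immr = 0, imms = 0x3c the
   switch selects simd_size = 2 and S = 0, so the element is 0b01 and
   replication yields 0x5555555555555555.  With immr = 1 the element
   is rotated to 0b10 before replication, giving 0xaaaaaaaaaaaaaaaa.  */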

/* Instr[22,10] encodes N, immr and imms.  We want a lookup table
   for each possible combination, i.e. 13 bits worth of int entries.  */
#define  LI_TABLE_SIZE  (1 << 13)
static uint64_t LITable[LI_TABLE_SIZE];

void
aarch64_init_LIT_table (void)
{
  unsigned index;

  for (index = 0; index < LI_TABLE_SIZE; index++)
    {
      uint32_t N    = uimm (index, 12, 12);
      uint32_t immr = uimm (index, 11, 6);
      uint32_t imms = uimm (index, 5, 0);

      LITable [index] = expand_logical_immediate (imms, immr, N);
    }
}
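
/* Usage sketch (hypothetical caller, not code from this file): a
   decoder holding the thirteen instruction bits [22,10] (N:immr:imms)
   can expand a bitmask immediate with a single lookup, treating an
   all-zero entry as an invalid encoding:

     uint32_t key  = INSTR (22, 10);
     uint64_t mask = LITable[key];
     if (mask == 0)
       HALT_UNALLOC;  */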

static void
dexNotify (sim_cpu *cpu)
{
  /* instr[14,0] == type : 0 ==> method entry, 1 ==> method reentry
                           2 ==> exit Java, 3 ==> start next bytecode.  */
  uint32_t type = INSTR (14, 0);

  TRACE_EVENTS (cpu, "Notify Insn encountered, type = 0x%x", type);

  switch (type)
    {
    case 0:
      /* aarch64_notifyMethodEntry (aarch64_get_reg_u64 (cpu, R23, 0),
	 aarch64_get_reg_u64 (cpu, R22, 0));  */
      break;
    case 1:
      /* aarch64_notifyMethodReentry (aarch64_get_reg_u64 (cpu, R23, 0),
	 aarch64_get_reg_u64 (cpu, R22, 0));  */
      break;
    case 2:
      /* aarch64_notifyMethodExit ();  */
      break;
    case 3:
      /* aarch64_notifyBCStart (aarch64_get_reg_u64 (cpu, R23, 0),
	 aarch64_get_reg_u64 (cpu, R22, 0));  */
      break;
    }
}

/* secondary decode within top level groups  */

static void
dexPseudo (sim_cpu *cpu)
{
  /* assert instr[28,27] = 00

     We provide 2 pseudo instructions:

     HALT stops execution of the simulator causing an immediate
     return to the x86 code which entered it.

     CALLOUT initiates recursive entry into x86 code.  A register
     argument holds the address of the x86 routine.  Immediate
     values in the instruction identify the number of general
     purpose and floating point register arguments to be passed
     and the type of any value to be returned.  */

  uint32_t PSEUDO_HALT      =  0xE0000000U;
  uint32_t PSEUDO_CALLOUT   =  0x00018000U;
  uint32_t PSEUDO_CALLOUTR  =  0x00018001U;
  uint32_t PSEUDO_NOTIFY    =  0x00014000U;
  uint32_t dispatch;

  if (aarch64_get_instr (cpu) == PSEUDO_HALT)
    {
      TRACE_EVENTS (cpu, " Pseudo Halt Instruction");
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
		       sim_stopped, SIM_SIGTRAP);
    }

  dispatch = INSTR (31, 15);

  /* We do not handle callouts at the moment.  */
  if (dispatch == PSEUDO_CALLOUT || dispatch == PSEUDO_CALLOUTR)
    {
      TRACE_EVENTS (cpu, " Callout");
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
		       sim_stopped, SIM_SIGABRT);
    }

  else if (dispatch == PSEUDO_NOTIFY)
    dexNotify (cpu);

  else
    HALT_UNALLOC;
}

/* Load-store single register (unscaled offset)
   These instructions employ a base register plus an unscaled signed
   9 bit offset.

   N.B. the base register (source) can be Xn or SP.  All other
   registers may not be SP.  */

/* 32 bit load 32 bit unscaled signed 9 bit.  */
static void
ldur32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 64 bit load 64 bit unscaled signed 9 bit.  */
static void
ldur64 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 32 bit load zero-extended byte unscaled signed 9 bit.  */
static void
ldurb32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 32 bit load sign-extended byte unscaled signed 9 bit.  */
static void
ldursb32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, (uint32_t) aarch64_get_mem_s8
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 64 bit load sign-extended byte unscaled signed 9 bit.  */
static void
ldursb64 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s8
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 32 bit load zero-extended short unscaled signed 9 bit  */
static void
ldurh32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_mem_u16
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 32 bit load sign-extended short unscaled signed 9 bit  */
static void
ldursh32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) aarch64_get_mem_s16
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 64 bit load sign-extended short unscaled signed 9 bit  */
static void
ldursh64 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s16
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 64 bit load sign-extended word unscaled signed 9 bit  */
static void
ldursw (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rd, NO_SP, aarch64_get_mem_s32
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* N.B. with stores the value in source is written to the address
   identified by source2 modified by offset.  */

/* 32 bit store 32 bit unscaled signed 9 bit.  */
static void
stur32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u32 (cpu,
		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
		       aarch64_get_reg_u32 (cpu, rd, NO_SP));
}

/* 64 bit store 64 bit unscaled signed 9 bit  */
static void
stur64 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu,
		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
		       aarch64_get_reg_u64 (cpu, rd, NO_SP));
}

/* 32 bit store byte unscaled signed 9 bit  */
static void
sturb (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u8 (cpu,
		      aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
		      aarch64_get_reg_u8 (cpu, rd, NO_SP));
}

/* 32 bit store short unscaled signed 9 bit  */
static void
sturh (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u16 (cpu,
		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
		       aarch64_get_reg_u16 (cpu, rd, NO_SP));
}

/* Load single register pc-relative label
   Offset is a signed 19 bit immediate count in words
   rt may not be SP.  */
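
/* For example, "LDR W0, lbl" with lbl 16 bytes beyond the load is
   encoded with a word offset of 4; the handlers below multiply the
   offset by 4 to recover the byte displacement from the PC.  */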

/* 32 bit pc-relative load  */
static void
ldr32_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP,
		       aarch64_get_mem_u32
		       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* 64 bit pc-relative load  */
static void
ldr_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP,
		       aarch64_get_mem_u64
		       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* sign extended 32 bit pc-relative load  */
static void
ldrsw_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP,
		       aarch64_get_mem_s32
		       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* float pc-relative load  */
static void
fldrs_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned int rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, rd, 0,
		       aarch64_get_mem_u32
		       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* double pc-relative load  */
static void
fldrd_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned int st = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, st, 0,
		       aarch64_get_mem_u64
		       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* long double pc-relative load.  */
static void
fldrq_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned int st = INSTR (4, 0);
  uint64_t addr = aarch64_get_PC (cpu) + offset * 4;
  FRegister a;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_get_mem_long_double (cpu, addr, & a);
  aarch64_set_FP_long_double (cpu, st, a);
}

/* This can be used to scale an offset by applying
   the requisite shift.  The second argument is either
   16, 32, 64 or 128.  */

#define SCALE(_offset, _elementSize) \
    ((_offset) << ScaleShift ## _elementSize)

/* This can be used to optionally scale a register derived offset
   by applying the requisite shift as indicated by the Scaling
   argument.  The second argument is the element size in bits:
   16, 32, 64 or 128.  The third argument is either Scaled or
   Unscaled.
   N.B. when _Scaling is Scaled the offset is shifted left by the
   element's scale shift; when it is Unscaled the shift amount is 0.  */

#define OPT_SCALE(_offset, _elementType, _Scaling) \
  ((_offset) << (_Scaling ? ScaleShift ## _elementType : 0))
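
/* For instance, assuming ScaleShift32 == 2 (4 byte elements), a
   register offset of 1 becomes a byte displacement of 4 when scaled:
   OPT_SCALE (1, 32, Scaled) == 4, while OPT_SCALE (1, 32, Unscaled)
   leaves it at 1.  */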

/* This can be used to zero or sign extend a 32 bit register derived
   value to a 64 bit value.  The first argument must be the value as
   a uint32_t and the second must be either UXTW or SXTW.  The result
   is returned as an int64_t.  */

static inline int64_t
extend (uint32_t value, Extension extension)
{
  union
  {
    uint32_t u;
    int32_t   n;
  } x;

  /* A branchless variant of this ought to be possible.  */
  if (extension == UXTW || extension == NoExtension)
    return value;

  x.u = value;
  return x.n;
}
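
/* For example, extend (0xffffffff, SXTW) returns -1 since the sign
   bit is propagated, whereas extend (0xffffffff, UXTW) returns
   0xffffffff.  */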

/* Scalar Floating Point

   FP load/store single register (4 addressing modes)

   N.B. the base register (source) can be the stack pointer.
   The secondary source register (source2) can only be an Xn register.  */

/* Load 32 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
fldrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32 (cpu, address));
  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* Load 8 bit with unsigned 12 bit offset.  */
static void
fldrb_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u8 (cpu, rd, 0, aarch64_get_mem_u8 (cpu, addr));
}

/* Load 16 bit scaled unsigned 12 bit.  */
static void
fldrh_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u16 (cpu, rd, 0, aarch64_get_mem_u16 (cpu, addr));
}

/* Load 32 bit scaled unsigned 12 bit.  */
static void
fldrs_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, rd, 0, aarch64_get_mem_u32 (cpu, addr));
}

/* Load 64 bit scaled unsigned 12 bit.  */
static void
fldrd_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr));
}

/* Load 128 bit scaled unsigned 12 bit.  */
static void
fldrq_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr));
  aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_mem_u64 (cpu, addr + 8));
}

/* Load 32 bit scaled or unscaled zero- or sign-extended
   32-bit register offset.  */
static void
fldrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 32, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
		       (cpu, address + displacement));
}

/* Load 64 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
fldrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* Load 64 bit scaled or unscaled zero- or sign-extended 32-bit register offset.  */
static void
fldrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 64, scaling);

  fldrd_wb (cpu, displacement, NoWriteBack);
}

/* Load 128 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
fldrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  FRegister a;
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_get_mem_long_double (cpu, address, & a);
  aarch64_set_FP_long_double (cpu, st, a);

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* Load 128 bit scaled or unscaled zero- or sign-extended 32-bit register offset  */
static void
fldrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 128, scaling);

  fldrq_wb (cpu, displacement, NoWriteBack);
}

/* Memory Access

   load-store single register
   There are four addressing modes available here which all employ a
   64 bit source (base) register.

   N.B. the base register (source) can be the stack pointer.
   The secondary source register (source2) can only be an Xn register.

   Scaled, 12-bit, unsigned immediate offset, without pre- and
   post-index options.
   Unscaled, 9-bit, signed immediate offset with pre- or post-index
   writeback.
   Scaled or unscaled 64-bit register offset.
   Scaled or unscaled 32-bit extended register offset.

   All offsets are assumed to be raw from the decode, i.e. the
   simulator is expected to adjust scaled offsets based on the
   accessed data size.  The same applies to the register and extended
   register offset versions, except that in the latter case the
   operation may also require a sign extend.

   A separate method is provided for each possible addressing mode.  */
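
/* Illustrative mapping from assembly forms to the handlers below
   (for the 32 bit case; other sizes follow the same pattern):

     ldr w0, [x1, #8]          ldr32_abs        scaled 12-bit offset
     ldr w0, [x1, #-8]!        ldr32_wb         pre-index writeback
     ldr w0, [x1], #8          ldr32_wb         post-index writeback
     ldr w0, [x1, w2, sxtw]    ldr32_scale_ext  extended register  */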

/* 32 bit load 32 bit scaled unsigned 12 bit  */
static void
ldr32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ SCALE (offset, 32)));
}

/* 32 bit load 32 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
ldr32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit load 32 bit scaled or unscaled
   zero- or sign-extended 32-bit register offset  */
static void
ldr32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement =  OPT_SCALE (extended, 32, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
		       aarch64_get_mem_u32 (cpu, address + displacement));
}

/* 64 bit load 64 bit scaled unsigned 12 bit  */
static void
ldr_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ SCALE (offset, 64)));
}

/* 64 bit load 64 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
ldr_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit load 64 bit scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldr_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement =  OPT_SCALE (extended, 64, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
		       aarch64_get_mem_u64 (cpu, address + displacement));
}

/* 32 bit load zero-extended byte scaled unsigned 12 bit.  */
static void
ldrb32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be;
     there is no scaling required for a byte load.  */
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
		       aarch64_get_mem_u8
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
}

/* 32 bit load zero-extended byte unscaled signed 9 bit with pre- or post-writeback.  */
static void
ldrb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit load zero-extended byte scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
				 extension);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* There is no scaling required for a byte load.  */
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
		       aarch64_get_mem_u8 (cpu, address + displacement));
}

/* 64 bit load sign-extended byte unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrsb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;
  int64_t val;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  val = aarch64_get_mem_s8 (cpu, address);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit load sign-extended byte scaled unsigned 12 bit.  */
static void
ldrsb_abs (sim_cpu *cpu, uint32_t offset)
{
  ldrsb_wb (cpu, offset, NoWriteBack);
}

/* 64 bit load sign-extended byte scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrsb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
				 extension);
  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* There is no scaling required for a byte load.  */
  aarch64_set_reg_s64 (cpu, rt, NO_SP,
		       aarch64_get_mem_s8 (cpu, address + displacement));
}

/* 32 bit load zero-extended short scaled unsigned 12 bit.  */
static void
ldrh32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint32_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  val = aarch64_get_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			     + SCALE (offset, 16));
  aarch64_set_reg_u32 (cpu, rt, NO_SP, val);
}

/* 32 bit load zero-extended short unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u32 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit load zero-extended short scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement =  OPT_SCALE (extended, 16, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u32 (cpu, rt, NO_SP,
		       aarch64_get_mem_u16 (cpu, address + displacement));
}

/* 32 bit load sign-extended short scaled unsigned 12 bit.  */
static void
ldrsh32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  int32_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  val = aarch64_get_mem_s16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			     + SCALE (offset, 16));
  aarch64_set_reg_s32 (cpu, rt, NO_SP, val);
}

/* 32 bit load sign-extended short unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrsh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s32 (cpu, rt, NO_SP,
		       (int32_t) aarch64_get_mem_s16 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit load sign-extended short scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrsh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement =  OPT_SCALE (extended, 16, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s32 (cpu, rt, NO_SP,
		       (int32_t) aarch64_get_mem_s16
		       (cpu, address + displacement));
}

/* 64 bit load sign-extended short scaled unsigned 12 bit.  */
static void
ldrsh_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  int64_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  val = aarch64_get_mem_s16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			     + SCALE (offset, 16));
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
}

/* 64 bit load sign-extended short unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrsh64_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;
  int64_t val;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  val = aarch64_get_mem_s16 (cpu, address);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit load sign-extended short scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrsh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 16, scaling);
  int64_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  val = aarch64_get_mem_s16 (cpu, address + displacement);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
}

/* 64 bit load sign-extended 32 bit scaled unsigned 12 bit.  */
static void
ldrsw_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  int64_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  val = aarch64_get_mem_s32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			     + SCALE (offset, 32));
  /* The target register may not be SP but the source may be.  */
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
}

/* 64 bit load sign-extended 32 bit unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrsw_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s32 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit load sign-extended 32 bit scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrsw_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement =  OPT_SCALE (extended, 32, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rt, NO_SP,
		       aarch64_get_mem_s32 (cpu, address + displacement));
}

/* N.B. with stores the value in source is written to the
   address identified by source2 modified by source3/offset.  */

/* 32 bit store scaled unsigned 12 bit.  */
static void
str32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  aarch64_set_mem_u32 (cpu, (aarch64_get_reg_u64 (cpu, rn, SP_OK)
			     + SCALE (offset, 32)),
		       aarch64_get_reg_u32 (cpu, rt, NO_SP));
}

/* 32 bit store unscaled signed 9 bit with pre- or post-writeback.  */
static void
str32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u32 (cpu, address, aarch64_get_reg_u32 (cpu, rt, NO_SP));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit store scaled or unscaled zero- or
   sign-extended 32-bit register offset.  */
static void
str32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t  extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 32, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u32 (cpu, address + displacement,
		       aarch64_get_reg_u32 (cpu, rt, NO_SP));
}

/* 64 bit store scaled unsigned 12 bit.  */
static void
str_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu,
		       aarch64_get_reg_u64 (cpu, rn, SP_OK)
		       + SCALE (offset, 64),
		       aarch64_get_reg_u64 (cpu, rt, NO_SP));
}

/* 64 bit store unscaled signed 9 bit with pre- or post-writeback.  */
static void
str_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu, address, aarch64_get_reg_u64 (cpu, rt, NO_SP));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit store scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
str_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
			       extension);
  uint64_t displacement = OPT_SCALE (extended, 64, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu, address + displacement,
		       aarch64_get_reg_u64 (cpu, rt, NO_SP));
}

/* 32 bit store byte scaled unsigned 12 bit.  */
static void
strb_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.
     There is no scaling required for a byte store.  */
  aarch64_set_mem_u8 (cpu,
		      aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
		      aarch64_get_reg_u8 (cpu, rt, NO_SP));
}

/* 32 bit store byte unscaled signed 9 bit with pre- or post-writeback.  */
static void
strb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u8 (cpu, address, aarch64_get_reg_u8 (cpu, rt, NO_SP));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit store byte scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
strb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
				 extension);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* There is no scaling required for a byte store.  */
  aarch64_set_mem_u8 (cpu, address + displacement,
		      aarch64_get_reg_u8 (cpu, rt, NO_SP));
}

/* 32 bit store short scaled unsigned 12 bit.  */
static void
strh_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  aarch64_set_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
		       + SCALE (offset, 16),
		       aarch64_get_reg_u16 (cpu, rt, NO_SP));
}

/* 32 bit store short unscaled signed 9 bit with pre- or post-writeback.  */
static void
strh_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u16 (cpu, address, aarch64_get_reg_u16 (cpu, rt, NO_SP));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit store short scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
strh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement =  OPT_SCALE (extended, 16, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u16 (cpu, address + displacement,
		       aarch64_get_reg_u16 (cpu, rt, NO_SP));
}

/* Prefetch unsigned 12 bit.  */
static void
prfm_abs (sim_cpu *cpu, uint32_t offset)
{
  /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
                          00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
                          00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
                          10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
                          10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
                          10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
                          ow ==> UNALLOC
     PrfOp prfop = prfop (instr, 4, 0);
     uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK)
     + SCALE (offset, 64).  */

  /* TODO : implement prefetch of address.  */
}

/* Prefetch scaled or unscaled zero- or sign-extended 32-bit register offset.  */
static void
prfm_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
                          00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
                          00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
                          10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
                          10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
                          10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
                          ow ==> UNALLOC
     rn may reference SP, rm may only reference ZR
     PrfOp prfop = prfop (instr, 4, 0);
     uint64_t base = aarch64_get_reg_u64 (cpu, rn, SP_OK);
     int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
                                extension);
     uint64_t displacement =  OPT_SCALE (extended, 64, scaling);
     uint64_t address = base + displacement.  */

  /* TODO : implement prefetch of address  */
}

/* 64 bit pc-relative prefetch.  */
static void
prfm_pcrel (sim_cpu *cpu, int32_t offset)
{
  /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
                          00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
                          00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
                          10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
                          10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
                          10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
                          ow ==> UNALLOC
     PrfOp prfop = prfop (instr, 4, 0);
     uint64_t address = aarch64_get_PC (cpu) + offset.  */

  /* TODO : implement this  */
}

/* Load-store exclusive.  */

static void
ldxr (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int size = INSTR (31, 30);
  /* int ordered = INSTR (15, 15);  */
  /* int exclusive = ! INSTR (23, 23);  */

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (size)
    {
    case 0:
      aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address));
      break;
    case 1:
      aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address));
      break;
    case 2:
      aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address));
      break;
    case 3:
      aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address));
      break;
    }
}

static void
stxr (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  unsigned rs = INSTR (20, 16);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int      size = INSTR (31, 30);
  uint64_t data = aarch64_get_reg_u64 (cpu, rt, NO_SP);

  switch (size)
    {
    case 0: aarch64_set_mem_u8 (cpu, address, data); break;
    case 1: aarch64_set_mem_u16 (cpu, address, data); break;
    case 2: aarch64_set_mem_u32 (cpu, address, data); break;
    case 3: aarch64_set_mem_u64 (cpu, address, data); break;
    }

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The exclusive monitor is not modelled; the store always succeeds,
     so report status 0 in rs.  */
  aarch64_set_reg_u64 (cpu, rs, NO_SP, 0);
}
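
/* Guest code typically wraps these in a retry loop, e.g.

     loop:  ldxr  w1, [x0]
            add   w1, w1, #1
            stxr  w2, w1, [x0]
            cbnz  w2, loop

   Because stxr above always reports success (w2 == 0), such loops
   complete on their first iteration in this simulator.  */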

static void
dexLoadLiteral (sim_cpu *cpu)
{
  /* instr[29,27] == 011
     instr[25,24] == 00
     instr[31,30:26] = opc: 000 ==> LDRW,  001 ==> FLDRS
                            010 ==> LDRX,  011 ==> FLDRD
                            100 ==> LDRSW, 101 ==> FLDRQ
                            110 ==> PRFM,  111 ==> UNALLOC
     instr[26] ==> V : 0 ==> GReg, 1 ==> FReg
     instr[23, 5] == simm19  */

  /* unsigned rt = INSTR (4, 0);  */
  uint32_t dispatch = (INSTR (31, 30) << 1) | INSTR (26, 26);
  int32_t imm = simm32 (aarch64_get_instr (cpu), 23, 5);

  switch (dispatch)
    {
    case 0: ldr32_pcrel (cpu, imm); break;
    case 1: fldrs_pcrel (cpu, imm); break;
    case 2: ldr_pcrel   (cpu, imm); break;
    case 3: fldrd_pcrel (cpu, imm); break;
    case 4: ldrsw_pcrel (cpu, imm); break;
    case 5: fldrq_pcrel (cpu, imm); break;
    case 6: prfm_pcrel  (cpu, imm); break;
    case 7:
    default:
      HALT_UNALLOC;
    }
}
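
/* Decode sketch (hand-checked example): the instruction word
   0x18000040 is LDR w0, .+8; bits [31,30] = 00 and bit [26] = 0 give
   dispatch = 0, routing it to ldr32_pcrel with simm19 = 2.  */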

/* Immediate arithmetic
   The aimm argument is a 12 bit unsigned value or a 12 bit unsigned
   value left shifted by 12 bits (done at decode).

   N.B. the register args (dest, source) can normally be Xn or SP.
   The exception occurs for flag setting instructions which may
   only use Xn for the output (dest).  */

/* 32 bit add immediate.  */
static void
add32 (sim_cpu *cpu, uint32_t aimm)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, SP_OK,
		       aarch64_get_reg_u32 (cpu, rn, SP_OK) + aimm);
}

/* 64 bit add immediate.  */
static void
add64 (sim_cpu *cpu, uint32_t aimm)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, SP_OK,
		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + aimm);
}

static void
set_flags_for_add32 (sim_cpu *cpu, int32_t value1, int32_t value2)
{
  int32_t   result = value1 + value2;
  int64_t   sresult = (int64_t) value1 + (int64_t) value2;
  uint64_t  uresult = (uint64_t)(uint32_t) value1
    + (uint64_t)(uint32_t) value2;
  uint32_t  flags = 0;

  if (result == 0)
    flags |= Z;

  if (result & (1 << 31))
    flags |= N;

  if (uresult != (uint32_t)result)
    flags |= C;

  if (sresult != result)
    flags |= V;

  aarch64_set_CPSR (cpu, flags);
}
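
/* Example of the NZCV computation above: adding 0x7fffffff + 1
   overflows the signed range, so set_flags_for_add32 produces N and V
   set (result 0x80000000) with C and Z clear; 0xffffffff + 1 instead
   produces Z and C set (unsigned carry out, result 0).  */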

/* N.B. these macros rely on a local variable, signbit, holding the
   sign bit for the operand width.  */
#define NEG(a) (((a) & signbit) == signbit)
#define POS(a) (((a) & signbit) == 0)

static void
set_flags_for_add64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
{
  uint64_t result = value1 + value2;
  uint32_t flags = 0;
  uint64_t signbit = 1ULL << 63;

  if (result == 0)
    flags |= Z;

  if (NEG (result))
    flags |= N;

  if (   (NEG (value1) && NEG (value2))
      || (NEG (value1) && POS (result))
      || (NEG (value2) && POS (result)))
    flags |= C;

  if (   (NEG (value1) && NEG (value2) && POS (result))
      || (POS (value1) && POS (value2) && NEG (result)))
    flags |= V;

  aarch64_set_CPSR (cpu, flags);
}
1689 
1690 static void
1691 set_flags_for_sub32 (sim_cpu *cpu, uint32_t value1, uint32_t value2)
1692 {
1693   uint32_t result = value1 - value2;
1694   uint32_t flags = 0;
1695   uint32_t signbit = 1U << 31;
1696 
1697   if (result == 0)
1698     flags |= Z;
1699 
1700   if (NEG (result))
1701     flags |= N;
1702 
1703   if (   (NEG (value1) && POS (value2))
1704       || (NEG (value1) && POS (result))
1705       || (POS (value2) && POS (result)))
1706     flags |= C;
1707 
1708   if (   (NEG (value1) && POS (value2) && POS (result))
1709       || (POS (value1) && NEG (value2) && NEG (result)))
1710     flags |= V;
1711 
1712   aarch64_set_CPSR (cpu, flags);
1713 }
1714 
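/* Example (illustrative): in set_flags_for_sub32, 5 - 7 leaves all
   three carry terms false (both inputs positive, result negative), so
   C ends up clear, signalling a borrow; for 7 - 5 the
   POS (value2) && POS (result) term holds and C is set.  */
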
1715 static void
1716 set_flags_for_sub64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
1717 {
1718   uint64_t result = value1 - value2;
1719   uint32_t flags = 0;
1720   uint64_t signbit = 1ULL << 63;
1721 
1722   if (result == 0)
1723     flags |= Z;
1724 
1725   if (NEG (result))
1726     flags |= N;
1727 
1728   if (   (NEG (value1) && POS (value2))
1729       || (NEG (value1) && POS (result))
1730       || (POS (value2) && POS (result)))
1731     flags |= C;
1732 
1733   if (   (NEG (value1) && POS (value2) && POS (result))
1734       || (POS (value1) && NEG (value2) && NEG (result)))
1735     flags |= V;
1736 
1737   aarch64_set_CPSR (cpu, flags);
1738 }
1739 
1740 static void
1741 set_flags_for_binop32 (sim_cpu *cpu, uint32_t result)
1742 {
1743   uint32_t flags = 0;
1744 
1745   if (result == 0)
1746     flags |= Z;
1747   else
1748     flags &= ~ Z;
1749 
1750   if (result & (1 << 31))
1751     flags |= N;
1752   else
1753     flags &= ~ N;
1754 
1755   aarch64_set_CPSR (cpu, flags);
1756 }
1757 
1758 static void
1759 set_flags_for_binop64 (sim_cpu *cpu, uint64_t result)
1760 {
1761   uint32_t flags = 0;
1762 
1763   if (result == 0)
1764     flags |= Z;
1765   else
1766     flags &= ~ Z;
1767 
1768   if (result & (1ULL << 63))
1769     flags |= N;
1770   else
1771     flags &= ~ N;
1772 
1773   aarch64_set_CPSR (cpu, flags);
1774 }
1775 
1776 /* 32 bit add immediate set flags.  */
1777 static void
1778 adds32 (sim_cpu *cpu, uint32_t aimm)
1779 {
1780   unsigned rn = INSTR (9, 5);
1781   unsigned rd = INSTR (4, 0);
1782   /* TODO : do we need to worry about signs here?  */
1783   int32_t value1 = aarch64_get_reg_s32 (cpu, rn, SP_OK);
1784 
1785   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1786   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + aimm);
1787   set_flags_for_add32 (cpu, value1, aimm);
1788 }
1789 
1790 /* 64 bit add immediate set flags.  */
1791 static void
1792 adds64 (sim_cpu *cpu, uint32_t aimm)
1793 {
1794   unsigned rn = INSTR (9, 5);
1795   unsigned rd = INSTR (4, 0);
1796   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1797   uint64_t value2 = aimm;
1798 
1799   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1800   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1801   set_flags_for_add64 (cpu, value1, value2);
1802 }
1803 
1804 /* 32 bit sub immediate.  */
1805 static void
1806 sub32 (sim_cpu *cpu, uint32_t aimm)
1807 {
1808   unsigned rn = INSTR (9, 5);
1809   unsigned rd = INSTR (4, 0);
1810 
1811   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1812   aarch64_set_reg_u64 (cpu, rd, SP_OK,
1813 		       aarch64_get_reg_u32 (cpu, rn, SP_OK) - aimm);
1814 }
1815 
1816 /* 64 bit sub immediate.  */
1817 static void
1818 sub64 (sim_cpu *cpu, uint32_t aimm)
1819 {
1820   unsigned rn = INSTR (9, 5);
1821   unsigned rd = INSTR (4, 0);
1822 
1823   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1824   aarch64_set_reg_u64 (cpu, rd, SP_OK,
1825 		       aarch64_get_reg_u64 (cpu, rn, SP_OK) - aimm);
1826 }
1827 
1828 /* 32 bit sub immediate set flags.  */
1829 static void
1830 subs32 (sim_cpu *cpu, uint32_t aimm)
1831 {
1832   unsigned rn = INSTR (9, 5);
1833   unsigned rd = INSTR (4, 0);
1834   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
1835   uint32_t value2 = aimm;
1836 
1837   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1838   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
1839   set_flags_for_sub32 (cpu, value1, value2);
1840 }
1841 
1842 /* 64 bit sub immediate set flags.  */
1843 static void
1844 subs64 (sim_cpu *cpu, uint32_t aimm)
1845 {
1846   unsigned rn = INSTR (9, 5);
1847   unsigned rd = INSTR (4, 0);
1848   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1849   uint64_t value2 = aimm;
1850 
1851   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1852   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
1853   set_flags_for_sub64 (cpu, value1, value2);
1854 }
1855 
1856 /* Data Processing Register.  */
1857 
1858 /* First two helpers to perform the shift operations.  */
1859 
1860 static inline uint32_t
1861 shifted32 (uint32_t value, Shift shift, uint32_t count)
1862 {
1863   switch (shift)
1864     {
1865     default:
1866     case LSL:
1867       return (value << count);
1868     case LSR:
1869       return (value >> count);
1870     case ASR:
1871       {
1872 	int32_t svalue = value;
1873 	return (svalue >> count);
1874       }
1875     case ROR:
1876       {
1877 	uint32_t top = value >> count;
1878 	uint32_t bottom = count ? value << (32 - count) : 0; /* A shift by 32 is undefined.  */
1879 	return (bottom | top);
1880       }
1881     }
1882 }
1883 
1884 static inline uint64_t
1885 shifted64 (uint64_t value, Shift shift, uint32_t count)
1886 {
1887   switch (shift)
1888     {
1889     default:
1890     case LSL:
1891       return (value << count);
1892     case LSR:
1893       return (value >> count);
1894     case ASR:
1895       {
1896 	int64_t svalue = value;
1897 	return (svalue >> count);
1898       }
1899     case ROR:
1900       {
1901 	uint64_t top = value >> count;
1902 	uint64_t bottom = count ? value << (64 - count) : 0; /* A shift by 64 is undefined.  */
1903 	return (bottom | top);
1904       }
1905     }
1906 }
1907 
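/* Example (illustrative): shifted32 (0x80000001, ROR, 1) computes
   top = 0x40000000 and bottom = 0x80000000, returning 0xC0000000.  */
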
1908 /* Arithmetic shifted register.
1909    These allow an optional LSL, ASR or LSR to the second source
1910    register with a count up to the register bit count.
1911 
1912    N.B register args may not be SP.  */
1913 
1914 /* 32 bit ADD shifted register.  */
1915 static void
1916 add32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1917 {
1918   unsigned rm = INSTR (20, 16);
1919   unsigned rn = INSTR (9, 5);
1920   unsigned rd = INSTR (4, 0);
1921 
1922   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1923   aarch64_set_reg_u64 (cpu, rd, NO_SP,
1924 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
1925 		       + shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1926 				    shift, count));
1927 }
1928 
1929 /* 64 bit ADD shifted register.  */
1930 static void
1931 add64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1932 {
1933   unsigned rm = INSTR (20, 16);
1934   unsigned rn = INSTR (9, 5);
1935   unsigned rd = INSTR (4, 0);
1936 
1937   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1938   aarch64_set_reg_u64 (cpu, rd, NO_SP,
1939 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
1940 		       + shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
1941 				    shift, count));
1942 }
1943 
1944 /* 32 bit ADD shifted register setting flags.  */
1945 static void
1946 adds32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1947 {
1948   unsigned rm = INSTR (20, 16);
1949   unsigned rn = INSTR (9, 5);
1950   unsigned rd = INSTR (4, 0);
1951 
1952   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
1953   uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1954 			       shift, count);
1955 
1956   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1957   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1958   set_flags_for_add32 (cpu, value1, value2);
1959 }
1960 
1961 /* 64 bit ADD shifted register setting flags.  */
1962 static void
1963 adds64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1964 {
1965   unsigned rm = INSTR (20, 16);
1966   unsigned rn = INSTR (9, 5);
1967   unsigned rd = INSTR (4, 0);
1968 
1969   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
1970   uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
1971 			       shift, count);
1972 
1973   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1974   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1975   set_flags_for_add64 (cpu, value1, value2);
1976 }
1977 
1978 /* 32 bit SUB shifted register.  */
1979 static void
1980 sub32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1981 {
1982   unsigned rm = INSTR (20, 16);
1983   unsigned rn = INSTR (9, 5);
1984   unsigned rd = INSTR (4, 0);
1985 
1986   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1987   aarch64_set_reg_u64 (cpu, rd, NO_SP,
1988 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
1989 		       - shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1990 				    shift, count));
1991 }
1992 
1993 /* 64 bit SUB shifted register.  */
1994 static void
1995 sub64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1996 {
1997   unsigned rm = INSTR (20, 16);
1998   unsigned rn = INSTR (9, 5);
1999   unsigned rd = INSTR (4, 0);
2000 
2001   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2002   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2003 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
2004 		       - shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
2005 				    shift, count));
2006 }
2007 
2008 /* 32 bit SUB shifted register setting flags.  */
2009 static void
2010 subs32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
2011 {
2012   unsigned rm = INSTR (20, 16);
2013   unsigned rn = INSTR (9, 5);
2014   unsigned rd = INSTR (4, 0);
2015 
2016   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2017   uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
2018 			      shift, count);
2019 
2020   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2021   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2022   set_flags_for_sub32 (cpu, value1, value2);
2023 }
2024 
2025 /* 64 bit SUB shifted register setting flags.  */
2026 static void
2027 subs64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
2028 {
2029   unsigned rm = INSTR (20, 16);
2030   unsigned rn = INSTR (9, 5);
2031   unsigned rd = INSTR (4, 0);
2032 
2033   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2034   uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
2035 			       shift, count);
2036 
2037   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2038   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2039   set_flags_for_sub64 (cpu, value1, value2);
2040 }
2041 
2042 /* First a couple more helpers to fetch the
2043    relevant source register element either
2044    sign or zero extended as required by the
2045    extension value.  */
2046 
2047 static uint32_t
2048 extreg32 (sim_cpu *cpu, unsigned int lo, Extension extension)
2049 {
2050   switch (extension)
2051     {
2052     case UXTB: return aarch64_get_reg_u8  (cpu, lo, NO_SP);
2053     case UXTH: return aarch64_get_reg_u16 (cpu, lo, NO_SP);
2054     case UXTW: /* Fall through.  */
2055     case UXTX: return aarch64_get_reg_u32 (cpu, lo, NO_SP);
2056     case SXTB: return aarch64_get_reg_s8  (cpu, lo, NO_SP);
2057     case SXTH: return aarch64_get_reg_s16 (cpu, lo, NO_SP);
2058     case SXTW: /* Fall through.  */
2059     case SXTX: /* Fall through.  */
2060     default:   return aarch64_get_reg_s32 (cpu, lo, NO_SP);
2061     }
2062 }
2063 
2064 static uint64_t
2065 extreg64 (sim_cpu *cpu, unsigned int lo, Extension extension)
2066 {
2067   switch (extension)
2068     {
2069     case UXTB: return aarch64_get_reg_u8  (cpu, lo, NO_SP);
2070     case UXTH: return aarch64_get_reg_u16 (cpu, lo, NO_SP);
2071     case UXTW: return aarch64_get_reg_u32 (cpu, lo, NO_SP);
2072     case UXTX: return aarch64_get_reg_u64 (cpu, lo, NO_SP);
2073     case SXTB: return aarch64_get_reg_s8  (cpu, lo, NO_SP);
2074     case SXTH: return aarch64_get_reg_s16 (cpu, lo, NO_SP);
2075     case SXTW: return aarch64_get_reg_s32 (cpu, lo, NO_SP);
2076     case SXTX:
2077     default:   return aarch64_get_reg_s64 (cpu, lo, NO_SP);
2078     }
2079 }
2080 
2081 /* Arithmetic extending register
2082    These allow an optional sign extension of some portion of the
2083    second source register followed by an optional left shift of
2084    between 0 and 4 bits.
2085 
2086    N.B. output (dest) and first input arg (source) may normally be Xn
2087    or SP. However, for flag setting operations dest can only be
2088    Xn. Second input registers are always Xn.  */
2089 
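/* For example, "ADD W0, W1, W2, UXTB #2" extracts the low byte of W2
   via extreg32 and shifts it left by two before the addition.  */
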
2090 /* 32 bit ADD extending register.  */
2091 static void
2092 add32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2093 {
2094   unsigned rm = INSTR (20, 16);
2095   unsigned rn = INSTR (9, 5);
2096   unsigned rd = INSTR (4, 0);
2097 
2098   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2099   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2100 		       aarch64_get_reg_u32 (cpu, rn, SP_OK)
2101 		       + (extreg32 (cpu, rm, extension) << shift));
2102 }
2103 
2104 /* 64 bit ADD extending register.
2105    N.B. This subsumes the case with 64 bit source2 and UXTX #n or LSL #0.  */
2106 static void
2107 add64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2108 {
2109   unsigned rm = INSTR (20, 16);
2110   unsigned rn = INSTR (9, 5);
2111   unsigned rd = INSTR (4, 0);
2112 
2113   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2114   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2115 		       aarch64_get_reg_u64 (cpu, rn, SP_OK)
2116 		       + (extreg64 (cpu, rm, extension) << shift));
2117 }
2118 
2119 /* 32 bit ADD extending register setting flags.  */
2120 static void
2121 adds32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2122 {
2123   unsigned rm = INSTR (20, 16);
2124   unsigned rn = INSTR (9, 5);
2125   unsigned rd = INSTR (4, 0);
2126 
2127   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
2128   uint32_t value2 = extreg32 (cpu, rm, extension) << shift;
2129 
2130   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2131   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
2132   set_flags_for_add32 (cpu, value1, value2);
2133 }
2134 
2135 /* 64 bit ADD extending register setting flags.  */
2136 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0.  */
2137 static void
2138 adds64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2139 {
2140   unsigned rm = INSTR (20, 16);
2141   unsigned rn = INSTR (9, 5);
2142   unsigned rd = INSTR (4, 0);
2143 
2144   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
2145   uint64_t value2 = extreg64 (cpu, rm, extension) << shift;
2146 
2147   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2148   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
2149   set_flags_for_add64 (cpu, value1, value2);
2150 }
2151 
2152 /* 32 bit SUB extending register.  */
2153 static void
2154 sub32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2155 {
2156   unsigned rm = INSTR (20, 16);
2157   unsigned rn = INSTR (9, 5);
2158   unsigned rd = INSTR (4, 0);
2159 
2160   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2161   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2162 		       aarch64_get_reg_u32 (cpu, rn, SP_OK)
2163 		       - (extreg32 (cpu, rm, extension) << shift));
2164 }
2165 
2166 /* 64 bit SUB extending register.  */
2167 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0.  */
2168 static void
2169 sub64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2170 {
2171   unsigned rm = INSTR (20, 16);
2172   unsigned rn = INSTR (9, 5);
2173   unsigned rd = INSTR (4, 0);
2174 
2175   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2176   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2177 		       aarch64_get_reg_u64 (cpu, rn, SP_OK)
2178 		       - (extreg64 (cpu, rm, extension) << shift));
2179 }
2180 
2181 /* 32 bit SUB extending register setting flags.  */
2182 static void
2183 subs32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2184 {
2185   unsigned rm = INSTR (20, 16);
2186   unsigned rn = INSTR (9, 5);
2187   unsigned rd = INSTR (4, 0);
2188 
2189   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
2190   uint32_t value2 = extreg32 (cpu, rm, extension) << shift;
2191 
2192   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2193   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2194   set_flags_for_sub32 (cpu, value1, value2);
2195 }
2196 
2197 /* 64 bit SUB extending register setting flags.  */
2198 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0.  */
2199 static void
2200 subs64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2201 {
2202   unsigned rm = INSTR (20, 16);
2203   unsigned rn = INSTR (9, 5);
2204   unsigned rd = INSTR (4, 0);
2205 
2206   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
2207   uint64_t value2 = extreg64 (cpu, rm, extension) << shift;
2208 
2209   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2210   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2211   set_flags_for_sub64 (cpu, value1, value2);
2212 }
2213 
2214 static void
2215 dexAddSubtractImmediate (sim_cpu *cpu)
2216 {
2217   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2218      instr[30]    = op : 0 ==> ADD, 1 ==> SUB
2219      instr[29]    = set : 0 ==> no flags, 1 ==> set flags
2220      instr[28,24] = 10001
2221      instr[23,22] = shift : 00 ==> LSL#0, 01 ==> LSL#12, 1x ==> UNALLOC
2222      instr[21,10] = uimm12
2223      instr[9,5]   = Rn
2224      instr[4,0]   = Rd  */
2225 
2226   /* N.B. the shift is applied at decode before calling the add/sub routine.  */
2227   uint32_t shift = INSTR (23, 22);
2228   uint32_t imm = INSTR (21, 10);
2229   uint32_t dispatch = INSTR (31, 29);
2230 
2231   NYI_assert (28, 24, 0x11);
2232 
2233   if (shift > 1)
2234     HALT_UNALLOC;
2235 
2236   if (shift)
2237     imm <<= 12;
2238 
2239   switch (dispatch)
2240     {
2241     case 0: add32 (cpu, imm); break;
2242     case 1: adds32 (cpu, imm); break;
2243     case 2: sub32 (cpu, imm); break;
2244     case 3: subs32 (cpu, imm); break;
2245     case 4: add64 (cpu, imm); break;
2246     case 5: adds64 (cpu, imm); break;
2247     case 6: sub64 (cpu, imm); break;
2248     case 7: subs64 (cpu, imm); break;
2249     }
2250 }
2251 
2252 static void
2253 dexAddSubtractShiftedRegister (sim_cpu *cpu)
2254 {
2255   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2256      instr[30,29] = op : 00 ==> ADD, 01 ==> ADDS, 10 ==> SUB, 11 ==> SUBS
2257      instr[28,24] = 01011
2258      instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> UNALLOC
2259      instr[21]    = 0
2260      instr[20,16] = Rm
2261      instr[15,10] = count : must be 0xxxxx for 32 bit
2262      instr[9,5]   = Rn
2263      instr[4,0]   = Rd  */
2264 
2265   uint32_t size = INSTR (31, 31);
2266   uint32_t count = INSTR (15, 10);
2267   Shift shiftType = INSTR (23, 22);
2268 
2269   NYI_assert (28, 24, 0x0B);
2270   NYI_assert (21, 21, 0);
2271 
2272   /* Shift encoded as ROR is unallocated.  */
2273   if (shiftType == ROR)
2274     HALT_UNALLOC;
2275 
2276   /* 32 bit operations must have count[5] = 0
2277      or else we have an UNALLOC.  */
2278   if (size == 0 && uimm (count, 5, 5))
2279     HALT_UNALLOC;
2280 
2281   /* Dispatch on size:op i.e instr [31,29].  */
2282   switch (INSTR (31, 29))
2283     {
2284     case 0: add32_shift  (cpu, shiftType, count); break;
2285     case 1: adds32_shift (cpu, shiftType, count); break;
2286     case 2: sub32_shift  (cpu, shiftType, count); break;
2287     case 3: subs32_shift (cpu, shiftType, count); break;
2288     case 4: add64_shift  (cpu, shiftType, count); break;
2289     case 5: adds64_shift (cpu, shiftType, count); break;
2290     case 6: sub64_shift  (cpu, shiftType, count); break;
2291     case 7: subs64_shift (cpu, shiftType, count); break;
2292     }
2293 }
2294 
2295 static void
2296 dexAddSubtractExtendedRegister (sim_cpu *cpu)
2297 {
2298   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2299      instr[30]    = op : 0 ==> ADD, 1 ==> SUB
2300      instr[29]    = set? : 0 ==> no flags, 1 ==> set flags
2301      instr[28,24] = 01011
2302      instr[23,22] = opt : 0 ==> ok, 1,2,3 ==> UNALLOC
2303      instr[21]    = 1
2304      instr[20,16] = Rm
2305      instr[15,13] = option : 000 ==> UXTB, 001 ==> UXTH,
2306                              010 ==> UXTW, 011 ==> UXTX|LSL,
2307                              100 ==> SXTB, 101 ==> SXTH,
2308                              110 ==> SXTW, 111 ==> SXTX,
2309      instr[12,10] = shift : 0,1,2,3,4 ==> ok, 5,6,7 ==> UNALLOC
2310      instr[9,5]   = Rn
2311      instr[4,0]   = Rd  */
2312 
2313   Extension extensionType = INSTR (15, 13);
2314   uint32_t shift = INSTR (12, 10);
2315 
2316   NYI_assert (28, 24, 0x0B);
2317   NYI_assert (21, 21, 1);
2318 
2319   /* Shift may not exceed 4.  */
2320   if (shift > 4)
2321     HALT_UNALLOC;
2322 
2323   /* Dispatch on size:op:set?.  */
2324   switch (INSTR (31, 29))
2325     {
2326     case 0: add32_ext  (cpu, extensionType, shift); break;
2327     case 1: adds32_ext (cpu, extensionType, shift); break;
2328     case 2: sub32_ext  (cpu, extensionType, shift); break;
2329     case 3: subs32_ext (cpu, extensionType, shift); break;
2330     case 4: add64_ext  (cpu, extensionType, shift); break;
2331     case 5: adds64_ext (cpu, extensionType, shift); break;
2332     case 6: sub64_ext  (cpu, extensionType, shift); break;
2333     case 7: subs64_ext (cpu, extensionType, shift); break;
2334     }
2335 }
2336 
2337 /* Conditional data processing
2338    Condition register is implicit 3rd source.  */
2339 
2340 /* 32 bit add with carry.  */
2341 /* N.B register args may not be SP.  */
2342 
2343 static void
2344 adc32 (sim_cpu *cpu)
2345 {
2346   unsigned rm = INSTR (20, 16);
2347   unsigned rn = INSTR (9, 5);
2348   unsigned rd = INSTR (4, 0);
2349 
2350   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2351   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2352 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
2353 		       + aarch64_get_reg_u32 (cpu, rm, NO_SP)
2354 		       + IS_SET (C));
2355 }
2356 
2357 /* 64 bit add with carry.  */
2358 static void
2359 adc64 (sim_cpu *cpu)
2360 {
2361   unsigned rm = INSTR (20, 16);
2362   unsigned rn = INSTR (9, 5);
2363   unsigned rd = INSTR (4, 0);
2364 
2365   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2366   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2367 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
2368 		       + aarch64_get_reg_u64 (cpu, rm, NO_SP)
2369 		       + IS_SET (C));
2370 }
2371 
2372 /* 32 bit add with carry setting flags.  */
2373 static void
2374 adcs32 (sim_cpu *cpu)
2375 {
2376   unsigned rm = INSTR (20, 16);
2377   unsigned rn = INSTR (9, 5);
2378   unsigned rd = INSTR (4, 0);
2379 
2380   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2381   uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
2382   uint32_t carry = IS_SET (C);
2383 
2384   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2385   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry);
2386   set_flags_for_add32 (cpu, value1, value2 + carry);
2387 }
2388 
2389 /* 64 bit add with carry setting flags.  */
2390 static void
2391 adcs64 (sim_cpu *cpu)
2392 {
2393   unsigned rm = INSTR (20, 16);
2394   unsigned rn = INSTR (9, 5);
2395   unsigned rd = INSTR (4, 0);
2396 
2397   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2398   uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
2399   uint64_t carry = IS_SET (C);
2400 
2401   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2402   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry);
2403   set_flags_for_add64 (cpu, value1, value2 + carry);
2404 }
2405 
2406 /* 32 bit sub with carry.  */
2407 static void
2408 sbc32 (sim_cpu *cpu)
2409 {
2410   unsigned rm = INSTR (20, 16);
2411   unsigned rn = INSTR (9, 5); /* ngc iff rn == 31.  */
2412   unsigned rd = INSTR (4, 0);
2413 
2414   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2415   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2416 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
2417 		       - aarch64_get_reg_u32 (cpu, rm, NO_SP)
2418 		       - 1 + IS_SET (C));
2419 }
2420 
2421 /* 64 bit sub with carry.  */
2422 static void
2423 sbc64 (sim_cpu *cpu)
2424 {
2425   unsigned rm = INSTR (20, 16);
2426   unsigned rn = INSTR (9, 5);
2427   unsigned rd = INSTR (4, 0);
2428 
2429   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2430   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2431 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
2432 		       - aarch64_get_reg_u64 (cpu, rm, NO_SP)
2433 		       - 1 + IS_SET (C));
2434 }
2435 
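/* N.B. with C set (no borrow pending) the "- 1 + IS_SET (C)" term
   above vanishes and SBC behaves as a plain subtract; with C clear it
   subtracts one extra, matching the AArch64 convention that C == 0
   after a subtraction indicates a borrow.  */
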
2436 /* 32 bit sub with carry setting flags.  */
2437 static void
2438 sbcs32 (sim_cpu *cpu)
2439 {
2440   unsigned rm = INSTR (20, 16);
2441   unsigned rn = INSTR (9, 5);
2442   unsigned rd = INSTR (4, 0);
2443 
2444   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2445   uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
2446   uint32_t carry  = IS_SET (C);
2447   uint32_t result = value1 - value2 - 1 + carry;
2448 
2449   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2450   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
2451   set_flags_for_sub32 (cpu, value1, value2 + 1 - carry);
2452 }
2453 
2454 /* 64 bit sub with carry setting flags.  */
2455 static void
2456 sbcs64 (sim_cpu *cpu)
2457 {
2458   unsigned rm = INSTR (20, 16);
2459   unsigned rn = INSTR (9, 5);
2460   unsigned rd = INSTR (4, 0);
2461 
2462   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2463   uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
2464   uint64_t carry  = IS_SET (C);
2465   uint64_t result = value1 - value2 - 1 + carry;
2466 
2467   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2468   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
2469   set_flags_for_sub64 (cpu, value1, value2 + 1 - carry);
2470 }
2471 
2472 static void
2473 dexAddSubtractWithCarry (sim_cpu *cpu)
2474 {
2475   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2476      instr[30]    = op : 0 ==> ADC, 1 ==> SBC
2477      instr[29]    = set? : 0 ==> no flags, 1 ==> set flags
2478      instr[28,21] = 1 1010 000
2479      instr[20,16] = Rm
2480      instr[15,10] = op2 : 000000 ==> ok, otherwise ==> UNALLOC
2481      instr[9,5]   = Rn
2482      instr[4,0]   = Rd  */
2483 
2484   uint32_t op2 = INSTR (15, 10);
2485 
2486   NYI_assert (28, 21, 0xD0);
2487 
2488   if (op2 != 0)
2489     HALT_UNALLOC;
2490 
2491   /* Dispatch on size:op:set?.  */
2492   switch (INSTR (31, 29))
2493     {
2494     case 0: adc32 (cpu); break;
2495     case 1: adcs32 (cpu); break;
2496     case 2: sbc32 (cpu); break;
2497     case 3: sbcs32 (cpu); break;
2498     case 4: adc64 (cpu); break;
2499     case 5: adcs64 (cpu); break;
2500     case 6: sbc64 (cpu); break;
2501     case 7: sbcs64 (cpu); break;
2502     }
2503 }
2504 
2505 static uint32_t
2506 testConditionCode (sim_cpu *cpu, CondCode cc)
2507 {
2508   /* This should be reducible to branchless logic
2509      by some careful testing of bits in CC followed
2510      by the requisite masking and combining of bits
2511      from the flag register.
2512 
2513      For now we do it with a switch.  */
2514   int res;
2515 
2516   switch (cc)
2517     {
2518     case EQ:  res = IS_SET (Z);    break;
2519     case NE:  res = IS_CLEAR (Z);  break;
2520     case CS:  res = IS_SET (C);    break;
2521     case CC:  res = IS_CLEAR (C);  break;
2522     case MI:  res = IS_SET (N);    break;
2523     case PL:  res = IS_CLEAR (N);  break;
2524     case VS:  res = IS_SET (V);    break;
2525     case VC:  res = IS_CLEAR (V);  break;
2526     case HI:  res = IS_SET (C) && IS_CLEAR (Z);  break;
2527     case LS:  res = IS_CLEAR (C) || IS_SET (Z);  break;
2528     case GE:  res = IS_SET (N) == IS_SET (V);    break;
2529     case LT:  res = IS_SET (N) != IS_SET (V);    break;
2530     case GT:  res = IS_CLEAR (Z) && (IS_SET (N) == IS_SET (V));  break;
2531     case LE:  res = IS_SET (Z) || (IS_SET (N) != IS_SET (V));    break;
2532     case AL:
2533     case NV:
2534     default:
2535       res = 1;
2536       break;
2537     }
2538   return res;
2539 }
2540 
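/* E.g. after comparing equal values Z is set and N == V, so EQ, GE
   and LE all hold in testConditionCode while NE, LT and GT do not.  */
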
2541 static void
2542 CondCompare (sim_cpu *cpu) /* aka: ccmp and ccmn  */
2543 {
2544   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2545      instr[30]    = compare with positive (1) or negative value (0)
2546      instr[29,21] = 1 1101 0010
2547      instr[20,16] = Rm or const
2548      instr[15,12] = cond
2549      instr[11]    = compare reg (0) or const (1)
2550      instr[10]    = 0
2551      instr[9,5]   = Rn
2552      instr[4]     = 0
2553      instr[3,0]   = value for CPSR bits if the comparison does not take place.  */
2554   signed int negate;
2555   unsigned rm;
2556   unsigned rn;
2557 
2558   NYI_assert (29, 21, 0x1d2);
2559   NYI_assert (10, 10, 0);
2560   NYI_assert (4, 4, 0);
2561 
2562   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2563   if (! testConditionCode (cpu, INSTR (15, 12)))
2564     {
2565       aarch64_set_CPSR (cpu, INSTR (3, 0));
2566       return;
2567     }
2568 
2569   negate = INSTR (30, 30) ? 1 : -1;
2570   rm = INSTR (20, 16);
2571   rn = INSTR ( 9,  5);
2572 
2573   if (INSTR (31, 31))
2574     {
2575       if (INSTR (11, 11))
2576 	set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK),
2577 			     negate * (uint64_t) rm);
2578       else
2579 	set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK),
2580 			     negate * aarch64_get_reg_u64 (cpu, rm, SP_OK));
2581     }
2582   else
2583     {
2584       if (INSTR (11, 11))
2585 	set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK),
2586 			     negate * rm);
2587       else
2588 	set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK),
2589 			     negate * aarch64_get_reg_u32 (cpu, rm, SP_OK));
2590     }
2591 }
2592 
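/* Example (illustrative): "CCMP X1, #5, #0, EQ" only performs the
   compare (set_flags_for_sub64 of X1 and 5) when Z is set; otherwise
   the CPSR is simply loaded with the literal NZCV value 0.  */
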
2593 static void
2594 do_vec_MOV_whole_vector (sim_cpu *cpu)
2595 {
2596   /* MOV Vd.T, Vs.T  (alias for ORR Vd.T, Vn.T, Vm.T where Vn == Vm)
2597 
2598      instr[31]    = 0
2599      instr[30]    = half(0)/full(1)
2600      instr[29,21] = 001110101
2601      instr[20,16] = Vs
2602      instr[15,10] = 000111
2603      instr[9,5]   = Vs
2604      instr[4,0]   = Vd  */
2605 
2606   unsigned vs = INSTR (9, 5);
2607   unsigned vd = INSTR (4, 0);
2608 
2609   NYI_assert (29, 21, 0x075);
2610   NYI_assert (15, 10, 0x07);
2611 
2612   if (INSTR (20, 16) != vs)
2613     HALT_NYI;
2614 
2615   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2616   if (INSTR (30, 30))
2617     aarch64_set_vec_u64 (cpu, vd, 1, aarch64_get_vec_u64 (cpu, vs, 1));
2618 
2619   aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vs, 0));
2620 }
2621 
2622 static void
2623 do_vec_SMOV_into_scalar (sim_cpu *cpu)
2624 {
2625   /* instr[31]    = 0
2626      instr[30]    = word(0)/long(1)
2627      instr[29,21] = 00 1110 000
2628      instr[20,16] = element size and index
2629      instr[15,10] = 00 0010 11
2630      instr[9,5]   = V source
2631      instr[4,0]   = R dest  */
2632 
2633   unsigned vs = INSTR (9, 5);
2634   unsigned rd = INSTR (4, 0);
2635   unsigned imm5 = INSTR (20, 16);
2636   unsigned full = INSTR (30, 30);
2637   int size, index;
2638 
2639   NYI_assert (29, 21, 0x070);
2640   NYI_assert (15, 10, 0x0B);
2641 
2642   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2643 
2644   if (imm5 & 0x1)
2645     {
2646       size = 0;
2647       index = (imm5 >> 1) & 0xF;
2648     }
2649   else if (imm5 & 0x2)
2650     {
2651       size = 1;
2652       index = (imm5 >> 2) & 0x7;
2653     }
2654   else if (full && (imm5 & 0x4))
2655     {
2656       size = 2;
2657       index = (imm5 >> 3) & 0x3;
2658     }
2659   else
2660     HALT_UNALLOC;
2661 
2662   switch (size)
2663     {
2664     case 0:
2665       if (full)
2666 	aarch64_set_reg_s64 (cpu, rd, NO_SP,
2667 			     aarch64_get_vec_s8 (cpu, vs, index));
2668       else
2669 	aarch64_set_reg_s32 (cpu, rd, NO_SP,
2670 			     aarch64_get_vec_s8 (cpu, vs, index));
2671       break;
2672 
2673     case 1:
2674       if (full)
2675 	aarch64_set_reg_s64 (cpu, rd, NO_SP,
2676 			     aarch64_get_vec_s16 (cpu, vs, index));
2677       else
2678 	aarch64_set_reg_s32 (cpu, rd, NO_SP,
2679 			     aarch64_get_vec_s16 (cpu, vs, index));
2680       break;
2681 
2682     case 2:
2683       aarch64_set_reg_s64 (cpu, rd, NO_SP,
2684 			   aarch64_get_vec_s32 (cpu, vs, index));
2685       break;
2686 
2687     default:
2688       HALT_UNALLOC;
2689     }
2690 }
2691 
2692 static void
2693 do_vec_UMOV_into_scalar (sim_cpu *cpu)
2694 {
2695   /* instr[31]    = 0
2696      instr[30]    = word(0)/long(1)
2697      instr[29,21] = 00 1110 000
2698      instr[20,16] = element size and index
2699      instr[15,10] = 00 0011 11
2700      instr[9,5]   = V source
2701      instr[4,0]   = R dest  */
2702 
2703   unsigned vs = INSTR (9, 5);
2704   unsigned rd = INSTR (4, 0);
2705   unsigned imm5 = INSTR (20, 16);
2706   unsigned full = INSTR (30, 30);
2707   int size, index;
2708 
2709   NYI_assert (29, 21, 0x070);
2710   NYI_assert (15, 10, 0x0F);
2711 
2712   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2713 
2714   if (!full)
2715     {
2716       if (imm5 & 0x1)
2717 	{
2718 	  size = 0;
2719 	  index = (imm5 >> 1) & 0xF;
2720 	}
2721       else if (imm5 & 0x2)
2722 	{
2723 	  size = 1;
2724 	  index = (imm5 >> 2) & 0x7;
2725 	}
2726       else if (imm5 & 0x4)
2727 	{
2728 	  size = 2;
2729 	  index = (imm5 >> 3) & 0x3;
2730 	}
2731       else
2732 	HALT_UNALLOC;
2733     }
2734   else if (imm5 & 0x8)
2735     {
2736       size = 3;
2737       index = (imm5 >> 4) & 0x1;
2738     }
2739   else
2740     HALT_UNALLOC;
2741 
2742   switch (size)
2743     {
2744     case 0:
2745       aarch64_set_reg_u32 (cpu, rd, NO_SP,
2746 			   aarch64_get_vec_u8 (cpu, vs, index));
2747       break;
2748 
2749     case 1:
2750       aarch64_set_reg_u32 (cpu, rd, NO_SP,
2751 			   aarch64_get_vec_u16 (cpu, vs, index));
2752       break;
2753 
2754     case 2:
2755       aarch64_set_reg_u32 (cpu, rd, NO_SP,
2756 			   aarch64_get_vec_u32 (cpu, vs, index));
2757       break;
2758 
2759     case 3:
2760       aarch64_set_reg_u64 (cpu, rd, NO_SP,
2761 			   aarch64_get_vec_u64 (cpu, vs, index));
2762       break;
2763 
2764     default:
2765       HALT_UNALLOC;
2766     }
2767 }
2768 
2769 static void
2770 do_vec_INS (sim_cpu *cpu)
2771 {
2772   /* instr[31,21] = 01001110000
2773      instr[20,16] = element size and index
2774      instr[15,10] = 000111
2775      instr[9,5]   = W source
2776      instr[4,0]   = V dest  */
2777 
2778   int index;
2779   unsigned rs = INSTR (9, 5);
2780   unsigned vd = INSTR (4, 0);
2781 
2782   NYI_assert (31, 21, 0x270);
2783   NYI_assert (15, 10, 0x07);
2784 
2785   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2786   if (INSTR (16, 16))
2787     {
2788       index = INSTR (20, 17);
2789       aarch64_set_vec_u8 (cpu, vd, index,
2790 			  aarch64_get_reg_u8 (cpu, rs, NO_SP));
2791     }
2792   else if (INSTR (17, 17))
2793     {
2794       index = INSTR (20, 18);
2795       aarch64_set_vec_u16 (cpu, vd, index,
2796 			   aarch64_get_reg_u16 (cpu, rs, NO_SP));
2797     }
2798   else if (INSTR (18, 18))
2799     {
2800       index = INSTR (20, 19);
2801       aarch64_set_vec_u32 (cpu, vd, index,
2802 			   aarch64_get_reg_u32 (cpu, rs, NO_SP));
2803     }
2804   else if (INSTR (19, 19))
2805     {
2806       index = INSTR (20, 20);
2807       aarch64_set_vec_u64 (cpu, vd, index,
2808 			   aarch64_get_reg_u64 (cpu, rs, NO_SP));
2809     }
2810   else
2811     HALT_NYI;
2812 }
2813 
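/* The imm5 field used by the INS/DUP/MOV group encodes both size and
   index: the lowest set bit selects the element size and the bits
   above it form the index, so e.g. imm5 = 0b00110 selects 16-bit
   elements with index 1.  */
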
2814 static void
2815 do_vec_DUP_vector_into_vector (sim_cpu *cpu)
2816 {
2817   /* instr[31]    = 0
2818      instr[30]    = half(0)/full(1)
2819      instr[29,21] = 00 1110 000
2820      instr[20,16] = element size and index
2821      instr[15,10] = 0000 01
2822      instr[9,5]   = V source
2823      instr[4,0]   = V dest.  */
2824 
2825   unsigned full = INSTR (30, 30);
2826   unsigned vs = INSTR (9, 5);
2827   unsigned vd = INSTR (4, 0);
2828   int i, index;
2829 
2830   NYI_assert (29, 21, 0x070);
2831   NYI_assert (15, 10, 0x01);
2832 
2833   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2834   if (INSTR (16, 16))
2835     {
2836       index = INSTR (20, 17);
2837 
2838       for (i = 0; i < (full ? 16 : 8); i++)
2839 	aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vs, index));
2840     }
2841   else if (INSTR (17, 17))
2842     {
2843       index = INSTR (20, 18);
2844 
2845       for (i = 0; i < (full ? 8 : 4); i++)
2846 	aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vs, index));
2847     }
2848   else if (INSTR (18, 18))
2849     {
2850       index = INSTR (20, 19);
2851 
2852       for (i = 0; i < (full ? 4 : 2); i++)
2853 	aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vs, index));
2854     }
2855   else
2856     {
2857       if (INSTR (19, 19) == 0)
2858 	HALT_UNALLOC;
2859 
2860       if (! full)
2861 	HALT_UNALLOC;
2862 
2863       index = INSTR (20, 20);
2864 
2865       for (i = 0; i < 2; i++)
2866 	aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vs, index));
2867     }
2868 }
2869 
2870 static void
2871 do_vec_TBL (sim_cpu *cpu)
2872 {
2873   /* instr[31]    = 0
2874      instr[30]    = half(0)/full(1)
2875      instr[29,21] = 00 1110 000
2876      instr[20,16] = Vm
2877      instr[15]    = 0
2878      instr[14,13] = vec length
2879      instr[12,10] = 000
2880      instr[9,5]   = V start
2881      instr[4,0]   = V dest  */
2882 
2883   int full    = INSTR (30, 30);
2884   int len     = INSTR (14, 13) + 1;
2885   unsigned vm = INSTR (20, 16);
2886   unsigned vn = INSTR (9, 5);
2887   unsigned vd = INSTR (4, 0);
2888   unsigned i;
2889 
2890   NYI_assert (29, 21, 0x070);
2891   NYI_assert (12, 10, 0);
2892 
2893   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2894   for (i = 0; i < (full ? 16 : 8); i++)
2895     {
2896       unsigned int selector = aarch64_get_vec_u8 (cpu, vm, i);
2897       uint8_t val;
2898 
2899       if (selector < 16)
2900 	val = aarch64_get_vec_u8 (cpu, vn, selector);
2901       else if (selector < 32)
2902 	val = len < 2 ? 0 : aarch64_get_vec_u8 (cpu, vn + 1, selector - 16);
2903       else if (selector < 48)
2904 	val = len < 3 ? 0 : aarch64_get_vec_u8 (cpu, vn + 2, selector - 32);
2905       else if (selector < 64)
2906 	val = len < 4 ? 0 : aarch64_get_vec_u8 (cpu, vn + 3, selector - 48);
2907       else
2908 	val = 0;
2909 
2910       aarch64_set_vec_u8 (cpu, vd, i, val);
2911     }
2912 }
2913 
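/* Example (illustrative): with len == 2 a selector byte of 17 in Vm
   fetches byte 1 of register vn + 1, while any selector >= 32 writes
   a zero into that destination lane.  */
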
2914 static void
2915 do_vec_TRN (sim_cpu *cpu)
2916 {
2917   /* instr[31]    = 0
2918      instr[30]    = half(0)/full(1)
2919      instr[29,24] = 00 1110
2920      instr[23,22] = size
2921      instr[21]    = 0
2922      instr[20,16] = Vm
2923      instr[15]    = 0
2924      instr[14]    = TRN1 (0) / TRN2 (1)
2925      instr[13,10] = 1010
2926      instr[9,5]   = V source
2927      instr[4,0]   = V dest.  */
2928 
2929   int full    = INSTR (30, 30);
2930   int second  = INSTR (14, 14);
2931   unsigned vm = INSTR (20, 16);
2932   unsigned vn = INSTR (9, 5);
2933   unsigned vd = INSTR (4, 0);
2934   unsigned i;
2935 
2936   NYI_assert (29, 24, 0x0E);
2937   NYI_assert (13, 10, 0xA);
2938 
2939   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2940   switch (INSTR (23, 22))
2941     {
2942     case 0:
2943       for (i = 0; i < (full ? 8 : 4); i++)
2944 	{
2945 	  aarch64_set_vec_u8
2946 	    (cpu, vd, i * 2,
2947 	     aarch64_get_vec_u8 (cpu, second ? vm : vn, i * 2));
2948 	  aarch64_set_vec_u8
2949 	    (cpu, vd, i * 2 + 1,
2950 	     aarch64_get_vec_u8 (cpu, second ? vn : vm, i * 2 + 1));
2951 	}
2952       break;
2953 
2954     case 1:
2955       for (i = 0; i < (full ? 4 : 2); i++)
2956 	{
2957 	  aarch64_set_vec_u16
2958 	    (cpu, vd, i * 2,
2959 	     aarch64_get_vec_u16 (cpu, second ? vm : vn, i * 2));
2960 	  aarch64_set_vec_u16
2961 	    (cpu, vd, i * 2 + 1,
2962 	     aarch64_get_vec_u16 (cpu, second ? vn : vm, i * 2 + 1));
2963 	}
2964       break;
2965 
2966     case 2:
2967       aarch64_set_vec_u32
2968 	(cpu, vd, 0, aarch64_get_vec_u32 (cpu, second ? vm : vn, 0));
2969       aarch64_set_vec_u32
2970 	(cpu, vd, 1, aarch64_get_vec_u32 (cpu, second ? vn : vm, 1));
2971       aarch64_set_vec_u32
2972 	(cpu, vd, 2, aarch64_get_vec_u32 (cpu, second ? vm : vn, 2));
2973       aarch64_set_vec_u32
2974 	(cpu, vd, 3, aarch64_get_vec_u32 (cpu, second ? vn : vm, 3));
2975       break;
2976 
2977     case 3:
2978       if (! full)
2979 	HALT_UNALLOC;
2980 
2981       aarch64_set_vec_u64 (cpu, vd, 0,
2982 			   aarch64_get_vec_u64 (cpu, second ? vm : vn, 0));
2983       aarch64_set_vec_u64 (cpu, vd, 1,
2984 			   aarch64_get_vec_u64 (cpu, second ? vn : vm, 1));
2985       break;
2986     }
2987 }
2988 
2989 static void
2990 do_vec_DUP_scalar_into_vector (sim_cpu *cpu)
2991 {
2992   /* instr[31]    = 0
2993      instr[30]    = 0=> zero top 64-bits, 1=> duplicate into top 64-bits
2994                     [must be 1 for 64-bit xfer]
2995      instr[29,20] = 00 1110 0000
2996      instr[19,16] = element size: 0001 => 8-bits, 0010 => 16-bits,
2997                                   0100 => 32-bits, 1000 => 64-bits
2998      instr[15,10] = 0000 11
2999      instr[9,5]   = W source
3000      instr[4,0]   = V dest.  */
3001 
3002   unsigned i;
3003   unsigned Vd = INSTR (4, 0);
3004   unsigned Rs = INSTR (9, 5);
3005   int both    = INSTR (30, 30);
3006 
3007   NYI_assert (29, 20, 0x0E0);
3008   NYI_assert (15, 10, 0x03);
3009 
3010   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3011   switch (INSTR (19, 16))
3012     {
3013     case 1:
3014       for (i = 0; i < (both ? 16 : 8); i++)
3015 	aarch64_set_vec_u8 (cpu, Vd, i, aarch64_get_reg_u8 (cpu, Rs, NO_SP));
3016       break;
3017 
3018     case 2:
3019       for (i = 0; i < (both ? 8 : 4); i++)
3020 	aarch64_set_vec_u16 (cpu, Vd, i, aarch64_get_reg_u16 (cpu, Rs, NO_SP));
3021       break;
3022 
3023     case 4:
3024       for (i = 0; i < (both ? 4 : 2); i++)
3025 	aarch64_set_vec_u32 (cpu, Vd, i, aarch64_get_reg_u32 (cpu, Rs, NO_SP));
3026       break;
3027 
3028     case 8:
3029       if (!both)
3030 	HALT_NYI;
3031       aarch64_set_vec_u64 (cpu, Vd, 0, aarch64_get_reg_u64 (cpu, Rs, NO_SP));
3032       aarch64_set_vec_u64 (cpu, Vd, 1, aarch64_get_reg_u64 (cpu, Rs, NO_SP));
3033       break;
3034 
3035     default:
3036       HALT_NYI;
3037     }
3038 }
3039 
3040 static void
3041 do_vec_UZP (sim_cpu *cpu)
3042 {
3043   /* instr[31]    = 0
3044      instr[30]    = half(0)/full(1)
3045      instr[29,24] = 00 1110
3046      instr[23,22] = size: byte(00), half(01), word (10), long (11)
3047      instr[21]    = 0
3048      instr[20,16] = Vm
3049      instr[15]    = 0
3050      instr[14]    = lower (0) / upper (1)
3051      instr[13,10] = 0110
3052      instr[9,5]   = Vn
3053      instr[4,0]   = Vd.  */
3054 
3055   int full = INSTR (30, 30);
3056   int upper = INSTR (14, 14);
3057 
3058   unsigned vm = INSTR (20, 16);
3059   unsigned vn = INSTR (9, 5);
3060   unsigned vd = INSTR (4, 0);
3061 
3062   uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
3063   uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
3064   uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
3065   uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
3066 
3067   uint64_t val1;
3068   uint64_t val2;
3069 
3070   uint64_t input2 = full ? val_n2 : val_m1;
3071 
3072   NYI_assert (29, 24, 0x0E);
3073   NYI_assert (21, 21, 0);
3074   NYI_assert (15, 15, 0);
3075   NYI_assert (13, 10, 6);
3076 
3077   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3078   switch (INSTR (23, 22))
3079     {
3080     case 0:
3081       val1 = (val_n1 >> (upper * 8)) & 0xFFULL;
3082       val1 |= (val_n1 >> ((upper * 8) + 8)) & 0xFF00ULL;
3083       val1 |= (val_n1 >> ((upper * 8) + 16)) & 0xFF0000ULL;
3084       val1 |= (val_n1 >> ((upper * 8) + 24)) & 0xFF000000ULL;
3085 
3086       val1 |= (input2 << (32 - (upper * 8))) & 0xFF00000000ULL;
3087       val1 |= (input2 << (24 - (upper * 8))) & 0xFF0000000000ULL;
3088       val1 |= (input2 << (16 - (upper * 8))) & 0xFF000000000000ULL;
3089       val1 |= (input2 << (8 - (upper * 8))) & 0xFF00000000000000ULL;
3090 
3091       if (full)
3092 	{
3093 	  val2 = (val_m1 >> (upper * 8)) & 0xFFULL;
3094 	  val2 |= (val_m1 >> ((upper * 8) + 8)) & 0xFF00ULL;
3095 	  val2 |= (val_m1 >> ((upper * 8) + 16)) & 0xFF0000ULL;
3096 	  val2 |= (val_m1 >> ((upper * 8) + 24)) & 0xFF000000ULL;
3097 
3098 	  val2 |= (val_m2 << (32 - (upper * 8))) & 0xFF00000000ULL;
3099 	  val2 |= (val_m2 << (24 - (upper * 8))) & 0xFF0000000000ULL;
3100 	  val2 |= (val_m2 << (16 - (upper * 8))) & 0xFF000000000000ULL;
3101 	  val2 |= (val_m2 << (8 - (upper * 8))) & 0xFF00000000000000ULL;
3102 	}
3103       break;
3104 
3105     case 1:
3106       val1 = (val_n1 >> (upper * 16)) & 0xFFFFULL;
3107       val1 |= (val_n1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL;
3108 
3109       val1 |= (input2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;
3110       val1 |= (input2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL;
3111 
3112       if (full)
3113 	{
3114 	  val2 = (val_m1 >> (upper * 16)) & 0xFFFFULL;
3115 	  val2 |= (val_m1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL;
3116 
3117 	  val2 |= (val_m2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;
3118 	  val2 |= (val_m2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL;
3119 	}
3120       break;
3121 
3122     case 2:
3123       val1 = (val_n1 >> (upper * 32)) & 0xFFFFFFFF;
3124       val1 |= (input2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL;
3125 
3126       if (full)
3127 	{
3128 	  val2 = (val_m1 >> (upper * 32)) & 0xFFFFFFFF;
3129 	  val2 |= (val_m2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL;
3130 	}
3131       break;
3132 
3133     case 3:
3134       if (! full)
3135 	HALT_UNALLOC;
3136 
3137       val1 = upper ? val_n2 : val_n1;
3138       val2 = upper ? val_m2 : val_m1;
3139       break;
3140     }
3141 
3142   aarch64_set_vec_u64 (cpu, vd, 0, val1);
3143   if (full)
3144     aarch64_set_vec_u64 (cpu, vd, 1, val2);
3145 }
3146 
3147 static void
3148 do_vec_ZIP (sim_cpu *cpu)
3149 {
3150   /* instr[31]    = 0
3151      instr[30]    = half(0)/full(1)
3152      instr[29,24] = 00 1110
3153      instr[23,22] = size: byte(00), half(01), word (10), long (11)
3154      instr[21]    = 0
3155      instr[20,16] = Vm
3156      instr[15]    = 0
3157      instr[14]    = lower (0) / upper (1)
3158      instr[13,10] = 1110
3159      instr[9,5]   = Vn
3160      instr[4,0]   = Vd.  */
3161 
3162   int full = INSTR (30, 30);
3163   int upper = INSTR (14, 14);
3164 
3165   unsigned vm = INSTR (20, 16);
3166   unsigned vn = INSTR (9, 5);
3167   unsigned vd = INSTR (4, 0);
3168 
3169   uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
3170   uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
3171   uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
3172   uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
3173 
3174   uint64_t val1 = 0;
3175   uint64_t val2 = 0;
3176 
3177   uint64_t input1 = upper ? val_n2 : val_n1;	/* Even lanes come from Vn.  */
3178   uint64_t input2 = upper ? val_m2 : val_m1;	/* Odd lanes come from Vm.  */
3179 
3180   NYI_assert (29, 24, 0x0E);
3181   NYI_assert (21, 21, 0);
3182   NYI_assert (15, 15, 0);
3183   NYI_assert (13, 10, 0xE);
3184 
3185   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3186   switch (INSTR (23, 22))
3187     {
3188     case 0:
3189       val1 =
3190 	  ((input1 <<  0) & (0xFF    <<  0))
3191 	| ((input2 <<  8) & (0xFF    <<  8))
3192 	| ((input1 <<  8) & (0xFF    << 16))
3193 	| ((input2 << 16) & (0xFFULL << 24))
3194 	| ((input1 << 16) & (0xFFULL << 32))
3195 	| ((input2 << 24) & (0xFFULL << 40))
3196 	| ((input1 << 24) & (0xFFULL << 48))
3197 	| ((input2 << 32) & (0xFFULL << 56));
3198 
3199       val2 =
3200 	  ((input1 >> 32) & (0xFF    <<  0))
3201 	| ((input2 >> 24) & (0xFF    <<  8))
3202 	| ((input1 >> 24) & (0xFF    << 16))
3203 	| ((input2 >> 16) & (0xFFULL << 24))
3204 	| ((input1 >> 16) & (0xFFULL << 32))
3205 	| ((input2 >>  8) & (0xFFULL << 40))
3206 	| ((input1 >>  8) & (0xFFULL << 48))
3207 	| ((input2 >>  0) & (0xFFULL << 56));
3208       break;
3209 
3210     case 1:
3211       val1 =
3212 	  ((input1 <<  0) & (0xFFFF    <<  0))
3213 	| ((input2 << 16) & (0xFFFFULL << 16))
3214 	| ((input1 << 16) & (0xFFFFULL << 32))
3215 	| ((input2 << 32) & (0xFFFFULL << 48));
3216 
3217       val2 =
3218 	  ((input1 >> 32) & (0xFFFF    <<  0))
3219 	| ((input2 >> 16) & (0xFFFFULL << 16))
3220 	| ((input1 >> 16) & (0xFFFFULL << 32))
3221 	| ((input2 >>  0) & (0xFFFFULL << 48));
3222       break;
3223 
3224     case 2:
3225       val1 = (input1 & 0xFFFFFFFFULL) | (input2 << 32);
3226       val2 = (input1 >> 32) | (input2 & 0xFFFFFFFF00000000ULL);
3227       break;
3228 
3229     case 3:
3230       val1 = input1;
3231       val2 = input2;
3232       break;
3233     }
3234 
3235   aarch64_set_vec_u64 (cpu, vd, 0, val1);
3236   if (full)
3237     aarch64_set_vec_u64 (cpu, vd, 1, val2);
3238 }
3239 
3240 /* Floating point immediates are encoded in 8 bits.
3241    fpimm[7] = sign bit.
3242    fpimm[6:4] = signed exponent.
3243    fpimm[3:0] = fraction (assuming leading 1).
3244    i.e. F = s * 1.f * 2^(e - b).  */
3245 
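/* Worked example (illustrative): imm8 = 0x70 gives s = 0, e = 7,
   f = 0, so u starts at 16/16 = 1.0 and, e being >= 4, is divided by
   two eneg = 0 times, yielding +1.0, the encoding used by
   "FMOV S0, #1.0".  */
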
3246 static float
3247 fp_immediate_for_encoding_32 (uint32_t imm8)
3248 {
3249   float u;
3250   uint32_t s, e, f, i;
3251 
3252   s = (imm8 >> 7) & 0x1;
3253   e = (imm8 >> 4) & 0x7;
3254   f = imm8 & 0xf;
3255 
3256   /* The fp value is s * n/16 * 2^r where n is 16+f.  */
3257   u = (16.0 + f) / 16.0;
3258 
3259   /* N.B. exponent is signed.  */
3260   if (e < 4)
3261     {
3262       int epos = e;
3263 
3264       for (i = 0; i <= epos; i++)
3265 	u *= 2.0;
3266     }
3267   else
3268     {
3269       int eneg = 7 - e;
3270 
3271       for (i = 0; i < eneg; i++)
3272 	u /= 2.0;
3273     }
3274 
3275   if (s)
3276     u = - u;
3277 
3278   return u;
3279 }
3280 
3281 static double
3282 fp_immediate_for_encoding_64 (uint32_t imm8)
3283 {
3284   double u;
3285   uint32_t s, e, f, i;
3286 
3287   s = (imm8 >> 7) & 0x1;
3288   e = (imm8 >> 4) & 0x7;
3289   f = imm8 & 0xf;
3290 
3291   /* The fp value is s * n/16 * 2^r where n is 16+f.  */
3292   u = (16.0 + f) / 16.0;
3293 
3294   /* N.B. exponent is signed.  */
3295   if (e < 4)
3296     {
3297       int epos = e;
3298 
3299       for (i = 0; i <= epos; i++)
3300 	u *= 2.0;
3301     }
3302   else
3303     {
3304       int eneg = 7 - e;
3305 
3306       for (i = 0; i < eneg; i++)
3307 	u /= 2.0;
3308     }
3309 
3310   if (s)
3311     u = - u;
3312 
3313   return u;
3314 }
3315 
3316 static void
3317 do_vec_MOV_immediate (sim_cpu *cpu)
3318 {
3319   /* instr[31]    = 0
3320      instr[30]    = full/half selector
3321      instr[29,19] = 00111100000
3322      instr[18,16] = high 3 bits of uimm8
3323      instr[15,12] = size & shift:
3324                                   0000 => 32-bit
3325                                   0010 => 32-bit + LSL#8
3326                                   0100 => 32-bit + LSL#16
3327                                   0110 => 32-bit + LSL#24
3328                                   1010 => 16-bit + LSL#8
3329                                   1000 => 16-bit
3330                                   1101 => 32-bit + MSL#16
3331                                   1100 => 32-bit + MSL#8
3332                                   1110 => 8-bit
3333                                   1111 => double
3334      instr[11,10] = 01
3335      instr[9,5]   = low 5-bits of uimm8
3336      instr[4,0]   = Vd.  */
3337 
3338   int full     = INSTR (30, 30);
3339   unsigned vd  = INSTR (4, 0);
3340   unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
3341   unsigned i;
3342 
3343   NYI_assert (29, 19, 0x1E0);
3344   NYI_assert (11, 10, 1);
3345 
3346   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3347   switch (INSTR (15, 12))
3348     {
3349     case 0x0: /* 32-bit, no shift.  */
3350     case 0x2: /* 32-bit, shift by 8.  */
3351     case 0x4: /* 32-bit, shift by 16.  */
3352     case 0x6: /* 32-bit, shift by 24.  */
3353       val <<= (8 * INSTR (14, 13));
3354       for (i = 0; i < (full ? 4 : 2); i++)
3355 	aarch64_set_vec_u32 (cpu, vd, i, val);
3356       break;
3357 
3358     case 0xa: /* 16-bit, shift by 8.  */
3359       val <<= 8;
3360       /* Fall through.  */
3361     case 0x8: /* 16-bit, no shift.  */
3362       for (i = 0; i < (full ? 8 : 4); i++)
3363 	aarch64_set_vec_u16 (cpu, vd, i, val);
3364       break;
3365 
3366     case 0xd: /* 32-bit, mask shift by 16.  */
3367       val <<= 8;
3368       val |= 0xFF;
3369       /* Fall through.  */
3370     case 0xc: /* 32-bit, mask shift by 8. */
3371       val <<= 8;
3372       val |= 0xFF;
3373       for (i = 0; i < (full ? 4 : 2); i++)
3374 	aarch64_set_vec_u32 (cpu, vd, i, val);
3375       break;
3376 
3377     case 0xe: /* 8-bit, no shift.  */
3378       for (i = 0; i < (full ? 16 : 8); i++)
3379 	aarch64_set_vec_u8 (cpu, vd, i, val);
3380       break;
3381 
3382     case 0xf: /* FMOV Vs.{2|4}S, #fpimm.  */
3383       {
3384 	float u = fp_immediate_for_encoding_32 (val);
3385 	for (i = 0; i < (full ? 4 : 2); i++)
3386 	  aarch64_set_vec_float (cpu, vd, i, u);
3387 	break;
3388       }
3389 
3390     default:
3391       HALT_NYI;
3392     }
3393 }
3394 
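/* Example (illustrative): for the MSL#16 form (selector 0xd) above,
   an immediate byte of 0x12 becomes 0x12FFFF; each of the two
   fall-through steps shifts by eight and fills the vacated byte with
   ones.  */
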
3395 static void
3396 do_vec_MVNI (sim_cpu *cpu)
3397 {
3398   /* instr[31]    = 0
3399      instr[30]    = full/half selector
3400      instr[29,19] = 10111100000
3401      instr[18,16] = high 3 bits of uimm8
3402      instr[15,12] = selector
3403      instr[11,10] = 01
3404      instr[9,5]   = low 5-bits of uimm8
3405      instr[4,0]   = Vd.  */
3406 
3407   int full     = INSTR (30, 30);
3408   unsigned vd  = INSTR (4, 0);
3409   unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
3410   unsigned i;
3411 
3412   NYI_assert (29, 19, 0x5E0);
3413   NYI_assert (11, 10, 1);
3414 
3415   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3416   switch (INSTR (15, 12))
3417     {
3418     case 0x0: /* 32-bit, no shift.  */
3419     case 0x2: /* 32-bit, shift by 8.  */
3420     case 0x4: /* 32-bit, shift by 16.  */
3421     case 0x6: /* 32-bit, shift by 24.  */
3422       val <<= (8 * INSTR (14, 13));
3423       val = ~ val;
3424       for (i = 0; i < (full ? 4 : 2); i++)
3425 	aarch64_set_vec_u32 (cpu, vd, i, val);
3426       return;
3427 
3428     case 0xa: /* 16-bit, shift by 8.  */
3429       val <<= 8;		/* Fall through.  */
3430     case 0x8: /* 16-bit, no shift.  */
3431       val = ~ val;
3432       for (i = 0; i < (full ? 8 : 4); i++)
3433 	aarch64_set_vec_u16 (cpu, vd, i, val);
3434       return;
3435 
3436     case 0xd: /* 32-bit, mask shift by 16.  */
3437       val <<= 8;
3438       val |= 0xFF;		/* Fall through.  */
3439     case 0xc: /* 32-bit, mask shift by 8.  */
3440       val <<= 8;
3441       val |= 0xFF;
3442       val = ~ val;
3443       for (i = 0; i < (full ? 4 : 2); i++)
3444 	aarch64_set_vec_u32 (cpu, vd, i, val);
3445       return;
3446 
3447     case 0xe: /* MOVI Dn, #mask64.  */
3448       {
3449 	uint64_t mask = 0;
3450 
3451 	for (i = 0; i < 8; i++)
3452 	  if (val & (1 << i))
3453 	    mask |= (0xFFULL << (i * 8));
3454 	aarch64_set_vec_u64 (cpu, vd, 0, mask);
3455 	aarch64_set_vec_u64 (cpu, vd, 1, mask);
3456 	return;
3457       }
3458 
3459     case 0xf: /* FMOV Vd.2D, #fpimm.  */
3460       {
3461 	double u = fp_immediate_for_encoding_64 (val);
3462 
3463 	if (! full)
3464 	  HALT_UNALLOC;
3465 
3466 	aarch64_set_vec_double (cpu, vd, 0, u);
3467 	aarch64_set_vec_double (cpu, vd, 1, u);
3468 	return;
3469       }
3470 
3471     default:
3472       HALT_NYI;
3473     }
3474 }
3475 
3476 #define ABS(A) ((A) < 0 ? - (A) : (A))
3477 
3478 static void
3479 do_vec_ABS (sim_cpu *cpu)
3480 {
3481   /* instr[31]    = 0
3482      instr[30]    = half(0)/full(1)
3483      instr[29,24] = 00 1110
3484      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3485      instr[21,10] = 10 0000 1011 10
3486      instr[9,5]   = Vn
3487      instr[4.0]   = Vd.  */
3488 
3489   unsigned vn = INSTR (9, 5);
3490   unsigned vd = INSTR (4, 0);
3491   unsigned full = INSTR (30, 30);
3492   unsigned i;
3493 
3494   NYI_assert (29, 24, 0x0E);
3495   NYI_assert (21, 10, 0x82E);
3496 
3497   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3498   switch (INSTR (23, 22))
3499     {
3500     case 0:
3501       for (i = 0; i < (full ? 16 : 8); i++)
3502 	aarch64_set_vec_s8 (cpu, vd, i,
3503 			    ABS (aarch64_get_vec_s8 (cpu, vn, i)));
3504       break;
3505 
3506     case 1:
3507       for (i = 0; i < (full ? 8 : 4); i++)
3508 	aarch64_set_vec_s16 (cpu, vd, i,
3509 			     ABS (aarch64_get_vec_s16 (cpu, vn, i)));
3510       break;
3511 
3512     case 2:
3513       for (i = 0; i < (full ? 4 : 2); i++)
3514 	aarch64_set_vec_s32 (cpu, vd, i,
3515 			     ABS (aarch64_get_vec_s32 (cpu, vn, i)));
3516       break;
3517 
3518     case 3:
3519       if (! full)
3520 	HALT_NYI;
3521       for (i = 0; i < 2; i++)
3522 	aarch64_set_vec_s64 (cpu, vd, i,
3523 			     ABS (aarch64_get_vec_s64 (cpu, vn, i)));
3524       break;
3525     }
3526 }
3527 
3528 static void
3529 do_vec_ADDV (sim_cpu *cpu)
3530 {
3531   /* instr[31]    = 0
3532      instr[30]    = full/half selector
3533      instr[29,24] = 00 1110
3534      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3535      instr[21,10] = 11 0001 1011 10
3536      instr[9,5]   = Vm
3537      instr[4,0]   = Rd.  */
3538 
3539   unsigned vm = INSTR (9, 5);
3540   unsigned rd = INSTR (4, 0);
3541   unsigned i;
3542   int      full = INSTR (30, 30);
3543 
3544   NYI_assert (29, 24, 0x0E);
3545   NYI_assert (21, 10, 0xC6E);
3546 
3547   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3548   switch (INSTR (23, 22))
3549     {
3550     case 0:
3551       {
3552 	uint8_t val = 0;
3553 	for (i = 0; i < (full ? 16 : 8); i++)
3554 	  val += aarch64_get_vec_u8 (cpu, vm, i);
3555 	aarch64_set_vec_u64 (cpu, rd, 0, val);
3556 	return;
3557       }
3558 
3559     case 1:
3560       {
3561 	uint16_t val = 0;
3562 	for (i = 0; i < (full ? 8 : 4); i++)
3563 	  val += aarch64_get_vec_u16 (cpu, vm, i);
3564 	aarch64_set_vec_u64 (cpu, rd, 0, val);
3565 	return;
3566       }
3567 
3568     case 2:
3569       {
3570 	uint32_t val = 0;
3571 	if (! full)
3572 	  HALT_UNALLOC;
3573 	for (i = 0; i < 4; i++)
3574 	  val += aarch64_get_vec_u32 (cpu, vm, i);
3575 	aarch64_set_vec_u64 (cpu, rd, 0, val);
3576 	return;
3577       }
3578 
3579     case 3:
3580       HALT_UNALLOC;
3581     }
3582 }
3583 
3584 static void
3585 do_vec_ins_2 (sim_cpu *cpu)
3586 {
3587   /* instr[31,21] = 01001110000
3588      instr[20,18] = size & element selector
3589      instr[17,14] = 0000
3590      instr[13]    = direction: to vec(0), from vec (1)
3591      instr[12,10] = 111
3592      instr[9,5]   = Vm
3593      instr[4,0]   = Vd.  */
3594 
3595   unsigned elem;
3596   unsigned vm = INSTR (9, 5);
3597   unsigned vd = INSTR (4, 0);
3598 
3599   NYI_assert (31, 21, 0x270);
3600   NYI_assert (17, 14, 0);
3601   NYI_assert (12, 10, 7);
3602 
3603   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3604   if (INSTR (13, 13) == 1)
3605     {
3606       if (INSTR (18, 18) == 1)
3607 	{
3608 	  /* 32-bit moves.  */
3609 	  elem = INSTR (20, 19);
3610 	  aarch64_set_reg_u64 (cpu, vd, NO_SP,
3611 			       aarch64_get_vec_u32 (cpu, vm, elem));
3612 	}
3613       else
3614 	{
3615 	  /* 64-bit moves.  */
3616 	  if (INSTR (19, 19) != 1)
3617 	    HALT_NYI;
3618 
3619 	  elem = INSTR (20, 20);
3620 	  aarch64_set_reg_u64 (cpu, vd, NO_SP,
3621 			       aarch64_get_vec_u64 (cpu, vm, elem));
3622 	}
3623     }
3624   else
3625     {
3626       if (INSTR (18, 18) == 1)
3627 	{
3628 	  /* 32-bit moves.  */
3629 	  elem = INSTR (20, 19);
3630 	  aarch64_set_vec_u32 (cpu, vd, elem,
3631 			       aarch64_get_reg_u32 (cpu, vm, NO_SP));
3632 	}
3633       else
3634 	{
3635 	  /* 64-bit moves.  */
3636 	  if (INSTR (19, 19) != 1)
3637 	    HALT_NYI;
3638 
3639 	  elem = INSTR (20, 20);
3640 	  aarch64_set_vec_u64 (cpu, vd, elem,
3641 			       aarch64_get_reg_u64 (cpu, vm, NO_SP));
3642 	}
3643     }
3644 }
3645 
3646 #define DO_VEC_WIDENING_MUL(N, DST_TYPE, READ_TYPE, WRITE_TYPE)	  \
3647   do								  \
3648     {								  \
3649       DST_TYPE a[N], b[N];					  \
3650 								  \
3651       for (i = 0; i < (N); i++)					  \
3652 	{							  \
3653 	  a[i] = aarch64_get_vec_##READ_TYPE (cpu, vn, i + bias); \
3654 	  b[i] = aarch64_get_vec_##READ_TYPE (cpu, vm, i + bias); \
3655 	}							  \
3656       for (i = 0; i < (N); i++)					  \
3657 	aarch64_set_vec_##WRITE_TYPE (cpu, vd, i, a[i] * b[i]);	  \
3658     }								  \
3659   while (0)
3660 
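/* DO_VEC_WIDENING_MUL buffers every source lane before writing any
   result lane, so an expansion stays correct when Vd aliases Vn or Vm.
   A standalone sketch of the (8, int16_t, s8, s16) expansion, with the
   lanes modelled as plain arrays (vn, vm, vd and bias here are
   stand-ins for the simulator's accessors; illustration only):  */
#if 0
#include <stdint.h>

static void
widening_mul_s8_to_s16 (const int8_t *vn, const int8_t *vm,
			int16_t *vd, int bias)
{
  int16_t a[8], b[8];
  int i;

  /* Read phase: widen both operand vectors into temporaries.  */
  for (i = 0; i < 8; i++)
    {
      a[i] = vn[i + bias];
      b[i] = vm[i + bias];
    }
  /* Write phase: the products can no longer clobber unread inputs.  */
  for (i = 0; i < 8; i++)
    vd[i] = (int16_t) (a[i] * b[i]);
}
#endif
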
3661 static void
3662 do_vec_mull (sim_cpu *cpu)
3663 {
3664   /* instr[31]    = 0
3665      instr[30]    = lower(0)/upper(1) selector
3666      instr[29]    = signed(0)/unsigned(1)
3667      instr[28,24] = 0 1110
3668      instr[23,22] = size: 8-bit (00), 16-bit (01), 32-bit (10)
3669      instr[21]    = 1
3670      instr[20,16] = Vm
3671      instr[15,10] = 11 0000
3672      instr[9,5]   = Vn
3673      instr[4,0]   = Vd.  */
3674 
3675   int    unsign = INSTR (29, 29);
3676   int    bias = INSTR (30, 30);
3677   unsigned vm = INSTR (20, 16);
3678   unsigned vn = INSTR ( 9,  5);
3679   unsigned vd = INSTR ( 4,  0);
3680   unsigned i;
3681 
3682   NYI_assert (28, 24, 0x0E);
3683   NYI_assert (15, 10, 0x30);
3684 
3685   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3686   /* NB: Read source values before writing results, in case
3687      the source and destination vectors are the same.  */
3688   switch (INSTR (23, 22))
3689     {
3690     case 0:
3691       if (bias)
3692 	bias = 8;
3693       if (unsign)
3694 	DO_VEC_WIDENING_MUL (8, uint16_t, u8, u16);
3695       else
3696 	DO_VEC_WIDENING_MUL (8, int16_t, s8, s16);
3697       return;
3698 
3699     case 1:
3700       if (bias)
3701 	bias = 4;
3702       if (unsign)
3703 	DO_VEC_WIDENING_MUL (4, uint32_t, u16, u32);
3704       else
3705 	DO_VEC_WIDENING_MUL (4, int32_t, s16, s32);
3706       return;
3707 
3708     case 2:
3709       if (bias)
3710 	bias = 2;
3711       if (unsign)
3712 	DO_VEC_WIDENING_MUL (2, uint64_t, u32, u64);
3713       else
3714 	DO_VEC_WIDENING_MUL (2, int64_t, s32, s64);
3715       return;
3716 
3717     case 3:
3718       HALT_NYI;
3719     }
3720 }
3721 
3722 static void
3723 do_vec_fadd (sim_cpu *cpu)
3724 {
3725   /* instr[31]    = 0
3726      instr[30]    = half(0)/full(1)
3727      instr[29,24] = 001110
3728      instr[23]    = FADD(0)/FSUB(1)
3729      instr[22]    = float (0)/double(1)
3730      instr[21]    = 1
3731      instr[20,16] = Vm
3732      instr[15,10] = 110101
3733      instr[9,5]   = Vn
3734      instr[4,0]   = Vd.  */
3735 
3736   unsigned vm = INSTR (20, 16);
3737   unsigned vn = INSTR (9, 5);
3738   unsigned vd = INSTR (4, 0);
3739   unsigned i;
3740   int      full = INSTR (30, 30);
3741 
3742   NYI_assert (29, 24, 0x0E);
3743   NYI_assert (21, 21, 1);
3744   NYI_assert (15, 10, 0x35);
3745 
3746   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3747   if (INSTR (23, 23))
3748     {
3749       if (INSTR (22, 22))
3750 	{
3751 	  if (! full)
3752 	    HALT_NYI;
3753 
3754 	  for (i = 0; i < 2; i++)
3755 	    aarch64_set_vec_double (cpu, vd, i,
3756 				    aarch64_get_vec_double (cpu, vn, i)
3757 				    - aarch64_get_vec_double (cpu, vm, i));
3758 	}
3759       else
3760 	{
3761 	  for (i = 0; i < (full ? 4 : 2); i++)
3762 	    aarch64_set_vec_float (cpu, vd, i,
3763 				   aarch64_get_vec_float (cpu, vn, i)
3764 				   - aarch64_get_vec_float (cpu, vm, i));
3765 	}
3766     }
3767   else
3768     {
3769       if (INSTR (22, 22))
3770 	{
3771 	  if (! full)
3772 	    HALT_NYI;
3773 
3774 	  for (i = 0; i < 2; i++)
3775 	    aarch64_set_vec_double (cpu, vd, i,
3776 				    aarch64_get_vec_double (cpu, vm, i)
3777 				    + aarch64_get_vec_double (cpu, vn, i));
3778 	}
3779       else
3780 	{
3781 	  for (i = 0; i < (full ? 4 : 2); i++)
3782 	    aarch64_set_vec_float (cpu, vd, i,
3783 				   aarch64_get_vec_float (cpu, vm, i)
3784 				   + aarch64_get_vec_float (cpu, vn, i));
3785 	}
3786     }
3787 }
3788 
3789 static void
3790 do_vec_add (sim_cpu *cpu)
3791 {
3792   /* instr[31]    = 0
3793      instr[30]    = full/half selector
3794      instr[29,24] = 001110
3795      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3796      instr[21]    = 1
3797      instr[20,16] = Vm
3798      instr[15,10] = 100001
3799      instr[9,5]   = Vn
3800      instr[4,0]   = Vd.  */
3801 
3802   unsigned vm = INSTR (20, 16);
3803   unsigned vn = INSTR (9, 5);
3804   unsigned vd = INSTR (4, 0);
3805   unsigned i;
3806   int      full = INSTR (30, 30);
3807 
3808   NYI_assert (29, 24, 0x0E);
3809   NYI_assert (21, 21, 1);
3810   NYI_assert (15, 10, 0x21);
3811 
3812   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3813   switch (INSTR (23, 22))
3814     {
3815     case 0:
3816       for (i = 0; i < (full ? 16 : 8); i++)
3817 	aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
3818 			    + aarch64_get_vec_u8 (cpu, vm, i));
3819       return;
3820 
3821     case 1:
3822       for (i = 0; i < (full ? 8 : 4); i++)
3823 	aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
3824 			     + aarch64_get_vec_u16 (cpu, vm, i));
3825       return;
3826 
3827     case 2:
3828       for (i = 0; i < (full ? 4 : 2); i++)
3829 	aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
3830 			     + aarch64_get_vec_u32 (cpu, vm, i));
3831       return;
3832 
3833     case 3:
3834       if (! full)
3835 	HALT_UNALLOC;
3836       aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vn, 0)
3837 			   + aarch64_get_vec_u64 (cpu, vm, 0));
3838       aarch64_set_vec_u64 (cpu, vd, 1,
3839 			   aarch64_get_vec_u64 (cpu, vn, 1)
3840 			   + aarch64_get_vec_u64 (cpu, vm, 1));
3841       return;
3842     }
3843 }
3844 
3845 static void
3846 do_vec_mul (sim_cpu *cpu)
3847 {
3848   /* instr[31]    = 0
3849      instr[30]    = full/half selector
3850      instr[29,24] = 00 1110
3851      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
3852      instr[21]    = 1
3853      instr[20,16] = Vm
3854      instr[15,10] = 10 0111
3855      instr[9,5]   = Vn
3856      instr[4,0]   = Vd.  */
3857 
3858   unsigned vm = INSTR (20, 16);
3859   unsigned vn = INSTR (9, 5);
3860   unsigned vd = INSTR (4, 0);
3861   unsigned i;
3862   int      full = INSTR (30, 30);
3863   int      bias = 0;
3864 
3865   NYI_assert (29, 24, 0x0E);
3866   NYI_assert (21, 21, 1);
3867   NYI_assert (15, 10, 0x27);
3868 
3869   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3870   switch (INSTR (23, 22))
3871     {
3872     case 0:
3873       DO_VEC_WIDENING_MUL (full ? 16 : 8, uint8_t, u8, u8);
3874       return;
3875 
3876     case 1:
3877       DO_VEC_WIDENING_MUL (full ? 8 : 4, uint16_t, u16, u16);
3878       return;
3879 
3880     case 2:
3881       DO_VEC_WIDENING_MUL (full ? 4 : 2, uint32_t, u32, u32);
3882       return;
3883 
3884     case 3:
3885       HALT_UNALLOC;
3886     }
3887 }
3888 
3889 static void
3890 do_vec_MLA (sim_cpu *cpu)
3891 {
3892   /* instr[31]    = 0
3893      instr[30]    = full/half selector
3894      instr[29,24] = 00 1110
3895      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
3896      instr[21]    = 1
3897      instr[20,16] = Vm
3898      instr[15,10] = 1001 01
3899      instr[9,5]   = Vn
3900      instr[4,0]   = Vd.  */
3901 
3902   unsigned vm = INSTR (20, 16);
3903   unsigned vn = INSTR (9, 5);
3904   unsigned vd = INSTR (4, 0);
3905   unsigned i;
3906   int      full = INSTR (30, 30);
3907 
3908   NYI_assert (29, 24, 0x0E);
3909   NYI_assert (21, 21, 1);
3910   NYI_assert (15, 10, 0x25);
3911 
3912   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3913   switch (INSTR (23, 22))
3914     {
3915     case 0:
3916       for (i = 0; i < (full ? 16 : 8); i++)
3917 	aarch64_set_vec_u8 (cpu, vd, i,
3918 			    aarch64_get_vec_u8 (cpu, vd, i)
3919 			    + (aarch64_get_vec_u8 (cpu, vn, i)
3920 			       * aarch64_get_vec_u8 (cpu, vm, i)));
3921       return;
3922 
3923     case 1:
3924       for (i = 0; i < (full ? 8 : 4); i++)
3925 	aarch64_set_vec_u16 (cpu, vd, i,
3926 			     aarch64_get_vec_u16 (cpu, vd, i)
3927 			     + (aarch64_get_vec_u16 (cpu, vn, i)
3928 				* aarch64_get_vec_u16 (cpu, vm, i)));
3929       return;
3930 
3931     case 2:
3932       for (i = 0; i < (full ? 4 : 2); i++)
3933 	aarch64_set_vec_u32 (cpu, vd, i,
3934 			     aarch64_get_vec_u32 (cpu, vd, i)
3935 			     + (aarch64_get_vec_u32 (cpu, vn, i)
3936 				* aarch64_get_vec_u32 (cpu, vm, i)));
3937       return;
3938 
3939     default:
3940       HALT_UNALLOC;
3941     }
3942 }
3943 
3944 static float
3945 fmaxnm (float a, float b)
3946 {
3947   if (! isnan (a))
3948     {
3949       if (! isnan (b))
3950 	return a > b ? a : b;
3951       return a;
3952     }
3953   else if (! isnan (b))
3954     return b;
3955   return a;
3956 }
3957 
3958 static float
3959 fminnm (float a, float b)
3960 {
3961   if (! isnan (a))
3962     {
3963       if (! isnan (b))
3964 	return a < b ? a : b;
3965       return a;
3966     }
3967   else if (! isnan (b))
3968     return b;
3969   return a;
3970 }
3971 
3972 static double
3973 dmaxnm (double a, double b)
3974 {
3975   if (! isnan (a))
3976     {
3977       if (! isnan (b))
3978 	return a > b ? a : b;
3979       return a;
3980     }
3981   else if (! isnan (b))
3982     return b;
3983   return a;
3984 }
3985 
3986 static double
3987 dminnm (double a, double b)
3988 {
3989   if (! isnan (a))
3990     {
3991       if (! isnan (b))
3992 	return a < b ? a : b;
3993       return a;
3994     }
3995   else if (! isnan (b))
3996     return b;
3997   return a;
3998 }
3999 
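/* The four helpers above implement the FMAXNM/FMINNM rule: when exactly
   one operand is a NaN the other operand wins, and only NaN/NaN yields
   a NaN.  A quick standalone check (assumes the definitions above are
   in scope; illustration only):  */
#if 0
#include <assert.h>
#include <math.h>

int
main (void)
{
  assert (fmaxnm (NAN, 1.0f) == 1.0f);	/* NaN operand is ignored.  */
  assert (fminnm (2.0f, NAN) == 2.0f);
  assert (dmaxnm (-1.0, 3.0) == 3.0);	/* Ordinary max otherwise.  */
  assert (isnan (dminnm (NAN, NAN)));	/* Both NaN: NaN result.  */
  return 0;
}
#endif
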
4000 static void
4001 do_vec_FminmaxNMP (sim_cpu *cpu)
4002 {
4003   /* instr [31]    = 0
4004      instr [30]    = half (0)/full (1)
4005      instr [29,24] = 10 1110
4006      instr [23]    = max(0)/min(1)
4007      instr [22]    = float (0)/double (1)
4008      instr [21]    = 1
4009      instr [20,16] = Vm
4010      instr [15,10] = 1100 01
4011      instr [9,5]   = Vn
4012      instr [4,0]   = Vd.  */
4013 
4014   unsigned vm = INSTR (20, 16);
4015   unsigned vn = INSTR (9, 5);
4016   unsigned vd = INSTR (4, 0);
4017   int      full = INSTR (30, 30);
4018 
4019   NYI_assert (29, 24, 0x2E);
4020   NYI_assert (21, 21, 1);
4021   NYI_assert (15, 10, 0x31);
4022 
4023   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4024   if (INSTR (22, 22))
4025     {
4026       double (* fn)(double, double) = INSTR (23, 23)
4027 	? dminnm : dmaxnm;
4028 
4029       if (! full)
4030 	HALT_NYI;
4031       aarch64_set_vec_double (cpu, vd, 0,
4032 			      fn (aarch64_get_vec_double (cpu, vn, 0),
4033 				  aarch64_get_vec_double (cpu, vn, 1)));
4034       aarch64_set_vec_double (cpu, vd, 1,
4035 			      fn (aarch64_get_vec_double (cpu, vm, 0),
4036 				  aarch64_get_vec_double (cpu, vm, 1)));
4037     }
4038   else
4039     {
4040       float (* fn)(float, float) = INSTR (23, 23)
4041 	? fminnm : fmaxnm;
4042 
4043       aarch64_set_vec_float (cpu, vd, 0,
4044 			     fn (aarch64_get_vec_float (cpu, vn, 0),
4045 				 aarch64_get_vec_float (cpu, vn, 1)));
4046       if (full)
4047 	aarch64_set_vec_float (cpu, vd, 1,
4048 			       fn (aarch64_get_vec_float (cpu, vn, 2),
4049 				   aarch64_get_vec_float (cpu, vn, 3)));
4050 
4051       aarch64_set_vec_float (cpu, vd, (full ? 2 : 1),
4052 			     fn (aarch64_get_vec_float (cpu, vm, 0),
4053 				 aarch64_get_vec_float (cpu, vm, 1)));
4054       if (full)
4055 	aarch64_set_vec_float (cpu, vd, 3,
4056 			       fn (aarch64_get_vec_float (cpu, vm, 2),
4057 				   aarch64_get_vec_float (cpu, vm, 3)));
4058     }
4059 }
4060 
4061 static void
4062 do_vec_AND (sim_cpu *cpu)
4063 {
4064   /* instr[31]    = 0
4065      instr[30]    = half (0)/full (1)
4066      instr[29,21] = 001110001
4067      instr[20,16] = Vm
4068      instr[15,10] = 000111
4069      instr[9,5]   = Vn
4070      instr[4,0]   = Vd.  */
4071 
4072   unsigned vm = INSTR (20, 16);
4073   unsigned vn = INSTR (9, 5);
4074   unsigned vd = INSTR (4, 0);
4075   unsigned i;
4076   int      full = INSTR (30, 30);
4077 
4078   NYI_assert (29, 21, 0x071);
4079   NYI_assert (15, 10, 0x07);
4080 
4081   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4082   for (i = 0; i < (full ? 4 : 2); i++)
4083     aarch64_set_vec_u32 (cpu, vd, i,
4084 			 aarch64_get_vec_u32 (cpu, vn, i)
4085 			 & aarch64_get_vec_u32 (cpu, vm, i));
4086 }
4087 
4088 static void
4089 do_vec_BSL (sim_cpu *cpu)
4090 {
4091   /* instr[31]    = 0
4092      instr[30]    = half (0)/full (1)
4093      instr[29,21] = 101110011
4094      instr[20,16] = Vm
4095      instr[15,10] = 000111
4096      instr[9,5]   = Vn
4097      instr[4,0]   = Vd.  */
4098 
4099   unsigned vm = INSTR (20, 16);
4100   unsigned vn = INSTR (9, 5);
4101   unsigned vd = INSTR (4, 0);
4102   unsigned i;
4103   int      full = INSTR (30, 30);
4104 
4105   NYI_assert (29, 21, 0x173);
4106   NYI_assert (15, 10, 0x07);
4107 
4108   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4109   for (i = 0; i < (full ? 16 : 8); i++)
4110     aarch64_set_vec_u8 (cpu, vd, i,
4111 			(    aarch64_get_vec_u8 (cpu, vd, i)
4112 			   & aarch64_get_vec_u8 (cpu, vn, i))
4113 			| ((~ aarch64_get_vec_u8 (cpu, vd, i))
4114 			   & aarch64_get_vec_u8 (cpu, vm, i)));
4115 }
4116 
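/* BSL selects bit by bit: where the destination (mask) bit is set the
   result bit comes from Vn, otherwise from Vm, i.e.
   vd = (vd & vn) | (~vd & vm).  A standalone sketch for one byte lane
   (illustration only):  */
#if 0
#include <assert.h>
#include <stdint.h>

static uint8_t
bsl8 (uint8_t sel, uint8_t n, uint8_t m)
{
  return (uint8_t) ((sel & n) | ((uint8_t) ~sel & m));
}

int
main (void)
{
  /* High nibble selected from n, low nibble from m.  */
  assert (bsl8 (0xF0, 0xAB, 0xCD) == 0xAD);
  return 0;
}
#endif
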
4117 static void
4118 do_vec_EOR (sim_cpu *cpu)
4119 {
4120   /* instr[31]    = 0
4121      instr[30]    = half (0)/full (1)
4122      instr[29,21] = 10 1110 001
4123      instr[20,16] = Vm
4124      instr[15,10] = 000111
4125      instr[9,5]   = Vn
4126      instr[4,0]   = Vd.  */
4127 
4128   unsigned vm = INSTR (20, 16);
4129   unsigned vn = INSTR (9, 5);
4130   unsigned vd = INSTR (4, 0);
4131   unsigned i;
4132   int      full = INSTR (30, 30);
4133 
4134   NYI_assert (29, 21, 0x171);
4135   NYI_assert (15, 10, 0x07);
4136 
4137   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4138   for (i = 0; i < (full ? 4 : 2); i++)
4139     aarch64_set_vec_u32 (cpu, vd, i,
4140 			 aarch64_get_vec_u32 (cpu, vn, i)
4141 			 ^ aarch64_get_vec_u32 (cpu, vm, i));
4142 }
4143 
4144 static void
4145 do_vec_bit (sim_cpu *cpu)
4146 {
4147   /* instr[31]    = 0
4148      instr[30]    = half (0)/full (1)
4149      instr[29,23] = 10 1110 1
4150      instr[22]    = BIT (0) / BIF (1)
4151      instr[21]    = 1
4152      instr[20,16] = Vm
4153      instr[15,10] = 0001 11
4154      instr[9,5]   = Vn
4155      instr[4,0]   = Vd.  */
4156 
4157   unsigned vm = INSTR (20, 16);
4158   unsigned vn = INSTR (9, 5);
4159   unsigned vd = INSTR (4, 0);
4160   unsigned full = INSTR (30, 30);
4161   unsigned test_false = INSTR (22, 22);
4162   unsigned i;
4163 
4164   NYI_assert (29, 23, 0x5D);
4165   NYI_assert (21, 21, 1);
4166   NYI_assert (15, 10, 0x07);
4167 
4168   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4169   for (i = 0; i < (full ? 4 : 2); i++)
4170     {
4171       uint32_t vd_val = aarch64_get_vec_u32 (cpu, vd, i);
4172       uint32_t vn_val = aarch64_get_vec_u32 (cpu, vn, i);
4173       uint32_t vm_val = aarch64_get_vec_u32 (cpu, vm, i);
4174       if (test_false)
4175 	aarch64_set_vec_u32 (cpu, vd, i,
4176 			     (vd_val & vm_val) | (vn_val & ~vm_val));
4177       else
4178 	aarch64_set_vec_u32 (cpu, vd, i,
4179 			     (vd_val & ~vm_val) | (vn_val & vm_val));
4180     }
4181 }
4182 
4183 static void
4184 do_vec_ORN (sim_cpu *cpu)
4185 {
4186   /* instr[31]    = 0
4187      instr[30]    = half (0)/full (1)
4188      instr[29,21] = 00 1110 111
4189      instr[20,16] = Vm
4190      instr[15,10] = 00 0111
4191      instr[9,5]   = Vn
4192      instr[4,0]   = Vd.  */
4193 
4194   unsigned vm = INSTR (20, 16);
4195   unsigned vn = INSTR (9, 5);
4196   unsigned vd = INSTR (4, 0);
4197   unsigned i;
4198   int      full = INSTR (30, 30);
4199 
4200   NYI_assert (29, 21, 0x077);
4201   NYI_assert (15, 10, 0x07);
4202 
4203   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4204   for (i = 0; i < (full ? 16 : 8); i++)
4205     aarch64_set_vec_u8 (cpu, vd, i,
4206 			aarch64_get_vec_u8 (cpu, vn, i)
4207 			| ~ aarch64_get_vec_u8 (cpu, vm, i));
4208 }
4209 
4210 static void
4211 do_vec_ORR (sim_cpu *cpu)
4212 {
4213   /* instr[31]    = 0
4214      instr[30]    = half (0)/full (1)
4215      instr[29,21] = 00 1110 101
4216      instr[20,16] = Vm
4217      instr[15,10] = 0001 11
4218      instr[9,5]   = Vn
4219      instr[4,0]   = Vd.  */
4220 
4221   unsigned vm = INSTR (20, 16);
4222   unsigned vn = INSTR (9, 5);
4223   unsigned vd = INSTR (4, 0);
4224   unsigned i;
4225   int      full = INSTR (30, 30);
4226 
4227   NYI_assert (29, 21, 0x075);
4228   NYI_assert (15, 10, 0x07);
4229 
4230   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4231   for (i = 0; i < (full ? 16 : 8); i++)
4232     aarch64_set_vec_u8 (cpu, vd, i,
4233 			aarch64_get_vec_u8 (cpu, vn, i)
4234 			| aarch64_get_vec_u8 (cpu, vm, i));
4235 }
4236 
4237 static void
4238 do_vec_BIC (sim_cpu *cpu)
4239 {
4240   /* instr[31]    = 0
4241      instr[30]    = half (0)/full (1)
4242      instr[29,21] = 00 1110 011
4243      instr[20,16] = Vm
4244      instr[15,10] = 00 0111
4245      instr[9,5]   = Vn
4246      instr[4,0]   = Vd.  */
4247 
4248   unsigned vm = INSTR (20, 16);
4249   unsigned vn = INSTR (9, 5);
4250   unsigned vd = INSTR (4, 0);
4251   unsigned i;
4252   int      full = INSTR (30, 30);
4253 
4254   NYI_assert (29, 21, 0x073);
4255   NYI_assert (15, 10, 0x07);
4256 
4257   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4258   for (i = 0; i < (full ? 16 : 8); i++)
4259     aarch64_set_vec_u8 (cpu, vd, i,
4260 			aarch64_get_vec_u8 (cpu, vn, i)
4261 			& ~ aarch64_get_vec_u8 (cpu, vm, i));
4262 }
4263 
4264 static void
4265 do_vec_XTN (sim_cpu *cpu)
4266 {
4267   /* instr[31]    = 0
4268      instr[30]    = first part (0)/ second part (1)
4269      instr[29,24] = 00 1110
4270      instr[23,22] = size: byte(00), half(01), word (10)
4271      instr[21,10] = 1000 0100 1010
4272      instr[9,5]   = Vs
4273      instr[4,0]   = Vd.  */
4274 
4275   unsigned vs = INSTR (9, 5);
4276   unsigned vd = INSTR (4, 0);
4277   unsigned bias = INSTR (30, 30);
4278   unsigned i;
4279 
4280   NYI_assert (29, 24, 0x0E);
4281   NYI_assert (21, 10, 0x84A);
4282 
4283   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4284   switch (INSTR (23, 22))
4285     {
4286     case 0:
4287       for (i = 0; i < 8; i++)
4288 	aarch64_set_vec_u8 (cpu, vd, i + (bias * 8),
4289 			    aarch64_get_vec_u16 (cpu, vs, i));
4290       return;
4291 
4292     case 1:
4293       for (i = 0; i < 4; i++)
4294 	aarch64_set_vec_u16 (cpu, vd, i + (bias * 4),
4295 			     aarch64_get_vec_u32 (cpu, vs, i));
4296       return;
4297 
4298     case 2:
4299       for (i = 0; i < 2; i++)
4300 	aarch64_set_vec_u32 (cpu, vd, i + (bias * 2),
4301 			     aarch64_get_vec_u64 (cpu, vs, i));
4302       return;
4303     }
4304 }
4305 
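/* E.g. XTN Vd.8B, Vs.8H truncates eight 16-bit lanes to their low
   bytes and writes the lower half of Vd, while XTN2 (bias == 1) writes
   the same narrowing into the upper half, leaving the lower half
   intact.  */
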
4306 /* Return the number of bits set in the input value.  */
4307 #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
4308 # define popcount __builtin_popcount
4309 #else
4310 static int
4311 popcount (unsigned char x)
4312 {
4313   static const unsigned char popcnt[16] =
4314     {
4315       0, 1, 1, 2,
4316       1, 2, 2, 3,
4317       1, 2, 2, 3,
4318       2, 3, 3, 4
4319     };
4320 
4321   /* Only counts the low 8 bits of the input as that is all we need.  */
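  /* E.g. popcount (0xB6) = popcnt[0x6] + popcnt[0xB] = 2 + 3 = 5.  */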
4322   return popcnt[x % 16] + popcnt[x / 16];
4323 }
4324 #endif
4325 
4326 static void
4327 do_vec_CNT (sim_cpu *cpu)
4328 {
4329   /* instr[31]    = 0
4330      instr[30]    = half (0)/ full (1)
4331      instr[29,24] = 00 1110
4332      instr[23,22] = size: byte(00)
4333      instr[21,10] = 1000 0001 0110
4334      instr[9,5]   = Vs
4335      instr[4,0]   = Vd.  */
4336 
4337   unsigned vs = INSTR (9, 5);
4338   unsigned vd = INSTR (4, 0);
4339   int full = INSTR (30, 30);
4340   int size = INSTR (23, 22);
4341   int i;
4342 
4343   NYI_assert (29, 24, 0x0E);
4344   NYI_assert (21, 10, 0x816);
4345 
4346   if (size != 0)
4347     HALT_UNALLOC;
4348 
4349   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4350 
4351   for (i = 0; i < (full ? 16 : 8); i++)
4352     aarch64_set_vec_u8 (cpu, vd, i,
4353 			popcount (aarch64_get_vec_u8 (cpu, vs, i)));
4354 }
4355 
4356 static void
4357 do_vec_maxv (sim_cpu *cpu)
4358 {
4359   /* instr[31]    = 0
4360      instr[30]    = half(0)/full(1)
4361      instr[29]    = signed (0)/unsigned(1)
4362      instr[28,24] = 0 1110
4363      instr[23,22] = size: byte(00), half(01), word (10)
4364      instr[21]    = 1
4365      instr[20,17] = 1 000
4366      instr[16]    = max(0)/min(1)
4367      instr[15,10] = 1010 10
4368      instr[9,5]   = V source
4369      instr[4,0]   = R dest.  */
4370 
4371   unsigned vs = INSTR (9, 5);
4372   unsigned rd = INSTR (4, 0);
4373   unsigned full = INSTR (30, 30);
4374   unsigned i;
4375 
4376   NYI_assert (28, 24, 0x0E);
4377   NYI_assert (21, 21, 1);
4378   NYI_assert (20, 17, 8);
4379   NYI_assert (15, 10, 0x2A);
4380 
4381   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4382   switch ((INSTR (29, 29) << 1) | INSTR (16, 16))
4383     {
4384     case 0: /* SMAXV.  */
4385       {
4386 	int64_t smax;
4387 	switch (INSTR (23, 22))
4388 	  {
4389 	  case 0:
4390 	    smax = aarch64_get_vec_s8 (cpu, vs, 0);
4391 	    for (i = 1; i < (full ? 16 : 8); i++)
4392 	      smax = max (smax, aarch64_get_vec_s8 (cpu, vs, i));
4393 	    break;
4394 	  case 1:
4395 	    smax = aarch64_get_vec_s16 (cpu, vs, 0);
4396 	    for (i = 1; i < (full ? 8 : 4); i++)
4397 	      smax = max (smax, aarch64_get_vec_s16 (cpu, vs, i));
4398 	    break;
4399 	  case 2:
4400 	    smax = aarch64_get_vec_s32 (cpu, vs, 0);
4401 	    for (i = 1; i < (full ? 4 : 2); i++)
4402 	      smax = max (smax, aarch64_get_vec_s32 (cpu, vs, i));
4403 	    break;
4404 	  case 3:
4405 	    HALT_UNALLOC;
4406 	  }
4407 	aarch64_set_reg_s64 (cpu, rd, NO_SP, smax);
4408 	return;
4409       }
4410 
4411     case 1: /* SMINV.  */
4412       {
4413 	int64_t smin;
4414 	switch (INSTR (23, 22))
4415 	  {
4416 	  case 0:
4417 	    smin = aarch64_get_vec_s8 (cpu, vs, 0);
4418 	    for (i = 1; i < (full ? 16 : 8); i++)
4419 	      smin = min (smin, aarch64_get_vec_s8 (cpu, vs, i));
4420 	    break;
4421 	  case 1:
4422 	    smin = aarch64_get_vec_s16 (cpu, vs, 0);
4423 	    for (i = 1; i < (full ? 8 : 4); i++)
4424 	      smin = min (smin, aarch64_get_vec_s16 (cpu, vs, i));
4425 	    break;
4426 	  case 2:
4427 	    smin = aarch64_get_vec_s32 (cpu, vs, 0);
4428 	    for (i = 1; i < (full ? 4 : 2); i++)
4429 	      smin = min (smin, aarch64_get_vec_s32 (cpu, vs, i));
4430 	    break;
4431 
4432 	  case 3:
4433 	    HALT_UNALLOC;
4434 	  }
4435 	aarch64_set_reg_s64 (cpu, rd, NO_SP, smin);
4436 	return;
4437       }
4438 
4439     case 2: /* UMAXV.  */
4440       {
4441 	uint64_t umax;
4442 	switch (INSTR (23, 22))
4443 	  {
4444 	  case 0:
4445 	    umax = aarch64_get_vec_u8 (cpu, vs, 0);
4446 	    for (i = 1; i < (full ? 16 : 8); i++)
4447 	      umax = max (umax, aarch64_get_vec_u8 (cpu, vs, i));
4448 	    break;
4449 	  case 1:
4450 	    umax = aarch64_get_vec_u16 (cpu, vs, 0);
4451 	    for (i = 1; i < (full ? 8 : 4); i++)
4452 	      umax = max (umax, aarch64_get_vec_u16 (cpu, vs, i));
4453 	    break;
4454 	  case 2:
4455 	    umax = aarch64_get_vec_u32 (cpu, vs, 0);
4456 	    for (i = 1; i < (full ? 4 : 2); i++)
4457 	      umax = max (umax, aarch64_get_vec_u32 (cpu, vs, i));
4458 	    break;
4459 
4460 	  case 3:
4461 	    HALT_UNALLOC;
4462 	  }
4463 	aarch64_set_reg_u64 (cpu, rd, NO_SP, umax);
4464 	return;
4465       }
4466 
4467     case 3: /* UMINV.  */
4468       {
4469 	uint64_t umin;
4470 	switch (INSTR (23, 22))
4471 	  {
4472 	  case 0:
4473 	    umin = aarch64_get_vec_u8 (cpu, vs, 0);
4474 	    for (i = 1; i < (full ? 16 : 8); i++)
4475 	      umin = min (umin, aarch64_get_vec_u8 (cpu, vs, i));
4476 	    break;
4477 	  case 1:
4478 	    umin = aarch64_get_vec_u16 (cpu, vs, 0);
4479 	    for (i = 1; i < (full ? 8 : 4); i++)
4480 	      umin = min (umin, aarch64_get_vec_u16 (cpu, vs, i));
4481 	    break;
4482 	  case 2:
4483 	    umin = aarch64_get_vec_u32 (cpu, vs, 0);
4484 	    for (i = 1; i < (full ? 4 : 2); i++)
4485 	      umin = min (umin, aarch64_get_vec_u32 (cpu, vs, i));
4486 	    break;
4487 
4488 	  case 3:
4489 	    HALT_UNALLOC;
4490 	  }
4491 	aarch64_set_reg_u64 (cpu, rd, NO_SP, umin);
4492 	return;
4493       }
4494     }
4495 }
4496 
4497 static void
4498 do_vec_fminmaxV (sim_cpu *cpu)
4499 {
4500   /* instr[31,24] = 0110 1110
4501      instr[23]    = max(0)/min(1)
4502      instr[22,14] = 011 0000 11
4503      instr[13,12] = nm(00)/normal(11)
4504      instr[11,10] = 10
4505      instr[9,5]   = V source
4506      instr[4,0]   = R dest.  */
4507 
4508   unsigned vs = INSTR (9, 5);
4509   unsigned rd = INSTR (4, 0);
4510   unsigned i;
4511   float res   = aarch64_get_vec_float (cpu, vs, 0);
4512 
4513   NYI_assert (31, 24, 0x6E);
4514   NYI_assert (22, 14, 0x0C3);
4515   NYI_assert (11, 10, 2);
4516 
4517   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4518   if (INSTR (23, 23))
4519     {
4520       switch (INSTR (13, 12))
4521 	{
	case 0: /* FMINNMV.  */
4523 	  for (i = 1; i < 4; i++)
4524 	    res = fminnm (res, aarch64_get_vec_float (cpu, vs, i));
4525 	  break;
4526 
4527 	case 3: /* FMINV.  */
4528 	  for (i = 1; i < 4; i++)
4529 	    res = min (res, aarch64_get_vec_float (cpu, vs, i));
4530 	  break;
4531 
4532 	default:
4533 	  HALT_NYI;
4534 	}
4535     }
4536   else
4537     {
4538       switch (INSTR (13, 12))
4539 	{
	case 0: /* FMAXNMV.  */
4541 	  for (i = 1; i < 4; i++)
4542 	    res = fmaxnm (res, aarch64_get_vec_float (cpu, vs, i));
4543 	  break;
4544 
4545 	case 3: /* FMAXV.  */
4546 	  for (i = 1; i < 4; i++)
4547 	    res = max (res, aarch64_get_vec_float (cpu, vs, i));
4548 	  break;
4549 
4550 	default:
4551 	  HALT_NYI;
4552 	}
4553     }
4554 
4555   aarch64_set_FP_float (cpu, rd, res);
4556 }
4557 
4558 static void
4559 do_vec_Fminmax (sim_cpu *cpu)
4560 {
4561   /* instr[31]    = 0
4562      instr[30]    = half(0)/full(1)
4563      instr[29,24] = 00 1110
4564      instr[23]    = max(0)/min(1)
4565      instr[22]    = float(0)/double(1)
4566      instr[21]    = 1
4567      instr[20,16] = Vm
4568      instr[15,14] = 11
4569      instr[13,12] = nm(00)/normal(11)
4570      instr[11,10] = 01
4571      instr[9,5]   = Vn
4572      instr[4,0]   = Vd.  */
4573 
4574   unsigned vm = INSTR (20, 16);
4575   unsigned vn = INSTR (9, 5);
4576   unsigned vd = INSTR (4, 0);
4577   unsigned full = INSTR (30, 30);
4578   unsigned min = INSTR (23, 23);
4579   unsigned i;
4580 
4581   NYI_assert (29, 24, 0x0E);
4582   NYI_assert (21, 21, 1);
4583   NYI_assert (15, 14, 3);
4584   NYI_assert (11, 10, 1);
4585 
4586   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4587   if (INSTR (22, 22))
4588     {
4589       double (* func)(double, double);
4590 
4591       if (! full)
4592 	HALT_NYI;
4593 
4594       if (INSTR (13, 12) == 0)
4595 	func = min ? dminnm : dmaxnm;
4596       else if (INSTR (13, 12) == 3)
4597 	func = min ? fmin : fmax;
4598       else
4599 	HALT_NYI;
4600 
4601       for (i = 0; i < 2; i++)
4602 	aarch64_set_vec_double (cpu, vd, i,
4603 				func (aarch64_get_vec_double (cpu, vn, i),
4604 				      aarch64_get_vec_double (cpu, vm, i)));
4605     }
4606   else
4607     {
4608       float (* func)(float, float);
4609 
4610       if (INSTR (13, 12) == 0)
4611 	func = min ? fminnm : fmaxnm;
4612       else if (INSTR (13, 12) == 3)
4613 	func = min ? fminf : fmaxf;
4614       else
4615 	HALT_NYI;
4616 
4617       for (i = 0; i < (full ? 4 : 2); i++)
4618 	aarch64_set_vec_float (cpu, vd, i,
4619 			       func (aarch64_get_vec_float (cpu, vn, i),
4620 				     aarch64_get_vec_float (cpu, vm, i)));
4621     }
4622 }
4623 
4624 static void
4625 do_vec_SCVTF (sim_cpu *cpu)
4626 {
4627   /* instr[31]    = 0
4628      instr[30]    = Q
4629      instr[29,23] = 00 1110 0
4630      instr[22]    = float(0)/double(1)
4631      instr[21,10] = 10 0001 1101 10
4632      instr[9,5]   = Vn
4633      instr[4,0]   = Vd.  */
4634 
4635   unsigned vn = INSTR (9, 5);
4636   unsigned vd = INSTR (4, 0);
4637   unsigned full = INSTR (30, 30);
4638   unsigned size = INSTR (22, 22);
4639   unsigned i;
4640 
4641   NYI_assert (29, 23, 0x1C);
4642   NYI_assert (21, 10, 0x876);
4643 
4644   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4645   if (size)
4646     {
4647       if (! full)
4648 	HALT_UNALLOC;
4649 
4650       for (i = 0; i < 2; i++)
4651 	{
	  /* SCVTF converts from a signed integer.  */
4652 	  double val = (double) aarch64_get_vec_s64 (cpu, vn, i);
4653 	  aarch64_set_vec_double (cpu, vd, i, val);
4654 	}
4655     }
4656   else
4657     {
4658       for (i = 0; i < (full ? 4 : 2); i++)
4659 	{
4660 	  float val = (float) aarch64_get_vec_s32 (cpu, vn, i);
4661 	  aarch64_set_vec_float (cpu, vd, i, val);
4662 	}
4663     }
4664 }
4665 
4666 #define VEC_CMP(SOURCE, CMP)						\
4667   do									\
4668     {									\
4669       switch (size)							\
4670 	{								\
4671 	case 0:								\
4672 	  for (i = 0; i < (full ? 16 : 8); i++)				\
4673 	    aarch64_set_vec_u8 (cpu, vd, i,				\
4674 				aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
4675 				CMP					\
4676 				aarch64_get_vec_##SOURCE##8 (cpu, vm, i) \
4677 				? -1 : 0);				\
4678 	  return;							\
4679 	case 1:								\
4680 	  for (i = 0; i < (full ? 8 : 4); i++)				\
4681 	    aarch64_set_vec_u16 (cpu, vd, i,				\
4682 				 aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
4683 				 CMP					\
4684 				 aarch64_get_vec_##SOURCE##16 (cpu, vm, i) \
4685 				 ? -1 : 0);				\
4686 	  return;							\
4687 	case 2:								\
4688 	  for (i = 0; i < (full ? 4 : 2); i++)				\
4689 	    aarch64_set_vec_u32 (cpu, vd, i, \
4690 				 aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
4691 				 CMP					\
4692 				 aarch64_get_vec_##SOURCE##32 (cpu, vm, i) \
4693 				 ? -1 : 0);				\
4694 	  return;							\
4695 	case 3:								\
4696 	  if (! full)							\
4697 	    HALT_UNALLOC;						\
4698 	  for (i = 0; i < 2; i++)					\
4699 	    aarch64_set_vec_u64 (cpu, vd, i, \
4700 				 aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
4701 				 CMP					\
4702 				 aarch64_get_vec_##SOURCE##64 (cpu, vm, i) \
4703 				 ? -1ULL : 0);				\
4704 	  return;							\
4705 	}								\
4706     }									\
4707   while (0)
4708 
4709 #define VEC_CMP0(SOURCE, CMP)						\
4710   do									\
4711     {									\
4712       switch (size)							\
4713 	{								\
4714 	case 0:								\
4715 	  for (i = 0; i < (full ? 16 : 8); i++)				\
4716 	    aarch64_set_vec_u8 (cpu, vd, i,				\
4717 				aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
4718 				CMP 0 ? -1 : 0);			\
4719 	  return;							\
4720 	case 1:								\
4721 	  for (i = 0; i < (full ? 8 : 4); i++)				\
4722 	    aarch64_set_vec_u16 (cpu, vd, i,				\
4723 				 aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
4724 				 CMP 0 ? -1 : 0);			\
4725 	  return;							\
4726 	case 2:								\
4727 	  for (i = 0; i < (full ? 4 : 2); i++)				\
4728 	    aarch64_set_vec_u32 (cpu, vd, i,				\
4729 				 aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
4730 				 CMP 0 ? -1 : 0);			\
4731 	  return;							\
4732 	case 3:								\
4733 	  if (! full)							\
4734 	    HALT_UNALLOC;						\
4735 	  for (i = 0; i < 2; i++)					\
4736 	    aarch64_set_vec_u64 (cpu, vd, i,				\
4737 				 aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
4738 				 CMP 0 ? -1ULL : 0);			\
4739 	  return;							\
4740 	}								\
4741     }									\
4742   while (0)
4743 
4744 #define VEC_FCMP0(CMP)							\
4745   do									\
4746     {									\
4747       if (vm != 0)							\
4748 	HALT_NYI;							\
4749       if (INSTR (22, 22))						\
4750 	{								\
4751 	  if (! full)							\
4752 	    HALT_NYI;							\
4753 	  for (i = 0; i < 2; i++)					\
4754 	    aarch64_set_vec_u64 (cpu, vd, i,				\
4755 				 aarch64_get_vec_double (cpu, vn, i)	\
4756 				 CMP 0.0 ? -1 : 0);			\
4757 	}								\
4758       else								\
4759 	{								\
4760 	  for (i = 0; i < (full ? 4 : 2); i++)				\
4761 	    aarch64_set_vec_u32 (cpu, vd, i,				\
4762 				 aarch64_get_vec_float (cpu, vn, i)	\
4763 				 CMP 0.0 ? -1 : 0);			\
4764 	}								\
4765       return;								\
4766     }									\
4767   while (0)
4768 
4769 #define VEC_FCMP(CMP)							\
4770   do									\
4771     {									\
4772       if (INSTR (22, 22))						\
4773 	{								\
4774 	  if (! full)							\
4775 	    HALT_NYI;							\
4776 	  for (i = 0; i < 2; i++)					\
4777 	    aarch64_set_vec_u64 (cpu, vd, i,				\
4778 				 aarch64_get_vec_double (cpu, vn, i)	\
4779 				 CMP					\
4780 				 aarch64_get_vec_double (cpu, vm, i)	\
4781 				 ? -1 : 0);				\
4782 	}								\
4783       else								\
4784 	{								\
4785 	  for (i = 0; i < (full ? 4 : 2); i++)				\
4786 	    aarch64_set_vec_u32 (cpu, vd, i,				\
4787 				 aarch64_get_vec_float (cpu, vn, i)	\
4788 				 CMP					\
4789 				 aarch64_get_vec_float (cpu, vm, i)	\
4790 				 ? -1 : 0);				\
4791 	}								\
4792       return;								\
4793     }									\
4794   while (0)
4795 
4796 static void
4797 do_vec_compare (sim_cpu *cpu)
4798 {
4799   /* instr[31]    = 0
4800      instr[30]    = half(0)/full(1)
4801      instr[29]    = part-of-comparison-type
4802      instr[28,24] = 0 1110
4803      instr[23,22] = size of integer compares: byte(00), half(01), word (10), long (11)
4804                     type of float compares: single (-0) / double (-1)
4805      instr[21]    = 1
4806      instr[20,16] = Vm or 00000 (compare vs 0)
4807      instr[15,10] = part-of-comparison-type
4808      instr[9,5]   = Vn
4809      instr[4,0]   = Vd.  */
4810 
4811   int full = INSTR (30, 30);
4812   int size = INSTR (23, 22);
4813   unsigned vm = INSTR (20, 16);
4814   unsigned vn = INSTR (9, 5);
4815   unsigned vd = INSTR (4, 0);
4816   unsigned i;
4817 
4818   NYI_assert (28, 24, 0x0E);
4819   NYI_assert (21, 21, 1);
4820 
4821   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4822   if ((INSTR (11, 11)
4823        && INSTR (14, 14))
4824       || ((INSTR (11, 11) == 0
4825 	   && INSTR (10, 10) == 0)))
4826     {
4827       /* A compare vs 0.  */
4828       if (vm != 0)
4829 	{
4830 	  if (INSTR (15, 10) == 0x2A)
4831 	    do_vec_maxv (cpu);
4832 	  else if (INSTR (15, 10) == 0x32
4833 		   || INSTR (15, 10) == 0x3E)
4834 	    do_vec_fminmaxV (cpu);
4835 	  else if (INSTR (29, 23) == 0x1C
4836 		   && INSTR (21, 10) == 0x876)
4837 	    do_vec_SCVTF (cpu);
4838 	  else
4839 	    HALT_NYI;
4840 	  return;
4841 	}
4842     }
4843 
4844   if (INSTR (14, 14))
4845     {
4846       /* A floating point compare.  */
4847       unsigned decode = (INSTR (29, 29) << 5) | (INSTR (23, 23) << 4)
4848 	| INSTR (13, 10);
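      /* E.g. bit 29 and bit 23 set with bits 13,10 = 1001 gives decode
	 0x39 (0b111001), the register form of "greater than" below.  */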
4849 
4850       NYI_assert (15, 15, 1);
4851 
4852       switch (decode)
4853 	{
4854 	case /* 0b010010: GT#0 */ 0x12: VEC_FCMP0 (>);
4855 	case /* 0b110010: GE#0 */ 0x32: VEC_FCMP0 (>=);
4856 	case /* 0b010110: EQ#0 */ 0x16: VEC_FCMP0 (==);
4857 	case /* 0b110110: LE#0 */ 0x36: VEC_FCMP0 (<=);
4858 	case /* 0b011010: LT#0 */ 0x1A: VEC_FCMP0 (<);
4859 	case /* 0b111001: GT */   0x39: VEC_FCMP  (>);
4860 	case /* 0b101001: GE */   0x29: VEC_FCMP  (>=);
4861 	case /* 0b001001: EQ */   0x09: VEC_FCMP  (==);
4862 
4863 	default:
4864 	  HALT_NYI;
4865 	}
4866     }
4867   else
4868     {
4869       unsigned decode = (INSTR (29, 29) << 6) | INSTR (15, 10);
4870 
4871       switch (decode)
4872 	{
4873 	case 0x0D: /* 0001101 GT */     VEC_CMP  (s, > );
4874 	case 0x0F: /* 0001111 GE */     VEC_CMP  (s, >= );
4875 	case 0x22: /* 0100010 GT #0 */  VEC_CMP0 (s, > );
4876 	case 0x23: /* 0100011 TST */	VEC_CMP  (u, & );
4877 	case 0x26: /* 0100110 EQ #0 */  VEC_CMP0 (s, == );
4878 	case 0x2A: /* 0101010 LT #0 */  VEC_CMP0 (s, < );
4879 	case 0x4D: /* 1001101 HI */     VEC_CMP  (u, > );
4880 	case 0x4F: /* 1001111 HS */     VEC_CMP  (u, >= );
4881 	case 0x62: /* 1100010 GE #0 */  VEC_CMP0 (s, >= );
4882 	case 0x63: /* 1100011 EQ */     VEC_CMP  (u, == );
4883 	case 0x66: /* 1100110 LE #0 */  VEC_CMP0 (s, <= );
4884 	default:
4885 	  if (vm == 0)
4886 	    HALT_NYI;
4887 	  do_vec_maxv (cpu);
4888 	}
4889     }
4890 }
4891 
4892 static void
4893 do_vec_SSHL (sim_cpu *cpu)
4894 {
4895   /* instr[31]    = 0
4896      instr[30]    = first part (0)/ second part (1)
4897      instr[29,24] = 00 1110
4898      instr[23,22] = size: byte(00), half(01), word (10), long (11)
4899      instr[21]    = 1
4900      instr[20,16] = Vm
4901      instr[15,10] = 0100 01
4902      instr[9,5]   = Vn
4903      instr[4,0]   = Vd.  */
4904 
4905   unsigned full = INSTR (30, 30);
4906   unsigned vm = INSTR (20, 16);
4907   unsigned vn = INSTR (9, 5);
4908   unsigned vd = INSTR (4, 0);
4909   unsigned i;
4910   signed int shift;
4911 
4912   NYI_assert (29, 24, 0x0E);
4913   NYI_assert (21, 21, 1);
4914   NYI_assert (15, 10, 0x11);
4915 
4916   /* A negative shift count in Vm selects a right shift; "signed" refers
     to the element type, so the right shifts here are arithmetic while
     USHL below uses logical right shifts.  */
4917 
4918   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4919   switch (INSTR (23, 22))
4920     {
4921     case 0:
4922       for (i = 0; i < (full ? 16 : 8); i++)
4923 	{
4924 	  shift = aarch64_get_vec_s8 (cpu, vm, i);
4925 	  if (shift >= 0)
4926 	    aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i)
4927 				<< shift);
4928 	  else
4929 	    aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i)
4930 				>> - shift);
4931 	}
4932       return;
4933 
4934     case 1:
4935       for (i = 0; i < (full ? 8 : 4); i++)
4936 	{
4937 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
4938 	  if (shift >= 0)
4939 	    aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i)
4940 				 << shift);
4941 	  else
4942 	    aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i)
4943 				 >> - shift);
4944 	}
4945       return;
4946 
4947     case 2:
4948       for (i = 0; i < (full ? 4 : 2); i++)
4949 	{
4950 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
4951 	  if (shift >= 0)
4952 	    aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i)
4953 				 << shift);
4954 	  else
4955 	    aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i)
4956 				 >> - shift);
4957 	}
4958       return;
4959 
4960     case 3:
4961       if (! full)
4962 	HALT_UNALLOC;
4963       for (i = 0; i < 2; i++)
4964 	{
4965 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
4966 	  if (shift >= 0)
4967 	    aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i)
4968 				 << shift);
4969 	  else
4970 	    aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i)
4971 				 >> - shift);
4972 	}
4973       return;
4974     }
4975 }
4976 
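/* SSHL/USHL take a per-lane shift count from the bottom byte of the
   corresponding Vm lane, as a signed value: positive counts shift left,
   negative counts shift right.  A standalone sketch for one s16 lane,
   assuming the host's signed right shift is arithmetic (the simulator's
   own code makes the same assumption; illustration only):  */
#if 0
#include <assert.h>
#include <stdint.h>

static int16_t
sshl_lane16 (int16_t elt, int8_t shift)
{
  if (shift >= 0)
    return (int16_t) (elt << shift);
  return (int16_t) (elt >> -shift);
}

int
main (void)
{
  assert (sshl_lane16 (0x0101, 4) == 0x1010);
  assert (sshl_lane16 (-32, -3) == -4);	/* Arithmetic right shift.  */
  return 0;
}
#endif
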
4977 static void
4978 do_vec_USHL (sim_cpu *cpu)
4979 {
4980   /* instr[31]    = 0
4981      instr[30]    = first part (0)/ second part (1)
4982      instr[29,24] = 10 1110
4983      instr[23,22] = size: byte(00), half(01), word (10), long (11)
4984      instr[21]    = 1
4985      instr[20,16] = Vm
4986      instr[15,10] = 0100 01
4987      instr[9,5]   = Vn
4988      instr[4,0]   = Vd  */
4989 
4990   unsigned full = INSTR (30, 30);
4991   unsigned vm = INSTR (20, 16);
4992   unsigned vn = INSTR (9, 5);
4993   unsigned vd = INSTR (4, 0);
4994   unsigned i;
4995   signed int shift;
4996 
4997   NYI_assert (29, 24, 0x2E);
4998   NYI_assert (15, 10, 0x11);
4999 
5000   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5001   switch (INSTR (23, 22))
5002     {
5003     case 0:
5004       for (i = 0; i < (full ? 16 : 8); i++)
5005 	{
5006 	  shift = aarch64_get_vec_s8 (cpu, vm, i);
5007 	  if (shift >= 0)
5008 	    aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
5009 				<< shift);
5010 	  else
5011 	    aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
5012 				>> - shift);
5013 	}
5014       return;
5015 
5016     case 1:
5017       for (i = 0; i < (full ? 8 : 4); i++)
5018 	{
5019 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
5020 	  if (shift >= 0)
5021 	    aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
5022 				 << shift);
5023 	  else
5024 	    aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
5025 				 >> - shift);
5026 	}
5027       return;
5028 
5029     case 2:
5030       for (i = 0; i < (full ? 4 : 2); i++)
5031 	{
5032 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
5033 	  if (shift >= 0)
5034 	    aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
5035 				 << shift);
5036 	  else
5037 	    aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
5038 				 >> - shift);
5039 	}
5040       return;
5041 
5042     case 3:
5043       if (! full)
5044 	HALT_UNALLOC;
5045       for (i = 0; i < 2; i++)
5046 	{
5047 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
5048 	  if (shift >= 0)
5049 	    aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i)
5050 				 << shift);
5051 	  else
5052 	    aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i)
5053 				 >> - shift);
5054 	}
5055       return;
5056     }
5057 }
5058 
5059 static void
5060 do_vec_FMLA (sim_cpu *cpu)
5061 {
5062   /* instr[31]    = 0
5063      instr[30]    = full/half selector
5064      instr[29,23] = 0011100
5065      instr[22]    = size: 0=>float, 1=>double
5066      instr[21]    = 1
5067      instr[20,16] = Vm
5068      instr[15,10] = 1100 11
5069      instr[9,5]   = Vn
5070      instr[4,0]   = Vd.  */
5071 
5072   unsigned vm = INSTR (20, 16);
5073   unsigned vn = INSTR (9, 5);
5074   unsigned vd = INSTR (4, 0);
5075   unsigned i;
5076   int      full = INSTR (30, 30);
5077 
5078   NYI_assert (29, 23, 0x1C);
5079   NYI_assert (21, 21, 1);
5080   NYI_assert (15, 10, 0x33);
5081 
5082   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5083   if (INSTR (22, 22))
5084     {
5085       if (! full)
5086 	HALT_UNALLOC;
5087       for (i = 0; i < 2; i++)
5088 	aarch64_set_vec_double (cpu, vd, i,
5089 				aarch64_get_vec_double (cpu, vn, i) *
5090 				aarch64_get_vec_double (cpu, vm, i) +
5091 				aarch64_get_vec_double (cpu, vd, i));
5092     }
5093   else
5094     {
5095       for (i = 0; i < (full ? 4 : 2); i++)
5096 	aarch64_set_vec_float (cpu, vd, i,
5097 			       aarch64_get_vec_float (cpu, vn, i) *
5098 			       aarch64_get_vec_float (cpu, vm, i) +
5099 			       aarch64_get_vec_float (cpu, vd, i));
5100     }
5101 }
5102 
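/* Note that the FMLA above rounds the intermediate product, while the
   architected instruction is a fused multiply-add with a single
   rounding; the difference is confined to the last bit.  A standalone
   demonstration using C99 fma (compile with contraction disabled,
   e.g. -ffp-contract=off, so the compiler does not fuse x * y - 1.0
   itself; illustration only):  */
#if 0
#include <math.h>
#include <stdio.h>

int
main (void)
{
  double x = 1.0 + 0x1p-29;
  double y = 1.0 - 0x1p-29;

  /* x * y is exactly 1 - 2^-58, which rounds to 1.0 in double.  */
  printf ("mul then add: %a\n", x * y - 1.0);		/* 0x0p+0 */
  printf ("fused:        %a\n", fma (x, y, -1.0));	/* -0x1p-58 */
  return 0;
}
#endif
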
5103 static void
5104 do_vec_max (sim_cpu *cpu)
5105 {
5106   /* instr[31]    = 0
5107      instr[30]    = full/half selector
5108      instr[29]    = SMAX (0) / UMAX (1)
5109      instr[28,24] = 0 1110
5110      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
5111      instr[21]    = 1
5112      instr[20,16] = Vm
5113      instr[15,10] = 0110 01
5114      instr[9,5]   = Vn
5115      instr[4,0]   = Vd.  */
5116 
5117   unsigned vm = INSTR (20, 16);
5118   unsigned vn = INSTR (9, 5);
5119   unsigned vd = INSTR (4, 0);
5120   unsigned i;
5121   int      full = INSTR (30, 30);
5122 
5123   NYI_assert (28, 24, 0x0E);
5124   NYI_assert (21, 21, 1);
5125   NYI_assert (15, 10, 0x19);
5126 
5127   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5128   if (INSTR (29, 29))
5129     {
5130       switch (INSTR (23, 22))
5131 	{
5132 	case 0:
5133 	  for (i = 0; i < (full ? 16 : 8); i++)
5134 	    aarch64_set_vec_u8 (cpu, vd, i,
5135 				aarch64_get_vec_u8 (cpu, vn, i)
5136 				> aarch64_get_vec_u8 (cpu, vm, i)
5137 				? aarch64_get_vec_u8 (cpu, vn, i)
5138 				: aarch64_get_vec_u8 (cpu, vm, i));
5139 	  return;
5140 
5141 	case 1:
5142 	  for (i = 0; i < (full ? 8 : 4); i++)
5143 	    aarch64_set_vec_u16 (cpu, vd, i,
5144 				 aarch64_get_vec_u16 (cpu, vn, i)
5145 				 > aarch64_get_vec_u16 (cpu, vm, i)
5146 				 ? aarch64_get_vec_u16 (cpu, vn, i)
5147 				 : aarch64_get_vec_u16 (cpu, vm, i));
5148 	  return;
5149 
5150 	case 2:
5151 	  for (i = 0; i < (full ? 4 : 2); i++)
5152 	    aarch64_set_vec_u32 (cpu, vd, i,
5153 				 aarch64_get_vec_u32 (cpu, vn, i)
5154 				 > aarch64_get_vec_u32 (cpu, vm, i)
5155 				 ? aarch64_get_vec_u32 (cpu, vn, i)
5156 				 : aarch64_get_vec_u32 (cpu, vm, i));
5157 	  return;
5158 
5159 	case 3:
5160 	  HALT_UNALLOC;
5161 	}
5162     }
5163   else
5164     {
5165       switch (INSTR (23, 22))
5166 	{
5167 	case 0:
5168 	  for (i = 0; i < (full ? 16 : 8); i++)
5169 	    aarch64_set_vec_s8 (cpu, vd, i,
5170 				aarch64_get_vec_s8 (cpu, vn, i)
5171 				> aarch64_get_vec_s8 (cpu, vm, i)
5172 				? aarch64_get_vec_s8 (cpu, vn, i)
5173 				: aarch64_get_vec_s8 (cpu, vm, i));
5174 	  return;
5175 
5176 	case 1:
5177 	  for (i = 0; i < (full ? 8 : 4); i++)
5178 	    aarch64_set_vec_s16 (cpu, vd, i,
5179 				 aarch64_get_vec_s16 (cpu, vn, i)
5180 				 > aarch64_get_vec_s16 (cpu, vm, i)
5181 				 ? aarch64_get_vec_s16 (cpu, vn, i)
5182 				 : aarch64_get_vec_s16 (cpu, vm, i));
5183 	  return;
5184 
5185 	case 2:
5186 	  for (i = 0; i < (full ? 4 : 2); i++)
5187 	    aarch64_set_vec_s32 (cpu, vd, i,
5188 				 aarch64_get_vec_s32 (cpu, vn, i)
5189 				 > aarch64_get_vec_s32 (cpu, vm, i)
5190 				 ? aarch64_get_vec_s32 (cpu, vn, i)
5191 				 : aarch64_get_vec_s32 (cpu, vm, i));
5192 	  return;
5193 
5194 	case 3:
5195 	  HALT_UNALLOC;
5196 	}
5197     }
5198 }
5199 
5200 static void
5201 do_vec_min (sim_cpu *cpu)
5202 {
5203   /* instr[31]    = 0
5204      instr[30]    = full/half selector
5205      instr[29]    = SMIN (0) / UMIN (1)
5206      instr[28,24] = 0 1110
5207      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
5208      instr[21]    = 1
5209      instr[20,16] = Vm
5210      instr[15,10] = 0110 11
5211      instr[9,5]   = Vn
5212      instr[4,0]   = Vd.  */
5213 
5214   unsigned vm = INSTR (20, 16);
5215   unsigned vn = INSTR (9, 5);
5216   unsigned vd = INSTR (4, 0);
5217   unsigned i;
5218   int      full = INSTR (30, 30);
5219 
5220   NYI_assert (28, 24, 0x0E);
5221   NYI_assert (21, 21, 1);
5222   NYI_assert (15, 10, 0x1B);
5223 
5224   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5225   if (INSTR (29, 29))
5226     {
5227       switch (INSTR (23, 22))
5228 	{
5229 	case 0:
5230 	  for (i = 0; i < (full ? 16 : 8); i++)
5231 	    aarch64_set_vec_u8 (cpu, vd, i,
5232 				aarch64_get_vec_u8 (cpu, vn, i)
5233 				< aarch64_get_vec_u8 (cpu, vm, i)
5234 				? aarch64_get_vec_u8 (cpu, vn, i)
5235 				: aarch64_get_vec_u8 (cpu, vm, i));
5236 	  return;
5237 
5238 	case 1:
5239 	  for (i = 0; i < (full ? 8 : 4); i++)
5240 	    aarch64_set_vec_u16 (cpu, vd, i,
5241 				 aarch64_get_vec_u16 (cpu, vn, i)
5242 				 < aarch64_get_vec_u16 (cpu, vm, i)
5243 				 ? aarch64_get_vec_u16 (cpu, vn, i)
5244 				 : aarch64_get_vec_u16 (cpu, vm, i));
5245 	  return;
5246 
5247 	case 2:
5248 	  for (i = 0; i < (full ? 4 : 2); i++)
5249 	    aarch64_set_vec_u32 (cpu, vd, i,
5250 				 aarch64_get_vec_u32 (cpu, vn, i)
5251 				 < aarch64_get_vec_u32 (cpu, vm, i)
5252 				 ? aarch64_get_vec_u32 (cpu, vn, i)
5253 				 : aarch64_get_vec_u32 (cpu, vm, i));
5254 	  return;
5255 
5256 	case 3:
5257 	  HALT_UNALLOC;
5258 	}
5259     }
5260   else
5261     {
5262       switch (INSTR (23, 22))
5263 	{
5264 	case 0:
5265 	  for (i = 0; i < (full ? 16 : 8); i++)
5266 	    aarch64_set_vec_s8 (cpu, vd, i,
5267 				aarch64_get_vec_s8 (cpu, vn, i)
5268 				< aarch64_get_vec_s8 (cpu, vm, i)
5269 				? aarch64_get_vec_s8 (cpu, vn, i)
5270 				: aarch64_get_vec_s8 (cpu, vm, i));
5271 	  return;
5272 
5273 	case 1:
5274 	  for (i = 0; i < (full ? 8 : 4); i++)
5275 	    aarch64_set_vec_s16 (cpu, vd, i,
5276 				 aarch64_get_vec_s16 (cpu, vn, i)
5277 				 < aarch64_get_vec_s16 (cpu, vm, i)
5278 				 ? aarch64_get_vec_s16 (cpu, vn, i)
5279 				 : aarch64_get_vec_s16 (cpu, vm, i));
5280 	  return;
5281 
5282 	case 2:
5283 	  for (i = 0; i < (full ? 4 : 2); i++)
5284 	    aarch64_set_vec_s32 (cpu, vd, i,
5285 				 aarch64_get_vec_s32 (cpu, vn, i)
5286 				 < aarch64_get_vec_s32 (cpu, vm, i)
5287 				 ? aarch64_get_vec_s32 (cpu, vn, i)
5288 				 : aarch64_get_vec_s32 (cpu, vm, i));
5289 	  return;
5290 
5291 	case 3:
5292 	  HALT_UNALLOC;
5293 	}
5294     }
5295 }
5296 
5297 static void
5298 do_vec_sub_long (sim_cpu *cpu)
5299 {
5300   /* instr[31]    = 0
5301      instr[30]    = lower (0) / upper (1)
5302      instr[29]    = signed (0) / unsigned (1)
5303      instr[28,24] = 0 1110
5304      instr[23,22] = size: bytes (00), half (01), word (10)
5305      instr[21]    = 1
5306      instr[20,16] = Vm
5307      instr[15,10] = 0010 00
5308      instr[9,5]   = Vn
5309      instr[4,0]   = V dest.  */
5310 
5311   unsigned size = INSTR (23, 22);
5312   unsigned vm = INSTR (20, 16);
5313   unsigned vn = INSTR (9, 5);
5314   unsigned vd = INSTR (4, 0);
5315   unsigned bias = 0;
5316   unsigned i;
5317 
5318   NYI_assert (28, 24, 0x0E);
5319   NYI_assert (21, 21, 1);
5320   NYI_assert (15, 10, 0x08);
5321 
5322   if (size == 3)
5323     HALT_UNALLOC;
5324 
5325   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5326   switch (INSTR (30, 29))
5327     {
5328     case 2: /* SSUBL2.  */
5329       bias = 2;
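      /* Fall through.  */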
5330     case 0: /* SSUBL.  */
5331       switch (size)
5332 	{
5333 	case 0:
	  /* The upper half of the byte vector is lanes 8-15,
	     so scale the bias up to 8.  */
5334 	  bias *= 4;
5335 	  for (i = 0; i < 8; i++)
5336 	    aarch64_set_vec_s16 (cpu, vd, i,
5337 				 aarch64_get_vec_s8 (cpu, vn, i + bias)
5338 				 - aarch64_get_vec_s8 (cpu, vm, i + bias));
5339 	  break;
5340 
5341 	case 1:
5342 	  bias *= 2;
5343 	  for (i = 0; i < 4; i++)
5344 	    aarch64_set_vec_s32 (cpu, vd, i,
5345 				 aarch64_get_vec_s16 (cpu, vn, i + bias)
5346 				 - aarch64_get_vec_s16 (cpu, vm, i + bias));
5347 	  break;
5348 
5349 	case 2:
5350 	  for (i = 0; i < 2; i++)
5351 	    aarch64_set_vec_s64 (cpu, vd, i,
5352 				 aarch64_get_vec_s32 (cpu, vn, i + bias)
5353 				 - aarch64_get_vec_s32 (cpu, vm, i + bias));
5354 	  break;
5355 
5356 	default:
5357 	  HALT_UNALLOC;
5358 	}
5359       break;
5360 
5361     case 3: /* USUBL2.  */
5362       bias = 2;
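      /* Fall through.  */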
5363     case 1: /* USUBL.  */
5364       switch (size)
5365 	{
5366 	case 0:
	  /* The upper half of the byte vector is lanes 8-15,
	     so scale the bias up to 8.  */
5367 	  bias *= 4;
5368 	  for (i = 0; i < 8; i++)
5369 	    aarch64_set_vec_u16 (cpu, vd, i,
5370 				 aarch64_get_vec_u8 (cpu, vn, i + bias)
5371 				 - aarch64_get_vec_u8 (cpu, vm, i + bias));
5372 	  break;
5373 
5374 	case 1:
5375 	  bias *= 2;
5376 	  for (i = 0; i < 4; i++)
5377 	    aarch64_set_vec_u32 (cpu, vd, i,
5378 				 aarch64_get_vec_u16 (cpu, vn, i + bias)
5379 				 - aarch64_get_vec_u16 (cpu, vm, i + bias));
5380 	  break;
5381 
5382 	case 2:
5383 	  for (i = 0; i < 2; i++)
5384 	    aarch64_set_vec_u64 (cpu, vd, i,
5385 				 aarch64_get_vec_u32 (cpu, vn, i + bias)
5386 				 - aarch64_get_vec_u32 (cpu, vm, i + bias));
5387 	  break;
5388 
5389 	default:
5390 	  HALT_UNALLOC;
5391 	}
5392       break;
5393     }
5394 }
5395 
5396 static void
5397 do_vec_ADDP (sim_cpu *cpu)
5398 {
5399   /* instr[31]    = 0
5400      instr[30]    = half(0)/full(1)
5401      instr[29,24] = 00 1110
5402      instr[23,22] = size: bytes (00), half (01), word (10), long (11)
5403      instr[21]    = 1
5404      instr[20,16] = Vm
5405      instr[15,10] = 1011 11
5406      instr[9,5]   = Vn
5407      instr[4,0]   = V dest.  */
5408 
5409   FRegister copy_vn;
5410   FRegister copy_vm;
5411   unsigned full = INSTR (30, 30);
5412   unsigned size = INSTR (23, 22);
5413   unsigned vm = INSTR (20, 16);
5414   unsigned vn = INSTR (9, 5);
5415   unsigned vd = INSTR (4, 0);
5416   unsigned i, range;
5417 
5418   NYI_assert (29, 24, 0x0E);
5419   NYI_assert (21, 21, 1);
5420   NYI_assert (15, 10, 0x2F);
5421 
5422   /* Make copies of the source registers in case vd == vn/vm.  */
5423   copy_vn = cpu->fr[vn];
5424   copy_vm = cpu->fr[vm];
5425 
5426   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5427   switch (size)
5428     {
5429     case 0:
5430       range = full ? 8 : 4;
5431       for (i = 0; i < range; i++)
5432 	{
5433 	  aarch64_set_vec_u8 (cpu, vd, i,
5434 			      copy_vn.b[i * 2] + copy_vn.b[i * 2 + 1]);
5435 	  aarch64_set_vec_u8 (cpu, vd, i + range,
5436 			      copy_vm.b[i * 2] + copy_vm.b[i * 2 + 1]);
5437 	}
5438       return;
5439 
5440     case 1:
5441       range = full ? 4 : 2;
5442       for (i = 0; i < range; i++)
5443 	{
5444 	  aarch64_set_vec_u16 (cpu, vd, i,
5445 			       copy_vn.h[i * 2] + copy_vn.h[i * 2 + 1]);
5446 	  aarch64_set_vec_u16 (cpu, vd, i + range,
5447 			       copy_vm.h[i * 2] + copy_vm.h[i * 2 + 1]);
5448 	}
5449       return;
5450 
5451     case 2:
5452       range = full ? 2 : 1;
5453       for (i = 0; i < range; i++)
5454 	{
5455 	  aarch64_set_vec_u32 (cpu, vd, i,
5456 			       copy_vn.w[i * 2] + copy_vn.w[i * 2 + 1]);
5457 	  aarch64_set_vec_u32 (cpu, vd, i + range,
5458 			       copy_vm.w[i * 2] + copy_vm.w[i * 2 + 1]);
5459 	}
5460       return;
5461 
5462     case 3:
5463       if (! full)
5464 	HALT_UNALLOC;
5465       aarch64_set_vec_u64 (cpu, vd, 0, copy_vn.v[0] + copy_vn.v[1]);
5466       aarch64_set_vec_u64 (cpu, vd, 1, copy_vm.v[0] + copy_vm.v[1]);
5467       return;
5468     }
5469 }
5470 
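/* ADDP concatenates the pairwise sums of Vn with the pairwise sums of
   Vm, which is why the code above copies both sources first.  A
   standalone sketch of the full 16-byte form, with the lanes modelled
   as plain arrays (illustration only):  */
#if 0
#include <stdint.h>

static void
addp_bytes (const uint8_t *vn, const uint8_t *vm, uint8_t *vd)
{
  int i;

  for (i = 0; i < 8; i++)
    {
      /* Lower half: adjacent pairs of Vn; upper half: pairs of Vm.  */
      vd[i]     = (uint8_t) (vn[2 * i] + vn[2 * i + 1]);
      vd[i + 8] = (uint8_t) (vm[2 * i] + vm[2 * i + 1]);
    }
}
#endif
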
5471 /* Floating point vector convert to longer (higher precision).  */
5472 static void
5473 do_vec_FCVTL (sim_cpu *cpu)
5474 {
5475   /* instr[31]    = 0
5476      instr[30]    = half (0) / all (1)
5477      instr[29,23] = 00 1110 0
5478      instr[22]    = single (0) / double (1)
5479      instr[21,10] = 10 0001 0111 10
5480      instr[9,5]   = Rn
5481      instr[4,0]   = Rd.  */
5482 
5483   unsigned rn = INSTR (9, 5);
5484   unsigned rd = INSTR (4, 0);
5485   unsigned full = INSTR (30, 30);
5486   unsigned i;
5487 
5488   NYI_assert (31, 31, 0);
5489   NYI_assert (29, 23, 0x1C);
5490   NYI_assert (21, 10, 0x85E);
5491 
5492   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5493   if (INSTR (22, 22))
5494     {
      float tmp[2];

      /* Read both source lanes before writing, in case Rd == Rn.  */
      for (i = 0; i < 2; i++)
	tmp[i] = aarch64_get_vec_float (cpu, rn, i + 2 * full);
      for (i = 0; i < 2; i++)
	aarch64_set_vec_double (cpu, rd, i, tmp[i]);
5498     }
5499   else
5500     {
5501       HALT_NYI;
5502 
5503 #if 0
5504       /* TODO: Implement missing half-float support.  */
5505       for (i = 0; i < 4; i++)
5506 	aarch64_set_vec_float (cpu, rd, i,
5507 			     aarch64_get_vec_halffloat (cpu, rn, i + 4*full));
5508 #endif
5509     }
5510 }
5511 
5512 static void
5513 do_vec_FABS (sim_cpu *cpu)
5514 {
5515   /* instr[31]    = 0
5516      instr[30]    = half(0)/full(1)
5517      instr[29,23] = 00 1110 1
5518      instr[22]    = float(0)/double(1)
5519      instr[21,16] = 10 0000
5520      instr[15,10] = 1111 10
5521      instr[9,5]   = Vn
5522      instr[4,0]   = Vd.  */
5523 
5524   unsigned vn = INSTR (9, 5);
5525   unsigned vd = INSTR (4, 0);
5526   unsigned full = INSTR (30, 30);
5527   unsigned i;
5528 
5529   NYI_assert (29, 23, 0x1D);
5530   NYI_assert (21, 10, 0x83E);
5531 
5532   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5533   if (INSTR (22, 22))
5534     {
5535       if (! full)
5536 	HALT_NYI;
5537 
5538       for (i = 0; i < 2; i++)
5539 	aarch64_set_vec_double (cpu, vd, i,
5540 				fabs (aarch64_get_vec_double (cpu, vn, i)));
5541     }
5542   else
5543     {
5544       for (i = 0; i < (full ? 4 : 2); i++)
5545 	aarch64_set_vec_float (cpu, vd, i,
5546 			       fabsf (aarch64_get_vec_float (cpu, vn, i)));
5547     }
5548 }
5549 
5550 static void
5551 do_vec_FCVTZS (sim_cpu *cpu)
5552 {
5553   /* instr[31]    = 0
5554      instr[30]    = half (0) / all (1)
5555      instr[29,23] = 00 1110 1
5556      instr[22]    = single (0) / double (1)
5557      instr[21,10] = 10 0001 1011 10
5558      instr[9,5]   = Rn
5559      instr[4,0]   = Rd.  */
5560 
5561   unsigned rn = INSTR (9, 5);
5562   unsigned rd = INSTR (4, 0);
5563   unsigned full = INSTR (30, 30);
5564   unsigned i;
5565 
5566   NYI_assert (31, 31, 0);
5567   NYI_assert (29, 23, 0x1D);
5568   NYI_assert (21, 10, 0x86E);
5569 
5570   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5571   if (INSTR (22, 22))
5572     {
5573       if (! full)
5574 	HALT_UNALLOC;
5575 
5576       for (i = 0; i < 2; i++)
5577 	aarch64_set_vec_s64 (cpu, rd, i,
5578 			     (int64_t) aarch64_get_vec_double (cpu, rn, i));
5579     }
5580   else
5581     for (i = 0; i < (full ? 4 : 2); i++)
5582       aarch64_set_vec_s32 (cpu, rd, i,
5583 			   (int32_t) aarch64_get_vec_float (cpu, rn, i));
5584 }
5585 
5586 static void
5587 do_vec_REV64 (sim_cpu *cpu)
5588 {
5589   /* instr[31]    = 0
5590 	     instr[30]    = half (0)/full (1)
5591      instr[29,24] = 00 1110
5592      instr[23,22] = size
5593      instr[21,10] = 10 0000 0000 10
5594      instr[9,5]   = Rn
5595      instr[4,0]   = Rd.  */
5596 
5597   unsigned rn = INSTR (9, 5);
5598   unsigned rd = INSTR (4, 0);
5599   unsigned size = INSTR (23, 22);
5600   unsigned full = INSTR (30, 30);
5601   unsigned i;
5602   FRegister val;
5603 
5604   NYI_assert (29, 24, 0x0E);
5605   NYI_assert (21, 10, 0x802);
5606 
5607   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
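  /* Flipping the low bits of the element index reverses the element
     order within each 64-bit doubleword.  */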
5608   switch (size)
5609     {
5610     case 0:
5611       for (i = 0; i < (full ? 16 : 8); i++)
5612 	val.b[i ^ 0x7] = aarch64_get_vec_u8 (cpu, rn, i);
5613       break;
5614 
5615     case 1:
5616       for (i = 0; i < (full ? 8 : 4); i++)
5617 	val.h[i ^ 0x3] = aarch64_get_vec_u16 (cpu, rn, i);
5618       break;
5619 
5620     case 2:
5621       for (i = 0; i < (full ? 4 : 2); i++)
5622 	val.w[i ^ 0x1] = aarch64_get_vec_u32 (cpu, rn, i);
5623       break;
5624 
5625     case 3:
5626       HALT_UNALLOC;
5627     }
5628 
5629   aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
5630   if (full)
5631     aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
5632 }
5633 
5634 static void
5635 do_vec_REV16 (sim_cpu *cpu)
5636 {
5637   /* instr[31]    = 0
5638 	     instr[30]    = half (0)/full (1)
5639      instr[29,24] = 00 1110
5640      instr[23,22] = size
5641      instr[21,10] = 10 0000 0001 10
5642      instr[9,5]   = Rn
5643      instr[4,0]   = Rd.  */
5644 
5645   unsigned rn = INSTR (9, 5);
5646   unsigned rd = INSTR (4, 0);
5647   unsigned size = INSTR (23, 22);
5648   unsigned full = INSTR (30, 30);
5649   unsigned i;
5650   FRegister val;
5651 
5652   NYI_assert (29, 24, 0x0E);
5653   NYI_assert (21, 10, 0x806);
5654 
5655   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
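  /* XORing the index with 1 swaps adjacent bytes, reversing each
     16-bit halfword.  */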
5656   switch (size)
5657     {
5658     case 0:
5659       for (i = 0; i < (full ? 16 : 8); i++)
5660 	val.b[i ^ 0x1] = aarch64_get_vec_u8 (cpu, rn, i);
5661       break;
5662 
5663     default:
5664       HALT_UNALLOC;
5665     }
5666 
5667   aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
5668   if (full)
5669     aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
5670 }
5671 
5672 static void
5673 do_vec_op1 (sim_cpu *cpu)
5674 {
5675   /* instr[31]    = 0
5676      instr[30]    = half/full
5677      instr[29,24] = 00 1110
5678      instr[23,21] = ???
5679      instr[20,16] = Vm
5680      instr[15,10] = sub-opcode
5681      instr[9,5]   = Vn
5682      instr[4,0]   = Vd  */
5683   NYI_assert (29, 24, 0x0E);
5684 
5685   if (INSTR (21, 21) == 0)
5686     {
5687       if (INSTR (23, 22) == 0)
5688 	{
5689 	  if (INSTR (30, 30) == 1
5690 	      && INSTR (17, 14) == 0
5691 	      && INSTR (12, 10) == 7)
5692 	    return do_vec_ins_2 (cpu);
5693 
5694 	  switch (INSTR (15, 10))
5695 	    {
5696 	    case 0x01: do_vec_DUP_vector_into_vector (cpu); return;
5697 	    case 0x03: do_vec_DUP_scalar_into_vector (cpu); return;
5698 	    case 0x07: do_vec_INS (cpu); return;
5699 	    case 0x0B: do_vec_SMOV_into_scalar (cpu); return;
5700 	    case 0x0F: do_vec_UMOV_into_scalar (cpu); return;
5701 
5702 	    case 0x00:
5703 	    case 0x08:
5704 	    case 0x10:
5705 	    case 0x18:
5706 	      do_vec_TBL (cpu); return;
5707 
5708 	    case 0x06:
5709 	    case 0x16:
5710 	      do_vec_UZP (cpu); return;
5711 
5712 	    case 0x0A: do_vec_TRN (cpu); return;
5713 
5714 	    case 0x0E:
5715 	    case 0x1E:
5716 	      do_vec_ZIP (cpu); return;
5717 
5718 	    default:
5719 	      HALT_NYI;
5720 	    }
5721 	}
5722 
5723       switch (INSTR (13, 10))
5724 	{
5725 	case 0x6: do_vec_UZP (cpu); return;
5726 	case 0xE: do_vec_ZIP (cpu); return;
5727 	case 0xA: do_vec_TRN (cpu); return;
5728 	default:  HALT_NYI;
5729 	}
5730     }
5731 
5732   switch (INSTR (15, 10))
5733     {
5734     case 0x02: do_vec_REV64 (cpu); return;
5735     case 0x06: do_vec_REV16 (cpu); return;
5736 
5737     case 0x07:
5738       switch (INSTR (23, 21))
5739 	{
5740 	case 1: do_vec_AND (cpu); return;
5741 	case 3: do_vec_BIC (cpu); return;
5742 	case 5: do_vec_ORR (cpu); return;
5743 	case 7: do_vec_ORN (cpu); return;
5744 	default: HALT_NYI;
5745 	}
5746 
5747     case 0x08: do_vec_sub_long (cpu); return;
5748     case 0x0a: do_vec_XTN (cpu); return;
5749     case 0x11: do_vec_SSHL (cpu); return;
5750     case 0x16: do_vec_CNT (cpu); return;
5751     case 0x19: do_vec_max (cpu); return;
5752     case 0x1B: do_vec_min (cpu); return;
5753     case 0x21: do_vec_add (cpu); return;
5754     case 0x25: do_vec_MLA (cpu); return;
5755     case 0x27: do_vec_mul (cpu); return;
5756     case 0x2F: do_vec_ADDP (cpu); return;
5757     case 0x30: do_vec_mull (cpu); return;
5758     case 0x33: do_vec_FMLA (cpu); return;
5759     case 0x35: do_vec_fadd (cpu); return;
5760 
5761     case 0x1E:
5762       switch (INSTR (20, 16))
5763 	{
5764 	case 0x01: do_vec_FCVTL (cpu); return;
5765 	default: HALT_NYI;
5766 	}
5767 
5768     case 0x2E:
5769       switch (INSTR (20, 16))
5770 	{
5771 	case 0x00: do_vec_ABS (cpu); return;
5772 	case 0x01: do_vec_FCVTZS (cpu); return;
5773 	case 0x11: do_vec_ADDV (cpu); return;
5774 	default: HALT_NYI;
5775 	}
5776 
5777     case 0x31:
5778     case 0x3B:
5779       do_vec_Fminmax (cpu); return;
5780 
5781     case 0x0D:
5782     case 0x0F:
5783     case 0x22:
5784     case 0x23:
5785     case 0x26:
5786     case 0x2A:
5787     case 0x32:
5788     case 0x36:
5789     case 0x39:
5790     case 0x3A:
5791       do_vec_compare (cpu); return;
5792 
5793     case 0x3E:
5794       do_vec_FABS (cpu); return;
5795 
5796     default:
5797       HALT_NYI;
5798     }
5799 }
5800 
5801 static void
5802 do_vec_xtl (sim_cpu *cpu)
5803 {
5804   /* instr[31]    = 0
5805      instr[30,29] = SXTL (00), UXTL (01), SXTL2 (10), UXTL2 (11)
5806      instr[28,22] = 0 1111 00
5807      instr[21,16] = size & shift (USHLL, SSHLL, USHLL2, SSHLL2)
5808      instr[15,10] = 1010 01
5809      instr[9,5]   = V source
5810      instr[4,0]   = V dest.  */
5811 
5812   unsigned vs = INSTR (9, 5);
5813   unsigned vd = INSTR (4, 0);
5814   unsigned i, shift, bias = 0;
5815 
5816   NYI_assert (28, 22, 0x3C);
5817   NYI_assert (15, 10, 0x29);
5818 
5819   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5820   switch (INSTR (30, 29))
5821     {
5822     case 2: /* SXTL2, SSHLL2.  */
5823       bias = 2;
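      /* Fall through.  */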
5824     case 0: /* SXTL, SSHLL.  */
5825       if (INSTR (21, 21))
5826 	{
5827 	  int64_t val1, val2;
5828 
5829 	  shift = INSTR (20, 16);
5830 	  /* Get the source values before setting the destination values
5831 	     in case the source and destination are the same.  */
5832 	  val1 = (int64_t) aarch64_get_vec_s32 (cpu, vs, bias) << shift;
5833 	  val2 = (int64_t) aarch64_get_vec_s32 (cpu, vs, bias + 1) << shift;
5834 	  aarch64_set_vec_s64 (cpu, vd, 0, val1);
5835 	  aarch64_set_vec_s64 (cpu, vd, 1, val2);
5836 	}
5837       else if (INSTR (20, 20))
5838 	{
5839 	  int32_t v[4];
5841 
5842 	  shift = INSTR (19, 16);
5843 	  bias *= 2;
5844 	  for (i = 0; i < 4; i++)
5845 	    v[i] = aarch64_get_vec_s16 (cpu, vs, bias + i) << shift;
5846 	  for (i = 0; i < 4; i++)
5847 	    aarch64_set_vec_s32 (cpu, vd, i, v[i]);
5848 	}
5849       else
5850 	{
5851 	  int16_t v[8];
5852 	  NYI_assert (19, 19, 1);
5853 
5854 	  shift = INSTR (18, 16);
5855 	  bias *= 4;
5856 	  for (i = 0; i < 8; i++)
5857 	    v[i] = aarch64_get_vec_s8 (cpu, vs, i + bias) << shift;
5858 	  for (i = 0; i < 8; i++)
5859 	    aarch64_set_vec_s16 (cpu, vd, i, v[i]);
5860 	}
5861       return;
5862 
5863     case 3: /* UXTL2, USHLL2.  */
5864       bias = 2;
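      /* Fall through.  */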
5865     case 1: /* UXTL, USHLL.  */
5866       if (INSTR (21, 21))
5867 	{
5868 	  uint64_t v1, v2;
5869 	  shift = INSTR (20, 16);
5870 	  v1 = (uint64_t) aarch64_get_vec_u32 (cpu, vs, bias) << shift;
5871 	  v2 = (uint64_t) aarch64_get_vec_u32 (cpu, vs, bias + 1) << shift;
5872 	  aarch64_set_vec_u64 (cpu, vd, 0, v1);
5873 	  aarch64_set_vec_u64 (cpu, vd, 1, v2);
5874 	}
5875       else if (INSTR (20, 20))
5876 	{
5877 	  uint32_t v[4];
5878 	  shift = INSTR (19, 16);
5879 	  bias *= 2;
5880 	  for (i = 0; i < 4; i++)
5881 	    v[i] = aarch64_get_vec_u16 (cpu, vs, i + bias) << shift;
5882 	  for (i = 0; i < 4; i++)
5883 	    aarch64_set_vec_u32 (cpu, vd, i, v[i]);
5884 	}
5885       else
5886 	{
5887 	  uint16_t v[8];
5888 	  NYI_assert (19, 19, 1);
5889 
5890 	  shift = INSTR (18, 16);
5891 	  bias *= 4;
5892 	  for (i = 0; i < 8; i++)
5893 	    v[i] = aarch64_get_vec_u8 (cpu, vs, i + bias) << shift;
5894 	  for (i = 0; i < 8; i++)
5895 	    aarch64_set_vec_u16 (cpu, vd, i, v[i]);
5896 	}
5897       return;
5898     }
5899 }
5900 
5901 static void
5902 do_vec_SHL (sim_cpu *cpu)
5903 {
5904   /* instr [31]    = 0
5905      instr [30]    = half(0)/full(1)
5906      instr [29,23] = 001 1110
5907      instr [22,16] = size and shift amount
5908      instr [15,10] = 01 0101
5909      instr [9, 5]  = Vs
5910      instr [4, 0]  = Vd.  */
5911 
5912   int shift;
5913   int full    = INSTR (30, 30);
5914   unsigned vs = INSTR (9, 5);
5915   unsigned vd = INSTR (4, 0);
5916   unsigned i;
5917 
5918   NYI_assert (29, 23, 0x1E);
5919   NYI_assert (15, 10, 0x15);
5920 
5921   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
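  /* The highest set bit in instr [22,19] selects the element size;
     the bits below it hold the left shift amount, immh:immb - esize.  */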
5922   if (INSTR (22, 22))
5923     {
5924       shift = INSTR (21, 16);
5925 
5926       if (full == 0)
5927 	HALT_UNALLOC;
5928 
5929       for (i = 0; i < 2; i++)
5930 	{
5931 	  uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
5932 	  aarch64_set_vec_u64 (cpu, vd, i, val << shift);
5933 	}
5934 
5935       return;
5936     }
5937 
5938   if (INSTR (21, 21))
5939     {
5940       shift = INSTR (20, 16);
5941 
5942       for (i = 0; i < (full ? 4 : 2); i++)
5943 	{
5944 	  uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
5945 	  aarch64_set_vec_u32 (cpu, vd, i, val << shift);
5946 	}
5947 
5948       return;
5949     }
5950 
5951   if (INSTR (20, 20))
5952     {
5953       shift = INSTR (19, 16);
5954 
5955       for (i = 0; i < (full ? 8 : 4); i++)
5956 	{
5957 	  uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
5958 	  aarch64_set_vec_u16 (cpu, vd, i, val << shift);
5959 	}
5960 
5961       return;
5962     }
5963 
5964   if (INSTR (19, 19) == 0)
5965     HALT_UNALLOC;
5966 
5967   shift = INSTR (18, 16);
5968 
5969   for (i = 0; i < (full ? 16 : 8); i++)
5970     {
5971       uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
5972       aarch64_set_vec_u8 (cpu, vd, i, val << shift);
5973     }
5974 }
5975 
5976 static void
5977 do_vec_SSHR_USHR (sim_cpu *cpu)
5978 {
5979   /* instr [31]    = 0
5980      instr [30]    = half(0)/full(1)
5981      instr [29]    = signed(0)/unsigned(1)
5982      instr [28,23] = 0 1111 0
5983      instr [22,16] = size and shift amount
5984      instr [15,10] = 0000 01
5985      instr [9, 5]  = Vs
5986      instr [4, 0]  = Vd.  */
5987 
5988   int full       = INSTR (30, 30);
5989   int sign       = ! INSTR (29, 29);
5990   unsigned shift = INSTR (22, 16);
5991   unsigned vs    = INSTR (9, 5);
5992   unsigned vd    = INSTR (4, 0);
5993   unsigned i;
5994 
5995   NYI_assert (28, 23, 0x1E);
5996   NYI_assert (15, 10, 0x01);
5997 
5998   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
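  /* The shift amount decodes as (2 * esize) - immh:immb, with the
     element size selected by the highest set bit in instr [22,19].  */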
5999   if (INSTR (22, 22))
6000     {
6001       shift = 128 - shift;
6002 
6003       if (full == 0)
6004 	HALT_UNALLOC;
6005 
6006       if (sign)
6007 	for (i = 0; i < 2; i++)
6008 	  {
6009 	    int64_t val = aarch64_get_vec_s64 (cpu, vs, i);
6010 	    aarch64_set_vec_s64 (cpu, vd, i, val >> shift);
6011 	  }
6012       else
6013 	for (i = 0; i < 2; i++)
6014 	  {
6015 	    uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
6016 	    aarch64_set_vec_u64 (cpu, vd, i, val >> shift);
6017 	  }
6018 
6019       return;
6020     }
6021 
6022   if (INSTR (21, 21))
6023     {
6024       shift = 64 - shift;
6025 
6026       if (sign)
6027 	for (i = 0; i < (full ? 4 : 2); i++)
6028 	  {
6029 	    int32_t val = aarch64_get_vec_s32 (cpu, vs, i);
6030 	    aarch64_set_vec_s32 (cpu, vd, i, val >> shift);
6031 	  }
6032       else
6033 	for (i = 0; i < (full ? 4 : 2); i++)
6034 	  {
6035 	    uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
6036 	    aarch64_set_vec_u32 (cpu, vd, i, val >> shift);
6037 	  }
6038 
6039       return;
6040     }
6041 
6042   if (INSTR (20, 20))
6043     {
6044       shift = 32 - shift;
6045 
6046       if (sign)
6047 	for (i = 0; i < (full ? 8 : 4); i++)
6048 	  {
6049 	    int16_t val = aarch64_get_vec_s16 (cpu, vs, i);
6050 	    aarch64_set_vec_s16 (cpu, vd, i, val >> shift);
6051 	  }
6052       else
6053 	for (i = 0; i < (full ? 8 : 4); i++)
6054 	  {
6055 	    uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
6056 	    aarch64_set_vec_u16 (cpu, vd, i, val >> shift);
6057 	  }
6058 
6059       return;
6060     }
6061 
6062   if (INSTR (19, 19) == 0)
6063     HALT_UNALLOC;
6064 
6065   shift = 16 - shift;
6066 
6067   if (sign)
6068     for (i = 0; i < (full ? 16 : 8); i++)
6069       {
6070 	int8_t val = aarch64_get_vec_s8 (cpu, vs, i);
6071 	aarch64_set_vec_s8 (cpu, vd, i, val >> shift);
6072       }
6073   else
6074     for (i = 0; i < (full ? 16 : 8); i++)
6075       {
6076 	uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
6077 	aarch64_set_vec_u8 (cpu, vd, i, val >> shift);
6078       }
6079 }
6080 
6081 static void
6082 do_vec_MUL_by_element (sim_cpu *cpu)
6083 {
6084   /* instr[31]    = 0
6085      instr[30]    = half/full
6086      instr[29,24] = 00 1111
6087      instr[23,22] = size
6088      instr[21]    = L
6089      instr[20]    = M
6090      instr[19,16] = m
6091      instr[15,12] = 1000
6092      instr[11]    = H
6093      instr[10]    = 0
6094      instr[9,5]   = Vn
6095      instr[4,0]   = Vd  */
6096 
6097   unsigned full     = INSTR (30, 30);
6098   unsigned L        = INSTR (21, 21);
6099   unsigned H        = INSTR (11, 11);
6100   unsigned vn       = INSTR (9, 5);
6101   unsigned vd       = INSTR (4, 0);
6102   unsigned size     = INSTR (23, 22);
6103   unsigned index;
6104   unsigned vm;
6105   unsigned e;
6106 
6107   NYI_assert (29, 24, 0x0F);
6108   NYI_assert (15, 12, 0x8);
6109   NYI_assert (10, 10, 0);
6110 
6111   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
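  /* The element index is H:L:M for 16-bit elements (only V0-V15 are
     addressable as Vm) and H:L for 32-bit elements.  */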
6112   switch (size)
6113     {
6114     case 1:
6115       {
6116 	/* 16 bit products.  */
6117 	uint16_t product;
6118 	uint16_t element1;
6119 	uint16_t element2;
6120 
6121 	index = (H << 2) | (L << 1) | INSTR (20, 20);
6122 	vm = INSTR (19, 16);
6123 	element2 = aarch64_get_vec_u16 (cpu, vm, index);
6124 
6125 	for (e = 0; e < (full ? 8 : 4); e ++)
6126 	  {
6127 	    element1 = aarch64_get_vec_u16 (cpu, vn, e);
6128 	    product  = element1 * element2;
6129 	    aarch64_set_vec_u16 (cpu, vd, e, product);
6130 	  }
6131       }
6132       break;
6133 
6134     case 2:
6135       {
6136 	/* 32 bit products.  */
6137 	uint32_t product;
6138 	uint32_t element1;
6139 	uint32_t element2;
6140 
6141 	index = (H << 1) | L;
6142 	vm = INSTR (20, 16);
6143 	element2 = aarch64_get_vec_u32 (cpu, vm, index);
6144 
6145 	for (e = 0; e < (full ? 4 : 2); e ++)
6146 	  {
6147 	    element1 = aarch64_get_vec_u32 (cpu, vn, e);
6148 	    product  = element1 * element2;
6149 	    aarch64_set_vec_u32 (cpu, vd, e, product);
6150 	  }
6151       }
6152       break;
6153 
6154     default:
6155       HALT_UNALLOC;
6156     }
6157 }
6158 
6159 static void
6160 do_FMLA_by_element (sim_cpu *cpu)
6161 {
6162   /* instr[31]    = 0
6163      instr[30]    = half/full
6164      instr[29,23] = 00 1111 1
6165      instr[22]    = size
6166      instr[21]    = L
6167      instr[20,16] = m
6168      instr[15,12] = 0001
6169      instr[11]    = H
6170      instr[10]    = 0
6171      instr[9,5]   = Vn
6172      instr[4,0]   = Vd  */
6173 
6174   unsigned full     = INSTR (30, 30);
6175   unsigned size     = INSTR (22, 22);
6176   unsigned L        = INSTR (21, 21);
6177   unsigned vm       = INSTR (20, 16);
6178   unsigned H        = INSTR (11, 11);
6179   unsigned vn       = INSTR (9, 5);
6180   unsigned vd       = INSTR (4, 0);
6181   unsigned e;
6182 
6183   NYI_assert (29, 23, 0x1F);
6184   NYI_assert (15, 12, 0x1);
6185   NYI_assert (10, 10, 0);
6186 
6187   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6188   if (size)
6189     {
6190       double element1, element2;
6191 
6192       if (! full || L)
6193 	HALT_UNALLOC;
6194 
6195       element2 = aarch64_get_vec_double (cpu, vm, H);
6196 
6197       for (e = 0; e < 2; e++)
6198 	{
6199 	  element1 = aarch64_get_vec_double (cpu, vn, e);
6200 	  element1 *= element2;
6201 	  element1 += aarch64_get_vec_double (cpu, vd, e);
6202 	  aarch64_set_vec_double (cpu, vd, e, element1);
6203 	}
6204     }
6205   else
6206     {
6207       float element1;
6208       float element2 = aarch64_get_vec_float (cpu, vm, (H << 1) | L);
6209 
6210       for (e = 0; e < (full ? 4 : 2); e++)
6211 	{
6212 	  element1 = aarch64_get_vec_float (cpu, vn, e);
6213 	  element1 *= element2;
6214 	  element1 += aarch64_get_vec_float (cpu, vd, e);
6215 	  aarch64_set_vec_float (cpu, vd, e, element1);
6216 	}
6217     }
6218 }
6219 
6220 static void
6221 do_vec_op2 (sim_cpu *cpu)
6222 {
6223   /* instr[31]    = 0
6224      instr[30]    = half/full
6225      instr[29,24] = 00 1111
6226      instr[23]    = ?
6227      instr[22,16] = element size & index
6228      instr[15,10] = sub-opcode
6229      instr[9,5]   = Vm
6230      instr[4,0]   = Vd  */
6231 
6232   NYI_assert (29, 24, 0x0F);
6233 
6234   if (INSTR (23, 23) != 0)
6235     {
6236       switch (INSTR (15, 10))
6237 	{
6238 	case 0x04:
6239 	case 0x06:
6240 	  do_FMLA_by_element (cpu);
6241 	  return;
6242 
6243 	case 0x20:
6244 	case 0x22:
6245 	  do_vec_MUL_by_element (cpu);
6246 	  return;
6247 
6248 	default:
6249 	  HALT_NYI;
6250 	}
6251     }
6252   else
6253     {
6254       switch (INSTR (15, 10))
6255 	{
6256 	case 0x01: do_vec_SSHR_USHR (cpu); return;
6257 	case 0x15: do_vec_SHL (cpu); return;
6258 	case 0x20:
6259 	case 0x22: do_vec_MUL_by_element (cpu); return;
6260 	case 0x29: do_vec_xtl (cpu); return;
6261 	default:   HALT_NYI;
6262 	}
6263     }
6264 }
6265 
6266 static void
6267 do_vec_neg (sim_cpu *cpu)
6268 {
6269   /* instr[31]    = 0
6270      instr[30]    = full(1)/half(0)
6271      instr[29,24] = 10 1110
6272      instr[23,22] = size: byte(00), half (01), word (10), long (11)
6273      instr[21,10] = 1000 0010 1110
6274      instr[9,5]   = Vs
6275      instr[4,0]   = Vd  */
6276 
6277   int    full = INSTR (30, 30);
6278   unsigned vs = INSTR (9, 5);
6279   unsigned vd = INSTR (4, 0);
6280   unsigned i;
6281 
6282   NYI_assert (29, 24, 0x2E);
6283   NYI_assert (21, 10, 0x82E);
6284 
6285   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6286   switch (INSTR (23, 22))
6287     {
6288     case 0:
6289       for (i = 0; i < (full ? 16 : 8); i++)
6290 	aarch64_set_vec_s8 (cpu, vd, i, - aarch64_get_vec_s8 (cpu, vs, i));
6291       return;
6292 
6293     case 1:
6294       for (i = 0; i < (full ? 8 : 4); i++)
6295 	aarch64_set_vec_s16 (cpu, vd, i, - aarch64_get_vec_s16 (cpu, vs, i));
6296       return;
6297 
6298     case 2:
6299       for (i = 0; i < (full ? 4 : 2); i++)
6300 	aarch64_set_vec_s32 (cpu, vd, i, - aarch64_get_vec_s32 (cpu, vs, i));
6301       return;
6302 
6303     case 3:
6304       if (! full)
6305 	HALT_NYI;
6306       for (i = 0; i < 2; i++)
6307 	aarch64_set_vec_s64 (cpu, vd, i, - aarch64_get_vec_s64 (cpu, vs, i));
6308       return;
6309     }
6310 }
6311 
6312 static void
6313 do_vec_sqrt (sim_cpu *cpu)
6314 {
6315   /* instr[31]    = 0
6316      instr[30]    = full(1)/half(0)
6317      instr[29,23] = 101 1101
6318      instr[22]    = single(0)/double(1)
6319      instr[21,10] = 1000 0111 1110
6320      instr[9,5]   = Vs
6321      instr[4,0]   = Vd.  */
6322 
6323   int    full = INSTR (30, 30);
6324   unsigned vs = INSTR (9, 5);
6325   unsigned vd = INSTR (4, 0);
6326   unsigned i;
6327 
6328   NYI_assert (29, 23, 0x5D);
6329   NYI_assert (21, 10, 0x87E);
6330 
6331   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6332   if (INSTR (22, 22) == 0)
6333     for (i = 0; i < (full ? 4 : 2); i++)
6334       aarch64_set_vec_float (cpu, vd, i,
6335 			     sqrtf (aarch64_get_vec_float (cpu, vs, i)));
6336   else
6337     for (i = 0; i < 2; i++)
6338       aarch64_set_vec_double (cpu, vd, i,
6339 			      sqrt (aarch64_get_vec_double (cpu, vs, i)));
6340 }
6341 
6342 static void
6343 do_vec_mls_indexed (sim_cpu *cpu)
6344 {
6345   /* instr[31]       = 0
6346      instr[30]       = half(0)/full(1)
6347      instr[29,24]    = 10 1111
6348      instr[23,22]    = 16-bit(01)/32-bit(10)
6349      instr[21,20],instr[11] = index (if 16-bit)
6350      instr[21],instr[11]    = index (if 32-bit)
6351      instr[20,16]    = Vm
6352      instr[15,12]    = 0100
6353      instr[11]       = part of index
6354      instr[10]       = 0
6355      instr[9,5]      = Vs
6356      instr[4,0]      = Vd.  */
6357 
6358   int    full = INSTR (30, 30);
6359   unsigned vs = INSTR (9, 5);
6360   unsigned vd = INSTR (4, 0);
6361   unsigned vm = INSTR (20, 16);
6362   unsigned i;
6363 
6364   NYI_assert (15, 12, 4);
6365   NYI_assert (10, 10, 0);
6366 
6367   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6368   switch (INSTR (23, 22))
6369     {
6370     case 1:
6371       {
6372 	unsigned elem;
6373 	uint32_t val;
6374 
6375 	if (vm > 15)
6376 	  HALT_NYI;
6377 
6378 	elem = (INSTR (21, 20) << 1) | INSTR (11, 11);
6379 	val = aarch64_get_vec_u16 (cpu, vm, elem);
6380 
6381 	for (i = 0; i < (full ? 8 : 4); i++)
6382 	  aarch64_set_vec_u32 (cpu, vd, i,
6383 			       aarch64_get_vec_u32 (cpu, vd, i) -
6384 			       (aarch64_get_vec_u32 (cpu, vs, i) * val));
6385 	return;
6386       }
6387 
6388     case 2:
6389       {
6390 	unsigned elem = (INSTR (21, 21) << 1) | INSTR (11, 11);
6391 	uint64_t val = aarch64_get_vec_u32 (cpu, vm, elem);
6392 
6393 	for (i = 0; i < (full ? 4 : 2); i++)
6394 	  aarch64_set_vec_u64 (cpu, vd, i,
6395 			       aarch64_get_vec_u64 (cpu, vd, i) -
6396 			       (aarch64_get_vec_u64 (cpu, vs, i) * val));
6397 	return;
6398       }
6399 
6400     case 0:
6401     case 3:
6402     default:
6403       HALT_NYI;
6404     }
6405 }
6406 
6407 static void
6408 do_vec_SUB (sim_cpu *cpu)
6409 {
6410   /* instr [31]    = 0
6411      instr [30]    = half(0)/full(1)
6412      instr [29,24] = 10 1110
6413      instr [23,22] = size: byte(00), half(01), word (10), long (11)
6414      instr [21]    = 1
6415      instr [20,16] = Vm
6416      instr [15,10] = 10 0001
6417      instr [9, 5]  = Vn
6418      instr [4, 0]  = Vd.  */
6419 
6420   unsigned full = INSTR (30, 30);
6421   unsigned vm = INSTR (20, 16);
6422   unsigned vn = INSTR (9, 5);
6423   unsigned vd = INSTR (4, 0);
6424   unsigned i;
6425 
6426   NYI_assert (29, 24, 0x2E);
6427   NYI_assert (21, 21, 1);
6428   NYI_assert (15, 10, 0x21);
6429 
6430   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6431   switch (INSTR (23, 22))
6432     {
6433     case 0:
6434       for (i = 0; i < (full ? 16 : 8); i++)
6435 	aarch64_set_vec_s8 (cpu, vd, i,
6436 			    aarch64_get_vec_s8 (cpu, vn, i)
6437 			    - aarch64_get_vec_s8 (cpu, vm, i));
6438       return;
6439 
6440     case 1:
6441       for (i = 0; i < (full ? 8 : 4); i++)
6442 	aarch64_set_vec_s16 (cpu, vd, i,
6443 			     aarch64_get_vec_s16 (cpu, vn, i)
6444 			     - aarch64_get_vec_s16 (cpu, vm, i));
6445       return;
6446 
6447     case 2:
6448       for (i = 0; i < (full ? 4 : 2); i++)
6449 	aarch64_set_vec_s32 (cpu, vd, i,
6450 			     aarch64_get_vec_s32 (cpu, vn, i)
6451 			     - aarch64_get_vec_s32 (cpu, vm, i));
6452       return;
6453 
6454     case 3:
6455       if (full == 0)
6456 	HALT_UNALLOC;
6457 
6458       for (i = 0; i < 2; i++)
6459 	aarch64_set_vec_s64 (cpu, vd, i,
6460 			     aarch64_get_vec_s64 (cpu, vn, i)
6461 			     - aarch64_get_vec_s64 (cpu, vm, i));
6462       return;
6463     }
6464 }
6465 
6466 static void
6467 do_vec_MLS (sim_cpu *cpu)
6468 {
6469   /* instr [31]    = 0
6470      instr [30]    = half(0)/full(1)
6471      instr [29,24] = 10 1110
6472      instr [23,22] = size: byte(00), half(01), word (10)
6473      instr [21]    = 1
6474      instr [20,16] = Vm
6475      instr [15,10] = 10 0101
6476      instr [9, 5]  = Vn
6477      instr [4, 0]  = Vd.  */
6478 
6479   unsigned full = INSTR (30, 30);
6480   unsigned vm = INSTR (20, 16);
6481   unsigned vn = INSTR (9, 5);
6482   unsigned vd = INSTR (4, 0);
6483   unsigned i;
6484 
6485   NYI_assert (29, 24, 0x2E);
6486   NYI_assert (21, 21, 1);
6487   NYI_assert (15, 10, 0x25);
6488 
6489   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6490   switch (INSTR (23, 22))
6491     {
6492     case 0:
6493       for (i = 0; i < (full ? 16 : 8); i++)
6494 	aarch64_set_vec_u8 (cpu, vd, i,
6495 			    aarch64_get_vec_u8 (cpu, vd, i)
6496 			    - (aarch64_get_vec_u8 (cpu, vn, i)
6497 			       * aarch64_get_vec_u8 (cpu, vm, i)));
6498       return;
6499 
6500     case 1:
6501       for (i = 0; i < (full ? 8 : 4); i++)
6502 	aarch64_set_vec_u16 (cpu, vd, i,
6503 			     aarch64_get_vec_u16 (cpu, vd, i)
6504 			     - (aarch64_get_vec_u16 (cpu, vn, i)
6505 				* aarch64_get_vec_u16 (cpu, vm, i)));
6506       return;
6507 
6508     case 2:
6509       for (i = 0; i < (full ? 4 : 2); i++)
6510 	aarch64_set_vec_u32 (cpu, vd, i,
6511 			     aarch64_get_vec_u32 (cpu, vd, i)
6512 			     - (aarch64_get_vec_u32 (cpu, vn, i)
6513 				* aarch64_get_vec_u32 (cpu, vm, i)));
6514       return;
6515 
6516     default:
6517       HALT_UNALLOC;
6518     }
6519 }
6520 
6521 static void
6522 do_vec_FDIV (sim_cpu *cpu)
6523 {
6524   /* instr [31]    = 0
6525      instr [30]    = half(0)/full(1)
6526      instr [29,23] = 10 1110 0
6527      instr [22]    = float(0)/double(1)
6528      instr [21]    = 1
6529      instr [20,16] = Vm
6530      instr [15,10] = 1111 11
6531      instr [9, 5]  = Vn
6532      instr [4, 0]  = Vd.  */
6533 
6534   unsigned full = INSTR (30, 30);
6535   unsigned vm = INSTR (20, 16);
6536   unsigned vn = INSTR (9, 5);
6537   unsigned vd = INSTR (4, 0);
6538   unsigned i;
6539 
6540   NYI_assert (29, 23, 0x5C);
6541   NYI_assert (21, 21, 1);
6542   NYI_assert (15, 10, 0x3F);
6543 
6544   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6545   if (INSTR (22, 22))
6546     {
6547       if (! full)
6548 	HALT_UNALLOC;
6549 
6550       for (i = 0; i < 2; i++)
6551 	aarch64_set_vec_double (cpu, vd, i,
6552 				aarch64_get_vec_double (cpu, vn, i)
6553 				/ aarch64_get_vec_double (cpu, vm, i));
6554     }
6555   else
6556     for (i = 0; i < (full ? 4 : 2); i++)
6557       aarch64_set_vec_float (cpu, vd, i,
6558 			     aarch64_get_vec_float (cpu, vn, i)
6559 			     / aarch64_get_vec_float (cpu, vm, i));
6560 }
6561 
6562 static void
6563 do_vec_FMUL (sim_cpu *cpu)
6564 {
6565   /* instr [31]    = 0
6566      instr [30]    = half(0)/full(1)
6567      instr [29,23] = 10 1110 0
6568      instr [22]    = float(0)/double(1)
6569      instr [21]    = 1
6570      instr [20,16] = Vm
6571      instr [15,10] = 1101 11
6572      instr [9, 5]  = Vn
6573      instr [4, 0]  = Vd.  */
6574 
6575   unsigned full = INSTR (30, 30);
6576   unsigned vm = INSTR (20, 16);
6577   unsigned vn = INSTR (9, 5);
6578   unsigned vd = INSTR (4, 0);
6579   unsigned i;
6580 
6581   NYI_assert (29, 23, 0x5C);
6582   NYI_assert (21, 21, 1);
6583   NYI_assert (15, 10, 0x37);
6584 
6585   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6586   if (INSTR (22, 22))
6587     {
6588       if (! full)
6589 	HALT_UNALLOC;
6590 
6591       for (i = 0; i < 2; i++)
6592 	aarch64_set_vec_double (cpu, vd, i,
6593 				aarch64_get_vec_double (cpu, vn, i)
6594 				* aarch64_get_vec_double (cpu, vm, i));
6595     }
6596   else
6597     for (i = 0; i < (full ? 4 : 2); i++)
6598       aarch64_set_vec_float (cpu, vd, i,
6599 			     aarch64_get_vec_float (cpu, vn, i)
6600 			     * aarch64_get_vec_float (cpu, vm, i));
6601 }
6602 
6603 static void
6604 do_vec_FADDP (sim_cpu *cpu)
6605 {
6606   /* instr [31]    = 0
6607      instr [30]    = half(0)/full(1)
6608      instr [29,23] = 10 1110 0
6609      instr [22]    = float(0)/double(1)
6610      instr [21]    = 1
6611      instr [20,16] = Vm
6612      instr [15,10] = 1101 01
6613      instr [9, 5]  = Vn
6614      instr [4, 0]  = Vd.  */
6615 
6616   unsigned full = INSTR (30, 30);
6617   unsigned vm = INSTR (20, 16);
6618   unsigned vn = INSTR (9, 5);
6619   unsigned vd = INSTR (4, 0);
6620 
6621   NYI_assert (29, 23, 0x5C);
6622   NYI_assert (21, 21, 1);
6623   NYI_assert (15, 10, 0x35);
6624 
6625   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6626   if (INSTR (22, 22))
6627     {
6628       /* Extract values before adding them in case vd == vn/vm.  */
6629       double tmp1 = aarch64_get_vec_double (cpu, vn, 0);
6630       double tmp2 = aarch64_get_vec_double (cpu, vn, 1);
6631       double tmp3 = aarch64_get_vec_double (cpu, vm, 0);
6632       double tmp4 = aarch64_get_vec_double (cpu, vm, 1);
6633 
6634       if (! full)
6635 	HALT_UNALLOC;
6636 
6637       aarch64_set_vec_double (cpu, vd, 0, tmp1 + tmp2);
6638       aarch64_set_vec_double (cpu, vd, 1, tmp3 + tmp4);
6639     }
6640   else
6641     {
6642       /* Extract values before adding them in case vd == vn/vm.  */
6643       float tmp1 = aarch64_get_vec_float (cpu, vn, 0);
6644       float tmp2 = aarch64_get_vec_float (cpu, vn, 1);
6645       float tmp5 = aarch64_get_vec_float (cpu, vm, 0);
6646       float tmp6 = aarch64_get_vec_float (cpu, vm, 1);
6647 
6648       if (full)
6649 	{
6650 	  float tmp3 = aarch64_get_vec_float (cpu, vn, 2);
6651 	  float tmp4 = aarch64_get_vec_float (cpu, vn, 3);
6652 	  float tmp7 = aarch64_get_vec_float (cpu, vm, 2);
6653 	  float tmp8 = aarch64_get_vec_float (cpu, vm, 3);
6654 
6655 	  aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2);
6656 	  aarch64_set_vec_float (cpu, vd, 1, tmp3 + tmp4);
6657 	  aarch64_set_vec_float (cpu, vd, 2, tmp5 + tmp6);
6658 	  aarch64_set_vec_float (cpu, vd, 3, tmp7 + tmp8);
6659 	}
6660       else
6661 	{
6662 	  aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2);
6663 	  aarch64_set_vec_float (cpu, vd, 1, tmp5 + tmp6);
6664 	}
6665     }
6666 }
6667 
6668 static void
6669 do_vec_FSQRT (sim_cpu *cpu)
6670 {
6671   /* instr[31]    = 0
6672      instr[30]    = half(0)/full(1)
6673      instr[29,23] = 10 1110 1
6674      instr[22]    = single(0)/double(1)
6675      instr[21,10] = 10 0001 1111 10
6676      instr[9,5]   = Vsrc
6677      instr[4,0]   = Vdest.  */
6678 
6679   unsigned vn = INSTR (9, 5);
6680   unsigned vd = INSTR (4, 0);
6681   unsigned full = INSTR (30, 30);
6682   int i;
6683 
6684   NYI_assert (29, 23, 0x5D);
6685   NYI_assert (21, 10, 0x87E);
6686 
6687   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6688   if (INSTR (22, 22))
6689     {
6690       if (! full)
6691 	HALT_UNALLOC;
6692 
6693       for (i = 0; i < 2; i++)
6694 	aarch64_set_vec_double (cpu, vd, i,
6695 				sqrt (aarch64_get_vec_double (cpu, vn, i)));
6696     }
6697   else
6698     {
6699       for (i = 0; i < (full ? 4 : 2); i++)
6700 	aarch64_set_vec_float (cpu, vd, i,
6701 			       sqrtf (aarch64_get_vec_float (cpu, vn, i)));
6702     }
6703 }
6704 
6705 static void
6706 do_vec_FNEG (sim_cpu *cpu)
6707 {
6708   /* instr[31]    = 0
6709      instr[30]    = half (0)/full (1)
6710      instr[29,23] = 10 1110 1
6711      instr[22]    = single (0)/double (1)
6712      instr[21,10] = 10 0000 1111 10
6713      instr[9,5]   = Vsrc
6714      instr[4,0]   = Vdest.  */
6715 
6716   unsigned vn = INSTR (9, 5);
6717   unsigned vd = INSTR (4, 0);
6718   unsigned full = INSTR (30, 30);
6719   int i;
6720 
6721   NYI_assert (29, 23, 0x5D);
6722   NYI_assert (21, 10, 0x83E);
6723 
6724   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6725   if (INSTR (22, 22))
6726     {
6727       if (! full)
6728 	HALT_UNALLOC;
6729 
6730       for (i = 0; i < 2; i++)
6731 	aarch64_set_vec_double (cpu, vd, i,
6732 				- aarch64_get_vec_double (cpu, vn, i));
6733     }
6734   else
6735     {
6736       for (i = 0; i < (full ? 4 : 2); i++)
6737 	aarch64_set_vec_float (cpu, vd, i,
6738 			       - aarch64_get_vec_float (cpu, vn, i));
6739     }
6740 }
6741 
6742 static void
6743 do_vec_NOT (sim_cpu *cpu)
6744 {
6745   /* instr[31]    = 0
6746      instr[30]    = half (0)/full (1)
6747      instr[29,10] = 10 1110 0010 0000 0101 10
6748      instr[9,5]   = Vn
6749      instr[4,0]   = Vd.  */
6750 
6751   unsigned vn = INSTR (9, 5);
6752   unsigned vd = INSTR (4, 0);
6753   unsigned i;
6754   int      full = INSTR (30, 30);
6755 
6756   NYI_assert (29, 10, 0xB8816);
6757 
6758   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6759   for (i = 0; i < (full ? 16 : 8); i++)
6760     aarch64_set_vec_u8 (cpu, vd, i, ~ aarch64_get_vec_u8 (cpu, vn, i));
6761 }
6762 
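/* Count the leading zero bits in VAL, treated as a SIZE-bit value.
   Returns SIZE when VAL is zero.  */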
6763 static unsigned int
6764 clz (uint64_t val, unsigned size)
6765 {
6766   uint64_t mask = 1;
6767   int      count;
6768 
6769   mask <<= (size - 1);
6770   count = 0;
6771   do
6772     {
6773       if (val & mask)
6774 	break;
6775       mask >>= 1;
6776       count ++;
6777     }
6778   while (mask);
6779 
6780   return count;
6781 }
6782 
6783 static void
6784 do_vec_CLZ (sim_cpu *cpu)
6785 {
6786   /* instr[31]    = 0
6787      instr[30]    = half (0)/full (1)
6788      instr[29,24] = 10 1110
6789      instr[23,22] = size
6790      instr[21,10] = 10 0000 0100 10
6791      instr[9,5]   = Vn
6792      instr[4,0]   = Vd.  */
6793 
6794   unsigned vn = INSTR (9, 5);
6795   unsigned vd = INSTR (4, 0);
6796   unsigned i;
6797   int      full = INSTR (30,30);
6798 
6799   NYI_assert (29, 24, 0x2E);
6800   NYI_assert (21, 10, 0x812);
6801 
6802   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6803   switch (INSTR (23, 22))
6804     {
6805     case 0:
6806       for (i = 0; i < (full ? 16 : 8); i++)
6807 	aarch64_set_vec_u8 (cpu, vd, i, clz (aarch64_get_vec_u8 (cpu, vn, i), 8));
6808       break;
6809     case 1:
6810       for (i = 0; i < (full ? 8 : 4); i++)
6811 	aarch64_set_vec_u16 (cpu, vd, i, clz (aarch64_get_vec_u16 (cpu, vn, i), 16));
6812       break;
6813     case 2:
6814       for (i = 0; i < (full ? 4 : 2); i++)
6815 	aarch64_set_vec_u32 (cpu, vd, i, clz (aarch64_get_vec_u32 (cpu, vn, i), 32));
6816       break;
6817     case 3:
6818       if (! full)
6819 	HALT_UNALLOC;
6820       aarch64_set_vec_u64 (cpu, vd, 0, clz (aarch64_get_vec_u64 (cpu, vn, 0), 64));
6821       aarch64_set_vec_u64 (cpu, vd, 1, clz (aarch64_get_vec_u64 (cpu, vn, 1), 64));
6822       break;
6823     }
6824 }
6825 
6826 static void
6827 do_vec_MOV_element (sim_cpu *cpu)
6828 {
6829   /* instr[31,21] = 0110 1110 000
6830      instr[20,16] = size & dest index
6831      instr[15]    = 0
6832      instr[14,11] = source index
6833      instr[10]    = 1
6834      instr[9,5]   = Vs
6835      instr[4,0]   = Vd.  */
6836 
6837   unsigned vs = INSTR (9, 5);
6838   unsigned vd = INSTR (4, 0);
6839   unsigned src_index;
6840   unsigned dst_index;
6841 
6842   NYI_assert (31, 21, 0x370);
6843   NYI_assert (15, 15, 0);
6844   NYI_assert (10, 10, 1);
6845 
6846   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
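  /* The lowest set bit of instr [20,16] selects the element size
     (bit 16: byte, 17: half, 18: word, 19: long); the bits above it
     hold the destination element index.  */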
6847   if (INSTR (16, 16))
6848     {
6849       /* Move a byte.  */
6850       src_index = INSTR (14, 11);
6851       dst_index = INSTR (20, 17);
6852       aarch64_set_vec_u8 (cpu, vd, dst_index,
6853 			  aarch64_get_vec_u8 (cpu, vs, src_index));
6854     }
6855   else if (INSTR (17, 17))
6856     {
6857       /* Move 16-bits.  */
6858       NYI_assert (11, 11, 0);
6859       src_index = INSTR (14, 12);
6860       dst_index = INSTR (20, 18);
6861       aarch64_set_vec_u16 (cpu, vd, dst_index,
6862 			   aarch64_get_vec_u16 (cpu, vs, src_index));
6863     }
6864   else if (INSTR (18, 18))
6865     {
6866       /* Move 32-bits.  */
6867       NYI_assert (12, 11, 0);
6868       src_index = INSTR (14, 13);
6869       dst_index = INSTR (20, 19);
6870       aarch64_set_vec_u32 (cpu, vd, dst_index,
6871 			   aarch64_get_vec_u32 (cpu, vs, src_index));
6872     }
6873   else
6874     {
6875       NYI_assert (19, 19, 1);
6876       NYI_assert (13, 11, 0);
6877       src_index = INSTR (14, 14);
6878       dst_index = INSTR (20, 20);
6879       aarch64_set_vec_u64 (cpu, vd, dst_index,
6880 			   aarch64_get_vec_u64 (cpu, vs, src_index));
6881     }
6882 }
6883 
6884 static void
6885 do_vec_REV32 (sim_cpu *cpu)
6886 {
6887   /* instr[31]    = 0
6888      instr[30]    = half (0)/full (1)
6889      instr[29,24] = 10 1110
6890      instr[23,22] = size
6891      instr[21,10] = 10 0000 0000 10
6892      instr[9,5]   = Rn
6893      instr[4,0]   = Rd.  */
6894 
6895   unsigned rn = INSTR (9, 5);
6896   unsigned rd = INSTR (4, 0);
6897   unsigned size = INSTR (23, 22);
6898   unsigned full = INSTR (30, 30);
6899   unsigned i;
6900   FRegister val;
6901 
6902   NYI_assert (29, 24, 0x2E);
6903   NYI_assert (21, 10, 0x802);
6904 
6905   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
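  /* Flipping the low bits of the element index reverses the element
     order within each 32-bit word.  */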
6906   switch (size)
6907     {
6908     case 0:
6909       for (i = 0; i < (full ? 16 : 8); i++)
6910 	val.b[i ^ 0x3] = aarch64_get_vec_u8 (cpu, rn, i);
6911       break;
6912 
6913     case 1:
6914       for (i = 0; i < (full ? 8 : 4); i++)
6915 	val.h[i ^ 0x1] = aarch64_get_vec_u16 (cpu, rn, i);
6916       break;
6917 
6918     default:
6919       HALT_UNALLOC;
6920     }
6921 
6922   aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
6923   if (full)
6924     aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
6925 }
6926 
6927 static void
6928 do_vec_EXT (sim_cpu *cpu)
6929 {
6930   /* instr[31]    = 0
6931      instr[30]    = half (0)/full (1)
6932      instr[29,21] = 10 1110 000
6933      instr[20,16] = Vm
6934      instr[15]    = 0
6935      instr[14,11] = source index
6936      instr[10]    = 0
6937      instr[9,5]   = Vn
6938      instr[4,0]   = Vd.  */
6939 
6940   unsigned vm = INSTR (20, 16);
6941   unsigned vn = INSTR (9, 5);
6942   unsigned vd = INSTR (4, 0);
6943   unsigned src_index = INSTR (14, 11);
6944   unsigned full = INSTR (30, 30);
6945   unsigned i;
6946   unsigned j;
6947   FRegister val;
6948 
6949   NYI_assert (31, 21, 0x370);
6950   NYI_assert (15, 15, 0);
6951   NYI_assert (10, 10, 0);
6952 
6953   if (!full && (src_index & 0x8))
6954     HALT_UNALLOC;
6955 
6956   j = 0;
6957 
6958   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
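  /* Extract (full ? 16 : 8) consecutive bytes from the concatenation
     Vm:Vn, starting at byte src_index of Vn.  */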
6959   for (i = src_index; i < (full ? 16 : 8); i++)
6960     val.b[j ++] = aarch64_get_vec_u8 (cpu, vn, i);
6961   for (i = 0; i < src_index; i++)
6962     val.b[j ++] = aarch64_get_vec_u8 (cpu, vm, i);
6963 
6964   aarch64_set_vec_u64 (cpu, vd, 0, val.v[0]);
6965   if (full)
6966     aarch64_set_vec_u64 (cpu, vd, 1, val.v[1]);
6967 }
6968 
6969 static void
6970 dexAdvSIMD0 (sim_cpu *cpu)
6971 {
6972   /* instr [28,25] = 0 111.  */
6973   if (    INSTR (15, 10) == 0x07
6974       && (INSTR (9, 5) ==
6975 	  INSTR (20, 16)))
6976     {
6977       if (INSTR (31, 21) == 0x075
6978 	  || INSTR (31, 21) == 0x275)
6979 	{
6980 	  do_vec_MOV_whole_vector (cpu);
6981 	  return;
6982 	}
6983     }
6984 
6985   if (INSTR (29, 19) == 0x1E0)
6986     {
6987       do_vec_MOV_immediate (cpu);
6988       return;
6989     }
6990 
6991   if (INSTR (29, 19) == 0x5E0)
6992     {
6993       do_vec_MVNI (cpu);
6994       return;
6995     }
6996 
6997   if (INSTR (29, 19) == 0x1C0
6998       || INSTR (29, 19) == 0x1C1)
6999     {
7000       if (INSTR (15, 10) == 0x03)
7001 	{
7002 	  do_vec_DUP_scalar_into_vector (cpu);
7003 	  return;
7004 	}
7005     }
7006 
7007   switch (INSTR (29, 24))
7008     {
7009     case 0x0E: do_vec_op1 (cpu); return;
7010     case 0x0F: do_vec_op2 (cpu); return;
7011 
7012     case 0x2E:
7013       if (INSTR (21, 21) == 1)
7014 	{
7015 	  switch (INSTR (15, 10))
7016 	    {
7017 	    case 0x02:
7018 	      do_vec_REV32 (cpu);
7019 	      return;
7020 
7021 	    case 0x07:
7022 	      switch (INSTR (23, 22))
7023 		{
7024 		case 0: do_vec_EOR (cpu); return;
7025 		case 1: do_vec_BSL (cpu); return;
7026 		case 2:
7027 		case 3: do_vec_bit (cpu); return;
7028 		}
7029 	      break;
7030 
7031 	    case 0x08: do_vec_sub_long (cpu); return;
7032 	    case 0x11: do_vec_USHL (cpu); return;
7033 	    case 0x12: do_vec_CLZ (cpu); return;
7034 	    case 0x16: do_vec_NOT (cpu); return;
7035 	    case 0x19: do_vec_max (cpu); return;
7036 	    case 0x1B: do_vec_min (cpu); return;
7037 	    case 0x21: do_vec_SUB (cpu); return;
7038 	    case 0x25: do_vec_MLS (cpu); return;
7039 	    case 0x31: do_vec_FminmaxNMP (cpu); return;
7040 	    case 0x35: do_vec_FADDP (cpu); return;
7041 	    case 0x37: do_vec_FMUL (cpu); return;
7042 	    case 0x3F: do_vec_FDIV (cpu); return;
7043 
7044 	    case 0x3E:
7045 	      switch (INSTR (20, 16))
7046 		{
7047 		case 0x00: do_vec_FNEG (cpu); return;
7048 		case 0x01: do_vec_FSQRT (cpu); return;
7049 		default:   HALT_NYI;
7050 		}
7051 
7052 	    case 0x0D:
7053 	    case 0x0F:
7054 	    case 0x22:
7055 	    case 0x23:
7056 	    case 0x26:
7057 	    case 0x2A:
7058 	    case 0x32:
7059 	    case 0x36:
7060 	    case 0x39:
7061 	    case 0x3A:
7062 	      do_vec_compare (cpu); return;
7063 
7064 	    default:
7065 	      break;
7066 	    }
7067 	}
7068 
7069       if (INSTR (31, 21) == 0x370)
7070 	{
7071 	  if (INSTR (10, 10))
7072 	    do_vec_MOV_element (cpu);
7073 	  else
7074 	    do_vec_EXT (cpu);
7075 	  return;
7076 	}
7077 
7078       switch (INSTR (21, 10))
7079 	{
7080 	case 0x82E: do_vec_neg (cpu); return;
7081 	case 0x87E: do_vec_sqrt (cpu); return;
7082 	default:
7083 	  if (INSTR (15, 10) == 0x30)
7084 	    {
7085 	      do_vec_mull (cpu);
7086 	      return;
7087 	    }
7088 	  break;
7089 	}
7090       break;
7091 
7092     case 0x2F:
7093       switch (INSTR (15, 10))
7094 	{
7095 	case 0x01: do_vec_SSHR_USHR (cpu); return;
7096 	case 0x10:
7097 	case 0x12: do_vec_mls_indexed (cpu); return;
7098 	case 0x29: do_vec_xtl (cpu); return;
7099 	default:
7100 	  HALT_NYI;
7101 	}
7102 
7103     default:
7104       break;
7105     }
7106 
7107   HALT_NYI;
7108 }
7109 
7110 /* 3 sources.  */
7111 
7112 /* Float multiply add.  */
7113 static void
7114 fmadds (sim_cpu *cpu)
7115 {
7116   unsigned sa = INSTR (14, 10);
7117   unsigned sm = INSTR (20, 16);
7118   unsigned sn = INSTR ( 9,  5);
7119   unsigned sd = INSTR ( 4,  0);
7120 
7121   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7122   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa)
7123 			+ aarch64_get_FP_float (cpu, sn)
7124 			* aarch64_get_FP_float (cpu, sm));
7125 }
7126 
7127 /* Double multiply add.  */
7128 static void
7129 fmaddd (sim_cpu *cpu)
7130 {
7131   unsigned sa = INSTR (14, 10);
7132   unsigned sm = INSTR (20, 16);
7133   unsigned sn = INSTR ( 9,  5);
7134   unsigned sd = INSTR ( 4,  0);
7135 
7136   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7137   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa)
7138 			 + aarch64_get_FP_double (cpu, sn)
7139 			 * aarch64_get_FP_double (cpu, sm));
7140 }
7141 
7142 /* Float multiply subtract.  */
7143 static void
7144 fmsubs (sim_cpu *cpu)
7145 {
7146   unsigned sa = INSTR (14, 10);
7147   unsigned sm = INSTR (20, 16);
7148   unsigned sn = INSTR ( 9,  5);
7149   unsigned sd = INSTR ( 4,  0);
7150 
7151   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7152   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa)
7153 			- aarch64_get_FP_float (cpu, sn)
7154 			* aarch64_get_FP_float (cpu, sm));
7155 }
7156 
7157 /* Double multiply subtract.  */
7158 static void
7159 fmsubd (sim_cpu *cpu)
7160 {
7161   unsigned sa = INSTR (14, 10);
7162   unsigned sm = INSTR (20, 16);
7163   unsigned sn = INSTR ( 9,  5);
7164   unsigned sd = INSTR ( 4,  0);
7165 
7166   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7167   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa)
7168 			 - aarch64_get_FP_double (cpu, sn)
7169 			 * aarch64_get_FP_double (cpu, sm));
7170 }
7171 
7172 /* Float negative multiply add.  */
7173 static void
7174 fnmadds (sim_cpu *cpu)
7175 {
7176   unsigned sa = INSTR (14, 10);
7177   unsigned sm = INSTR (20, 16);
7178   unsigned sn = INSTR ( 9,  5);
7179   unsigned sd = INSTR ( 4,  0);
7180 
7181   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7182   aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa)
7183 			+ (- aarch64_get_FP_float (cpu, sn))
7184 			* aarch64_get_FP_float (cpu, sm));
7185 }
7186 
7187 /* Double negative multiply add.  */
7188 static void
7189 fnmaddd (sim_cpu *cpu)
7190 {
7191   unsigned sa = INSTR (14, 10);
7192   unsigned sm = INSTR (20, 16);
7193   unsigned sn = INSTR ( 9,  5);
7194   unsigned sd = INSTR ( 4,  0);
7195 
7196   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7197   aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa)
7198 			 + (- aarch64_get_FP_double (cpu, sn))
7199 			 * aarch64_get_FP_double (cpu, sm));
7200 }
7201 
7202 /* Float negative multiply subtract.  */
7203 static void
7204 fnmsubs (sim_cpu *cpu)
7205 {
7206   unsigned sa = INSTR (14, 10);
7207   unsigned sm = INSTR (20, 16);
7208   unsigned sn = INSTR ( 9,  5);
7209   unsigned sd = INSTR ( 4,  0);
7210 
7211   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7212   aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa)
7213 			+ aarch64_get_FP_float (cpu, sn)
7214 			* aarch64_get_FP_float (cpu, sm));
7215 }
7216 
7217 /* Double negative multiply subtract.  */
7218 static void
7219 fnmsubd (sim_cpu *cpu)
7220 {
7221   unsigned sa = INSTR (14, 10);
7222   unsigned sm = INSTR (20, 16);
7223   unsigned sn = INSTR ( 9,  5);
7224   unsigned sd = INSTR ( 4,  0);
7225 
7226   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7227   aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa)
7228 			 + aarch64_get_FP_double (cpu, sn)
7229 			 * aarch64_get_FP_double (cpu, sm));
7230 }
7231 
7232 static void
7233 dexSimpleFPDataProc3Source (sim_cpu *cpu)
7234 {
7235   /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
7236      instr[30]    = 0
7237      instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
7238      instr[28,25] = 1111
7239      instr[24]    = 1
7240      instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
7241      instr[21]    ==> o1 : 0 ==> unnegated, 1 ==> negated
7242      instr[15]    ==> o2 : 0 ==> ADD, 1 ==> SUB  */
7243 
7244   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7245   /* dispatch on combined type:o1:o2.  */
7246   uint32_t dispatch = (INSTR (23, 21) << 1) | INSTR (15, 15);
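  /* E.g. dispatch == 5 is type 01 (double), o1 0, o2 1 ==> fmsubd.  */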
7247 
7248   if (M_S != 0)
7249     HALT_UNALLOC;
7250 
7251   switch (dispatch)
7252     {
7253     case 0: fmadds (cpu); return;
7254     case 1: fmsubs (cpu); return;
7255     case 2: fnmadds (cpu); return;
7256     case 3: fnmsubs (cpu); return;
7257     case 4: fmaddd (cpu); return;
7258     case 5: fmsubd (cpu); return;
7259     case 6: fnmaddd (cpu); return;
7260     case 7: fnmsubd (cpu); return;
7261     default:
7262       /* type > 1 is currently unallocated.  */
7263       HALT_UNALLOC;
7264     }
7265 }
7266 
7267 static void
7268 dexSimpleFPFixedConvert (sim_cpu *cpu)
7269 {
7270   HALT_NYI;
7271 }
7272 
7273 static void
7274 dexSimpleFPCondCompare (sim_cpu *cpu)
7275 {
7276   /* instr [31,23] = 0001 1110 0
7277      instr [22]    = type
7278      instr [21]    = 1
7279      instr [20,16] = Rm
7280      instr [15,12] = condition
7281      instr [11,10] = 01
7282      instr [9,5]   = Rn
7283      instr [4]     = 0
7284      instr [3,0]   = nzcv  */
7285 
7286   unsigned rm = INSTR (20, 16);
7287   unsigned rn = INSTR (9, 5);
7288 
7289   NYI_assert (31, 23, 0x3C);
7290   NYI_assert (11, 10, 0x1);
7291   NYI_assert (4,  4,  0);
7292 
7293   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7294   if (! testConditionCode (cpu, INSTR (15, 12)))
7295     {
7296       aarch64_set_CPSR (cpu, INSTR (3, 0));
7297       return;
7298     }
7299 
7300   if (INSTR (22, 22))
7301     {
7302       /* Double precision.  */
7303       double val1 = aarch64_get_vec_double (cpu, rn, 0);
7304       double val2 = aarch64_get_vec_double (cpu, rm, 0);
7305 
7306       /* FIXME: Check for NaNs.  */
7307       if (val1 == val2)
7308 	aarch64_set_CPSR (cpu, (Z | C));
7309       else if (val1 < val2)
7310 	aarch64_set_CPSR (cpu, N);
7311       else /* val1 > val2 */
7312 	aarch64_set_CPSR (cpu, C);
7313     }
7314   else
7315     {
7316       /* Single precision.  */
7317       float val1 = aarch64_get_vec_float (cpu, rn, 0);
7318       float val2 = aarch64_get_vec_float (cpu, rm, 0);
7319 
7320       /* FIXME: Check for NaNs.  */
7321       if (val1 == val2)
7322 	aarch64_set_CPSR (cpu, (Z | C));
7323       else if (val1 < val2)
7324 	aarch64_set_CPSR (cpu, N);
7325       else /* val1 > val2 */
7326 	aarch64_set_CPSR (cpu, C);
7327     }
7328 }
7329 
7330 /* 2 sources.  */
7331 
7332 /* Float add.  */
7333 static void
7334 fadds (sim_cpu *cpu)
7335 {
7336   unsigned sm = INSTR (20, 16);
7337   unsigned sn = INSTR ( 9,  5);
7338   unsigned sd = INSTR ( 4,  0);
7339 
7340   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7341   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7342 			+ aarch64_get_FP_float (cpu, sm));
7343 }
7344 
7345 /* Double add.  */
7346 static void
7347 faddd (sim_cpu *cpu)
7348 {
7349   unsigned sm = INSTR (20, 16);
7350   unsigned sn = INSTR ( 9,  5);
7351   unsigned sd = INSTR ( 4,  0);
7352 
7353   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7354   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7355 			 + aarch64_get_FP_double (cpu, sm));
7356 }
7357 
7358 /* Float divide.  */
7359 static void
7360 fdivs (sim_cpu *cpu)
7361 {
7362   unsigned sm = INSTR (20, 16);
7363   unsigned sn = INSTR ( 9,  5);
7364   unsigned sd = INSTR ( 4,  0);
7365 
7366   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7367   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7368 			/ aarch64_get_FP_float (cpu, sm));
7369 }
7370 
7371 /* Double divide.  */
7372 static void
7373 fdivd (sim_cpu *cpu)
7374 {
7375   unsigned sm = INSTR (20, 16);
7376   unsigned sn = INSTR ( 9,  5);
7377   unsigned sd = INSTR ( 4,  0);
7378 
7379   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7380   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7381 			 / aarch64_get_FP_double (cpu, sm));
7382 }
7383 
7384 /* Float multiply.  */
7385 static void
7386 fmuls (sim_cpu *cpu)
7387 {
7388   unsigned sm = INSTR (20, 16);
7389   unsigned sn = INSTR ( 9,  5);
7390   unsigned sd = INSTR ( 4,  0);
7391 
7392   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7393   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7394 			* aarch64_get_FP_float (cpu, sm));
7395 }
7396 
7397 /* Double multiply.  */
7398 static void
7399 fmuld (sim_cpu *cpu)
7400 {
7401   unsigned sm = INSTR (20, 16);
7402   unsigned sn = INSTR ( 9,  5);
7403   unsigned sd = INSTR ( 4,  0);
7404 
7405   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7406   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7407 			 * aarch64_get_FP_double (cpu, sm));
7408 }
7409 
7410 /* Float negate and multiply.  */
7411 static void
7412 fnmuls (sim_cpu *cpu)
7413 {
7414   unsigned sm = INSTR (20, 16);
7415   unsigned sn = INSTR ( 9,  5);
7416   unsigned sd = INSTR ( 4,  0);
7417 
7418   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7419   aarch64_set_FP_float (cpu, sd, - (aarch64_get_FP_float (cpu, sn)
7420 				    * aarch64_get_FP_float (cpu, sm)));
7421 }
7422 
7423 /* Double negate and multiply.  */
7424 static void
7425 fnmuld (sim_cpu *cpu)
7426 {
7427   unsigned sm = INSTR (20, 16);
7428   unsigned sn = INSTR ( 9,  5);
7429   unsigned sd = INSTR ( 4,  0);
7430 
7431   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7432   aarch64_set_FP_double (cpu, sd, - (aarch64_get_FP_double (cpu, sn)
7433 				     * aarch64_get_FP_double (cpu, sm)));
7434 }
7435 
7436 /* Float subtract.  */
7437 static void
7438 fsubs (sim_cpu *cpu)
7439 {
7440   unsigned sm = INSTR (20, 16);
7441   unsigned sn = INSTR ( 9,  5);
7442   unsigned sd = INSTR ( 4,  0);
7443 
7444   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7445   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7446 			- aarch64_get_FP_float (cpu, sm));
7447 }
7448 
7449 /* Double subtract.  */
7450 static void
7451 fsubd (sim_cpu *cpu)
7452 {
7453   unsigned sm = INSTR (20, 16);
7454   unsigned sn = INSTR ( 9,  5);
7455   unsigned sd = INSTR ( 4,  0);
7456 
7457   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7458   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7459 			 - aarch64_get_FP_double (cpu, sm));
7460 }
7461 
7462 static void
7463 do_FMINNM (sim_cpu *cpu)
7464 {
7465   /* instr[31,23] = 0 0011 1100
7466      instr[22]    = float(0)/double(1)
7467      instr[21]    = 1
7468      instr[20,16] = Sm
7469      instr[15,10] = 01 1110
7470      instr[9,5]   = Sn
7471      instr[4,0]   = Sd  */
7472 
7473   unsigned sm = INSTR (20, 16);
7474   unsigned sn = INSTR ( 9,  5);
7475   unsigned sd = INSTR ( 4,  0);
7476 
7477   NYI_assert (31, 23, 0x03C);
7478   NYI_assert (15, 10, 0x1E);
7479 
7480   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7481   if (INSTR (22, 22))
7482     aarch64_set_FP_double (cpu, sd,
7483 			   dminnm (aarch64_get_FP_double (cpu, sn),
7484 				   aarch64_get_FP_double (cpu, sm)));
7485   else
7486     aarch64_set_FP_float (cpu, sd,
7487 			  fminnm (aarch64_get_FP_float (cpu, sn),
7488 				  aarch64_get_FP_float (cpu, sm)));
7489 }
7490 
7491 static void
7492 do_FMAXNM (sim_cpu *cpu)
7493 {
7494   /* instr[31,23] = 0 0011 1100
7495      instr[22]    = float(0)/double(1)
7496      instr[21]    = 1
7497      instr[20,16] = Sm
7498      instr[15,10] = 01 1010
7499      instr[9,5]   = Sn
7500      instr[4,0]   = Sd  */
7501 
7502   unsigned sm = INSTR (20, 16);
7503   unsigned sn = INSTR ( 9,  5);
7504   unsigned sd = INSTR ( 4,  0);
7505 
7506   NYI_assert (31, 23, 0x03C);
7507   NYI_assert (15, 10, 0x1A);
7508 
7509   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7510   if (INSTR (22, 22))
7511     aarch64_set_FP_double (cpu, sd,
7512 			   dmaxnm (aarch64_get_FP_double (cpu, sn),
7513 				   aarch64_get_FP_double (cpu, sm)));
7514   else
7515     aarch64_set_FP_float (cpu, sd,
7516 			  fmaxnm (aarch64_get_FP_float (cpu, sn),
7517 				  aarch64_get_FP_float (cpu, sm)));
7518 }
7519 
7520 static void
7521 dexSimpleFPDataProc2Source (sim_cpu *cpu)
7522 {
7523   /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
7524      instr[30]    = 0
7525      instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
7526      instr[28,25] = 1111
7527      instr[24]    = 0
7528      instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
7529      instr[21]    = 1
7530      instr[20,16] = Vm
7531      instr[15,12] ==> opcode : 0000 ==> FMUL, 0001 ==> FDIV
7532                                0010 ==> FADD, 0011 ==> FSUB,
7533                                0100 ==> FMAX, 0101 ==> FMIN
7534                                0110 ==> FMAXNM, 0111 ==> FMINNM
7535                                1000 ==> FNMUL, ow ==> UNALLOC
7536      instr[11,10] = 10
7537      instr[9,5]   = Vn
7538      instr[4,0]   = Vd  */
7539 
7540   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7541   uint32_t type = INSTR (23, 22);
7542   /* Dispatch on opcode.  */
7543   uint32_t dispatch = INSTR (15, 12);
7544 
7545   if (type > 1)
7546     HALT_UNALLOC;
7547 
7548   if (M_S != 0)
7549     HALT_UNALLOC;
7550 
7551   if (type)
7552     switch (dispatch)
7553       {
7554       case 0: fmuld (cpu); return;
7555       case 1: fdivd (cpu); return;
7556       case 2: faddd (cpu); return;
7557       case 3: fsubd (cpu); return;
7558       case 6: do_FMAXNM (cpu); return;
7559       case 7: do_FMINNM (cpu); return;
7560       case 8: fnmuld (cpu); return;
7561 
7562 	/* Have not yet implemented fmax and fmin.  */
7563       case 4:
7564       case 5:
7565 	HALT_NYI;
7566 
7567       default:
7568 	HALT_UNALLOC;
7569       }
7570   else /* type == 0 => floats.  */
7571     switch (dispatch)
7572       {
7573       case 0: fmuls (cpu); return;
7574       case 1: fdivs (cpu); return;
7575       case 2: fadds (cpu); return;
7576       case 3: fsubs (cpu); return;
7577       case 6: do_FMAXNM (cpu); return;
7578       case 7: do_FMINNM (cpu); return;
7579       case 8: fnmuls (cpu); return;
7580 
7581       case 4:
7582       case 5:
7583 	HALT_NYI;
7584 
7585       default:
7586 	HALT_UNALLOC;
7587       }
7588 }
7589 
7590 static void
7591 dexSimpleFPCondSelect (sim_cpu *cpu)
7592 {
7593   /* FCSEL
7594      instr[31,23] = 0 0011 1100
7595      instr[22]    = 0=>single 1=>double
7596      instr[21]    = 1
7597      instr[20,16] = Sm
7598      instr[15,12] = cond
7599      instr[11,10] = 11
7600      instr[9,5]   = Sn
7601      instr[4,0]   = Sd  */
7602   unsigned sm = INSTR (20, 16);
7603   unsigned sn = INSTR ( 9, 5);
7604   unsigned sd = INSTR ( 4, 0);
7605   uint32_t set = testConditionCode (cpu, INSTR (15, 12));
7606 
7607   NYI_assert (31, 23, 0x03C);
7608   NYI_assert (11, 10, 0x3);
7609 
7610   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7611   if (INSTR (22, 22))
7612     aarch64_set_FP_double (cpu, sd, (set ? aarch64_get_FP_double (cpu, sn)
7613 				     : aarch64_get_FP_double (cpu, sm)));
7614   else
7615     aarch64_set_FP_float (cpu, sd, (set ? aarch64_get_FP_float (cpu, sn)
7616 				    : aarch64_get_FP_float (cpu, sm)));
7617 }
7618 
7619 /* Store 32 bit unscaled signed 9 bit.  */
7620 static void
7621 fsturs (sim_cpu *cpu, int32_t offset)
7622 {
7623   unsigned int rn = INSTR (9, 5);
7624   unsigned int st = INSTR (4, 0);
7625 
7626   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7627   aarch64_set_mem_u32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
7628 		       aarch64_get_vec_u32 (cpu, st, 0));
7629 }
7630 
7631 /* Store 64 bit unscaled signed 9 bit.  */
7632 static void
7633 fsturd (sim_cpu *cpu, int32_t offset)
7634 {
7635   unsigned int rn = INSTR (9, 5);
7636   unsigned int st = INSTR (4, 0);
7637 
7638   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7639   aarch64_set_mem_u64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
7640 		       aarch64_get_vec_u64 (cpu, st, 0));
7641 }
7642 
7643 /* Store 128 bit unscaled signed 9 bit.  */
7644 static void
7645 fsturq (sim_cpu *cpu, int32_t offset)
7646 {
7647   unsigned int rn = INSTR (9, 5);
7648   unsigned int st = INSTR (4, 0);
7649   FRegister a;
7650 
7651   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7652   aarch64_get_FP_long_double (cpu, st, & a);
7653   aarch64_set_mem_long_double (cpu,
7654 			       aarch64_get_reg_u64 (cpu, rn, SP_OK)
7655 			       + offset, a);
7656 }
7657 
7658 /* TODO FP move register.  */
7659 
7660 /* 32 bit fp to fp move register.  */
7661 static void
7662 ffmovs (sim_cpu *cpu)
7663 {
7664   unsigned int rn = INSTR (9, 5);
7665   unsigned int st = INSTR (4, 0);
7666 
7667   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7668   aarch64_set_FP_float (cpu, st, aarch64_get_FP_float (cpu, rn));
7669 }
7670 
7671 /* 64 bit fp to fp move register.  */
7672 static void
7673 ffmovd (sim_cpu *cpu)
7674 {
7675   unsigned int rn = INSTR (9, 5);
7676   unsigned int st = INSTR (4, 0);
7677 
7678   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7679   aarch64_set_FP_double (cpu, st, aarch64_get_FP_double (cpu, rn));
7680 }
7681 
7682 /* 32 bit GReg to Vec move register.  */
7683 static void
7684 fgmovs (sim_cpu *cpu)
7685 {
7686   unsigned int rn = INSTR (9, 5);
7687   unsigned int st = INSTR (4, 0);
7688 
7689   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7690   aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_reg_u32 (cpu, rn, NO_SP));
7691 }
7692 
7693 /* 64 bit g to fp move register.  */
7694 static void
7695 fgmovd (sim_cpu *cpu)
7696 {
7697   unsigned int rn = INSTR (9, 5);
7698   unsigned int st = INSTR (4, 0);
7699 
7700   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7701   aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_reg_u64 (cpu, rn, NO_SP));
7702 }
7703 
7704 /* 32 bit fp to g move register.  */
7705 static void
7706 gfmovs (sim_cpu *cpu)
7707 {
7708   unsigned int rn = INSTR (9, 5);
7709   unsigned int st = INSTR (4, 0);
7710 
7711   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7712   aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u32 (cpu, rn, 0));
7713 }
7714 
7715 /* 64 bit fp to g move register.  */
7716 static void
7717 gfmovd (sim_cpu *cpu)
7718 {
7719   unsigned int rn = INSTR (9, 5);
7720   unsigned int st = INSTR (4, 0);
7721 
7722   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7723   aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u64 (cpu, rn, 0));
7724 }
7725 
7726 /* FP move immediate
7727 
7728    These install an immediate 8 bit value in the target register
7729    where the 8 bits comprise 1 sign bit, 4 bits of fraction and a 3
7730    bit exponent.  */
7731 
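/* The encodable values are thus +/- (16..31)/16 * 2^r for r in
   [-3,4], i.e. magnitudes from 0.125 up to 31.0.  */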
7732 static void
7733 fmovs (sim_cpu *cpu)
7734 {
7735   unsigned int sd = INSTR (4, 0);
7736   uint32_t imm = INSTR (20, 13);
7737   float f = fp_immediate_for_encoding_32 (imm);
7738 
7739   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7740   aarch64_set_FP_float (cpu, sd, f);
7741 }
7742 
7743 static void
7744 fmovd (sim_cpu *cpu)
7745 {
7746   unsigned int sd = INSTR (4, 0);
7747   uint32_t imm = INSTR (20, 13);
7748   double d = fp_immediate_for_encoding_64 (imm);
7749 
7750   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7751   aarch64_set_FP_double (cpu, sd, d);
7752 }
7753 
7754 static void
7755 dexSimpleFPImmediate (sim_cpu *cpu)
7756 {
7757   /* instr[31,23] == 0 0011 1100
7758      instr[22]    == type : single(0)/double(1)
7759      instr[21]    == 1
7760      instr[20,13] == imm8
7761      instr[12,10] == 100
7762      instr[9,5]   == imm5 : 00000 ==> OK, ow ==> UNALLOC
7763      instr[4,0]   == Rd  */
7764   uint32_t imm5 = INSTR (9, 5);
7765 
7766   NYI_assert (31, 23, 0x3C);
7767 
7768   if (imm5 != 0)
7769     HALT_UNALLOC;
7770 
7771   if (INSTR (22, 22))
7772     fmovd (cpu);
7773   else
7774     fmovs (cpu);
7775 }
7776 
7777 /* TODO specific decode and execute for group Load Store.  */
7778 
7779 /* TODO FP load/store single register (unscaled offset).  */
7780 
7781 /* TODO load 8 bit unscaled signed 9 bit.  */
7782 /* TODO load 16 bit unscaled signed 9 bit.  */
7783 
7784 /* Load 32 bit unscaled signed 9 bit.  */
7785 static void
7786 fldurs (sim_cpu *cpu, int32_t offset)
7787 {
7788   unsigned int rn = INSTR (9, 5);
7789   unsigned int st = INSTR (4, 0);
7790 
7791   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7792   aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
7793 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
7794 }
7795 
7796 /* Load 64 bit unscaled signed 9 bit.  */
7797 static void
7798 fldurd (sim_cpu *cpu, int32_t offset)
7799 {
7800   unsigned int rn = INSTR (9, 5);
7801   unsigned int st = INSTR (4, 0);
7802 
7803   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7804   aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64
7805 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
7806 }
7807 
7808 /* Load 128 bit unscaled signed 9 bit.  */
7809 static void
7810 fldurq (sim_cpu *cpu, int32_t offset)
7811 {
7812   unsigned int rn = INSTR (9, 5);
7813   unsigned int st = INSTR (4, 0);
7814   FRegister a;
7815   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
7816 
7817   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7818   aarch64_get_mem_long_double (cpu, addr, & a);
7819   aarch64_set_FP_long_double (cpu, st, a);
7820 }
7821 
7822 /* TODO store 8 bit unscaled signed 9 bit.  */
7823 /* TODO store 16 bit unscaled signed 9 bit.  */
7824 
7825 
7826 /* 1 source.  */
7827 
7828 /* Float absolute value.  */
7829 static void
7830 fabss (sim_cpu *cpu)
7831 {
7832   unsigned sn = INSTR (9, 5);
7833   unsigned sd = INSTR (4, 0);
7834   float value = aarch64_get_FP_float (cpu, sn);
7835 
7836   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7837   aarch64_set_FP_float (cpu, sd, fabsf (value));
7838 }
7839 
7840 /* Double absolute value.  */
7841 static void
7842 fabsd (sim_cpu *cpu)
7843 {
7844   unsigned sn = INSTR (9, 5);
7845   unsigned sd = INSTR (4, 0);
7846   double value = aarch64_get_FP_double (cpu, sn);
7847 
7848   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7849   aarch64_set_FP_double (cpu, sd, fabs (value));
7850 }
7851 
7852 /* Float negative value.  */
7853 static void
7854 fnegs (sim_cpu *cpu)
7855 {
7856   unsigned sn = INSTR (9, 5);
7857   unsigned sd = INSTR (4, 0);
7858 
7859   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7860   aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sn));
7861 }
7862 
7863 /* Double negative value.  */
7864 static void
7865 fnegd (sim_cpu *cpu)
7866 {
7867   unsigned sn = INSTR (9, 5);
7868   unsigned sd = INSTR (4, 0);
7869 
7870   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7871   aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sn));
7872 }
7873 
7874 /* Float square root.  */
7875 static void
7876 fsqrts (sim_cpu *cpu)
7877 {
7878   unsigned sn = INSTR (9, 5);
7879   unsigned sd = INSTR (4, 0);
7880 
7881   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7882   aarch64_set_FP_float (cpu, sd, sqrtf (aarch64_get_FP_float (cpu, sn)));
7883 }
7884 
7885 /* Double square root.  */
7886 static void
7887 fsqrtd (sim_cpu *cpu)
7888 {
7889   unsigned sn = INSTR (9, 5);
7890   unsigned sd = INSTR (4, 0);
7891 
7892   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7893   aarch64_set_FP_double (cpu, sd,
7894 			 sqrt (aarch64_get_FP_double (cpu, sn)));
7895 }
7896 
7897 /* Convert double to float.  */
7898 static void
7899 fcvtds (sim_cpu *cpu)
7900 {
7901   unsigned sn = INSTR (9, 5);
7902   unsigned sd = INSTR (4, 0);
7903 
7904   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7905   aarch64_set_FP_float (cpu, sd, (float) aarch64_get_FP_double (cpu, sn));
7906 }
7907 
7908 /* Convert float to double.  */
7909 static void
7910 fcvtsd (sim_cpu *cpu)
7911 {
7912   unsigned sn = INSTR (9, 5);
7913   unsigned sd = INSTR (4, 0);
7914 
7915   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7916   aarch64_set_FP_double (cpu, sd, (double) aarch64_get_FP_float (cpu, sn));
7917 }
7918 
7919 static void
7920 do_FRINT (sim_cpu *cpu)
7921 {
7922   /* instr[31,23] = 0001 1110 0
7923      instr[22]    = single(0)/double(1)
7924      instr[21,18] = 1001
7925      instr[17,15] = rounding mode
7926      instr[14,10] = 10000
7927      instr[9,5]   = source
7928      instr[4,0]   = dest  */
7929 
7930   float val;
7931   unsigned rs = INSTR (9, 5);
7932   unsigned rd = INSTR (4, 0);
7933   unsigned int rmode = INSTR (17, 15);
7934 
7935   NYI_assert (31, 23, 0x03C);
7936   NYI_assert (21, 18, 0x9);
7937   NYI_assert (14, 10, 0x10);
7938 
7939   if (rmode == 6 || rmode == 7)
7940     /* FIXME: Add support for rmode == 6 exactness check.  */
7941     rmode = uimm (aarch64_get_FPSR (cpu), 23, 22);
7942 
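  /* N.B. RMode architecturally lives in FPCR[23,22]; the simulator
     reads its FPSR here.  A host-portable alternative to the explicit
     rounding below would be fesetround () plus nearbyint () from
     <fenv.h>, at some cost in simplicity.  */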
7943   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7944   if (INSTR (22, 22))
7945     {
7946       double val = aarch64_get_FP_double (cpu, rs);
7947 
7948       switch (rmode)
7949 	{
7950 	case 0: /* mode N: nearest, ties to even.  */
7951 	  {
7952 	    double rval = round (val);	/* Rounds ties away from zero.  */
7953 
7954 	    /* On a tie round () chose the neighbour away from zero;
7955 	       if that neighbour is odd, step back to the even one.  */
7956 	    if (fabs (val - rval) == 0.5 && fmod (rval, 2.0) != 0.0)
7957 	      rval += (val < rval) ? -1.0 : 1.0;
7958 
7959 	    aarch64_set_FP_double (cpu, rd, rval);
7960 	    return;
7961 	  }
7962 
7963 	case 1: /* mode P: towards +inf.  */
7964 	  aarch64_set_FP_double (cpu, rd, ceil (val));
7965 	  return;
7966 
7967 	case 2: /* mode M: towards -inf.  */
7968 	  aarch64_set_FP_double (cpu, rd, floor (val));
7969 	  return;
7977 
7978 	case 3: /* mode Z: towards 0.  */
7979 	  aarch64_set_FP_double (cpu, rd, trunc (val));
7980 	  return;
7981 
7982 	case 4: /* mode A: away from 0.  */
7983 	  aarch64_set_FP_double (cpu, rd, round (val));
7984 	  return;
7985 
7986 	case 6: /* mode X: use FPCR with exactness check.  */
7987 	case 7: /* mode I: use FPCR mode.  */
7988 	  HALT_NYI;
7989 
7990 	default:
7991 	  HALT_UNALLOC;
7992 	}
7993     }
7994 
7995   val = aarch64_get_FP_float (cpu, rs);
7996 
7997   switch (rmode)
7998     {
7999     case 0: /* mode N: nearest, ties to even.  */
8000       {
8001 	float rval = roundf (val);	/* Rounds ties away from zero.  */
8002 
8003 	/* On a tie roundf () chose the neighbour away from zero;
8004 	   if that neighbour is odd, step back to the even one.  */
8005 	if (fabsf (val - rval) == 0.5f && fmodf (rval, 2.0f) != 0.0f)
8006 	  rval += (val < rval) ? -1.0f : 1.0f;
8007 
8008 	aarch64_set_FP_float (cpu, rd, rval);
8009 	return;
8010       }
8011 
8012     case 1: /* mode P: towards +inf.  */
8013       aarch64_set_FP_float (cpu, rd, ceilf (val));
8014       return;
8015 
8016     case 2: /* mode M: towards -inf.  */
8017       aarch64_set_FP_float (cpu, rd, floorf (val));
8018       return;
8026 
8027     case 3: /* mode Z: towards 0.  */
8028       aarch64_set_FP_float (cpu, rd, truncf (val));
8029       return;
8030 
8031     case 4: /* mode A: away from 0.  */
8032       aarch64_set_FP_float (cpu, rd, roundf (val));
8033       return;
8034 
8035     case 6: /* mode X: use FPCR with exactness check.  */
8036     case 7: /* mode I: use FPCR mode.  */
8037       HALT_NYI;
8038 
8039     default:
8040       HALT_UNALLOC;
8041     }
8042 }
8043 
8044 /* Convert half to float.  */
8045 static void
8046 do_FCVT_half_to_single (sim_cpu *cpu)
8047 {
8048   unsigned rn = INSTR (9, 5);
8049   unsigned rd = INSTR (4, 0);
8050 
8051   NYI_assert (31, 10, 0x7B890);
8052 
8053   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8054   aarch64_set_FP_float (cpu, rd, (float) aarch64_get_FP_half  (cpu, rn));
8055 }
8056 
8057 /* Convert half to double.  */
8058 static void
8059 do_FCVT_half_to_double (sim_cpu *cpu)
8060 {
8061   unsigned rn = INSTR (9, 5);
8062   unsigned rd = INSTR (4, 0);
8063 
8064   NYI_assert (31, 10, 0x7B8B0);
8065 
8066   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8067   aarch64_set_FP_double (cpu, rd, (double) aarch64_get_FP_half  (cpu, rn));
8068 }
8069 
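/* Convert float to half.  */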
8070 static void
8071 do_FCVT_single_to_half (sim_cpu *cpu)
8072 {
8073   unsigned rn = INSTR (9, 5);
8074   unsigned rd = INSTR (4, 0);
8075 
8076   NYI_assert (31, 10, 0x788F0);
8077 
8078   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8079   aarch64_set_FP_half (cpu, rd, aarch64_get_FP_float  (cpu, rn));
8080 }
8081 
8082 /* Convert double to half.  */
8083 static void
8084 do_FCVT_double_to_half (sim_cpu *cpu)
8085 {
8086   unsigned rn = INSTR (9, 5);
8087   unsigned rd = INSTR (4, 0);
8088 
8089   NYI_assert (31, 10, 0x798F0);
8090 
8091   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8092   aarch64_set_FP_half (cpu, rd, (float) aarch64_get_FP_double  (cpu, rn));
8093 }
8094 
8095 static void
8096 dexSimpleFPDataProc1Source (sim_cpu *cpu)
8097 {
8098   /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
8099      instr[30]    = 0
8100      instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
8101      instr[28,25] = 1111
8102      instr[24]    = 0
8103      instr[23,22] ==> type : 00 ==> source is single,
8104                              01 ==> source is double
8105                              10 ==> UNALLOC
8106                              11 ==> UNALLOC or source is half
8107      instr[21]    = 1
8108      instr[20,15] ==> opcode : with type 00 or 01
8109                                000000 ==> FMOV, 000001 ==> FABS,
8110                                000010 ==> FNEG, 000011 ==> FSQRT,
8111                                000100 ==> UNALLOC, 000101 ==> FCVT (to single/double)
8112                                000110 ==> UNALLOC, 000111 ==> FCVT (to half)
8113                                001000 ==> FRINTN, 001001 ==> FRINTP,
8114                                001010 ==> FRINTM, 001011 ==> FRINTZ,
8115                                001100 ==> FRINTA, 001101 ==> UNALLOC
8116                                001110 ==> FRINTX, 001111 ==> FRINTI
8117                                with type 11
8118                                000100 ==> FCVT (half-to-single)
8119                                000101 ==> FCVT (half-to-double)
8120 			       instr[14,10] = 10000.  */
8121 
8122   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
8123   uint32_t type   = INSTR (23, 22);
8124   uint32_t opcode = INSTR (20, 15);
8125 
8126   if (M_S != 0)
8127     HALT_UNALLOC;
8128 
8129   if (type == 3)
8130     {
8131       if (opcode == 4)
8132 	do_FCVT_half_to_single (cpu);
8133       else if (opcode == 5)
8134 	do_FCVT_half_to_double (cpu);
8135       else
8136 	HALT_UNALLOC;
8137       return;
8138     }
8139 
8140   if (type == 2)
8141     HALT_UNALLOC;
8142 
8143   switch (opcode)
8144     {
8145     case 0:
8146       if (type)
8147 	ffmovd (cpu);
8148       else
8149 	ffmovs (cpu);
8150       return;
8151 
8152     case 1:
8153       if (type)
8154 	fabsd (cpu);
8155       else
8156 	fabss (cpu);
8157       return;
8158 
8159     case 2:
8160       if (type)
8161 	fnegd (cpu);
8162       else
8163 	fnegs (cpu);
8164       return;
8165 
8166     case 3:
8167       if (type)
8168 	fsqrtd (cpu);
8169       else
8170 	fsqrts (cpu);
8171       return;
8172 
8173     case 4:
8174       if (type)
8175 	fcvtds (cpu);
8176       else
8177 	HALT_UNALLOC;
8178       return;
8179 
8180     case 5:
8181       if (type)
8182 	HALT_UNALLOC;
8183       fcvtsd (cpu);
8184       return;
8185 
8186     case 8:		/* FRINTN etc.  */
8187     case 9:
8188     case 10:
8189     case 11:
8190     case 12:
8191     case 14:
8192     case 15:
8193        do_FRINT (cpu);
8194        return;
8195 
8196     case 7:
8197       if (INSTR (22, 22))
8198 	do_FCVT_double_to_half (cpu);
8199       else
8200 	do_FCVT_single_to_half (cpu);
8201       return;
8202 
8203     case 13:
8204       HALT_NYI;
8205 
8206     default:
8207       HALT_UNALLOC;
8208     }
8209 }
8210 
8211 /* 32 bit signed int to float.  */
8212 static void
8213 scvtf32 (sim_cpu *cpu)
8214 {
8215   unsigned rn = INSTR (9, 5);
8216   unsigned sd = INSTR (4, 0);
8217 
8218   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8219   aarch64_set_FP_float
8220     (cpu, sd, (float) aarch64_get_reg_s32 (cpu, rn, NO_SP));
8221 }
8222 
8223 /* 64 bit signed int to float.  */
8224 static void
8225 scvtf (sim_cpu *cpu)
8226 {
8227   unsigned rn = INSTR (9, 5);
8228   unsigned sd = INSTR (4, 0);
8229 
8230   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8231   aarch64_set_FP_float
8232     (cpu, sd, (float) aarch64_get_reg_s64 (cpu, rn, NO_SP));
8233 }
8234 
8235 /* 32 bit signed int to double.  */
8236 static void
8237 scvtd32 (sim_cpu *cpu)
8238 {
8239   unsigned rn = INSTR (9, 5);
8240   unsigned sd = INSTR (4, 0);
8241 
8242   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8243   aarch64_set_FP_double
8244     (cpu, sd, (double) aarch64_get_reg_s32 (cpu, rn, NO_SP));
8245 }
8246 
8247 /* 64 bit signed int to double.  */
8248 static void
8249 scvtd (sim_cpu *cpu)
8250 {
8251   unsigned rn = INSTR (9, 5);
8252   unsigned sd = INSTR (4, 0);
8253 
8254   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8255   aarch64_set_FP_double
8256     (cpu, sd, (double) aarch64_get_reg_s64 (cpu, rn, NO_SP));
8257 }
8258 
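/* N.B. the LONG/ULONG bounds below are based on the host's long type,
   so a 64-bit long (LP64 host) is assumed.  */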
8259 static const float  FLOAT_INT_MAX   = (float)  INT_MAX;
8260 static const float  FLOAT_INT_MIN   = (float)  INT_MIN;
8261 static const double DOUBLE_INT_MAX  = (double) INT_MAX;
8262 static const double DOUBLE_INT_MIN  = (double) INT_MIN;
8263 static const float  FLOAT_LONG_MAX  = (float)  LONG_MAX;
8264 static const float  FLOAT_LONG_MIN  = (float)  LONG_MIN;
8265 static const double DOUBLE_LONG_MAX = (double) LONG_MAX;
8266 static const double DOUBLE_LONG_MIN = (double) LONG_MIN;
8267 
8268 #define UINT_MIN 0
8269 #define ULONG_MIN 0
8270 static const float  FLOAT_UINT_MAX   = (float)  UINT_MAX;
8271 static const float  FLOAT_UINT_MIN   = (float)  UINT_MIN;
8272 static const double DOUBLE_UINT_MAX  = (double) UINT_MAX;
8273 static const double DOUBLE_UINT_MIN  = (double) UINT_MIN;
8274 static const float  FLOAT_ULONG_MAX  = (float)  ULONG_MAX;
8275 static const float  FLOAT_ULONG_MIN  = (float)  ULONG_MIN;
8276 static const double DOUBLE_ULONG_MAX = (double) ULONG_MAX;
8277 static const double DOUBLE_ULONG_MIN = (double) ULONG_MIN;
8278 
8279 /* Check for FP exception conditions:
8280      NaN raises IO and converts to zero
8281      Infinity raises IO and saturates
8282      Out of Range raises IO and IX and saturates value
8283      Denormal raises ID and IX and sets to zero.  */
8284 #define RAISE_EXCEPTIONS(F, VALUE, FTYPE, ITYPE)	\
8285   do							\
8286     {							\
8287       switch (fpclassify (F))				\
8288 	{						\
8289 	case FP_NAN:					\
8290 	  aarch64_set_FPSR_bits (cpu, IO, IO);		\
8291 	  VALUE = 0;	/* NaN converts to zero.  */	\
8292 	  break;					\
8293 	case FP_INFINITE:				\
8294 	  aarch64_set_FPSR_bits (cpu, IO, IO);		\
8295 	  VALUE = signbit (F) ? ITYPE##_MIN : ITYPE##_MAX; \
8296 	  break;					\
8297 							\
8298 	case FP_NORMAL:					\
8299 	  if (F >= FTYPE##_##ITYPE##_MAX)		\
8300 	    {						\
8301 	      aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX);	\
8302 	      VALUE = ITYPE##_MAX;			\
8303 	    }						\
8304 	  else if (F <= FTYPE##_##ITYPE##_MIN)		\
8305 	    {						\
8306 	      aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX);	\
8307 	      VALUE = ITYPE##_MIN;			\
8308 	    }						\
8309 	  break;					\
8310 							\
8311 	case FP_SUBNORMAL:				\
8312 	  aarch64_set_FPSR_bits (cpu, IO | IX | ID, IX | ID);	\
8313 	  VALUE = 0;					\
8314 	  break;					\
8315 							\
8316 	default:					\
8317 	case FP_ZERO:					\
8318 	  VALUE = 0;					\
8319 	  break;					\
8320 	}						\
8321     }							\
8322   while (0)
8323 
8324 /* 32 bit convert float to signed int truncate towards zero.  */
8325 static void
8326 fcvtszs32 (sim_cpu *cpu)
8327 {
8328   unsigned sn = INSTR (9, 5);
8329   unsigned rd = INSTR (4, 0);
8330   /* TODO : check that this rounds toward zero.  */
8331   float   f = aarch64_get_FP_float (cpu, sn);
8332   int32_t value = (int32_t) f;
8333 
8334   RAISE_EXCEPTIONS (f, value, FLOAT, INT);
8335 
8336   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8337   /* Avoid sign extension to 64 bit.  */
8338   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
8339 }
8340 
8341 /* 64 bit convert float to signed int truncate towards zero.  */
8342 static void
8343 fcvtszs (sim_cpu *cpu)
8344 {
8345   unsigned sn = INSTR (9, 5);
8346   unsigned rd = INSTR (4, 0);
8347   float f = aarch64_get_FP_float (cpu, sn);
8348   int64_t value = (int64_t) f;
8349 
8350   RAISE_EXCEPTIONS (f, value, FLOAT, LONG);
8351 
8352   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8353   aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
8354 }
8355 
8356 /* 32 bit convert double to signed int truncate towards zero.  */
8357 static void
8358 fcvtszd32 (sim_cpu *cpu)
8359 {
8360   unsigned sn = INSTR (9, 5);
8361   unsigned rd = INSTR (4, 0);
8362   /* TODO : check that this rounds toward zero.  */
8363   double   d = aarch64_get_FP_double (cpu, sn);
8364   int32_t  value = (int32_t) d;
8365 
8366   RAISE_EXCEPTIONS (d, value, DOUBLE, INT);
8367 
8368   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8369   /* Avoid sign extension to 64 bit.  */
8370   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
8371 }
8372 
8373 /* 64 bit convert double to signed int truncate towards zero.  */
8374 static void
8375 fcvtszd (sim_cpu *cpu)
8376 {
8377   unsigned sn = INSTR (9, 5);
8378   unsigned rd = INSTR (4, 0);
8379   /* TODO : check that this rounds toward zero.  */
8380   double  d = aarch64_get_FP_double (cpu, sn);
8381   int64_t value;
8382 
8383   value = (int64_t) d;
8384 
8385   RAISE_EXCEPTIONS (d, value, DOUBLE, LONG);
8386 
8387   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8388   aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
8389 }
8390 
8391 static void
8392 do_fcvtzu (sim_cpu *cpu)
8393 {
8394   /* instr[31]    = size: 32-bit (0), 64-bit (1)
8395      instr[30,23] = 00111100
8396      instr[22]    = type: single (0)/ double (1)
8397      instr[21]    = 0 ==> fixed-point (unimplemented), 1 ==> integer
8398      instr[20,16] = 11001
8399      instr[15,10] = precision
8400      instr[9,5]   = Rs
8401      instr[4,0]   = Rd.  */
8402 
8403   unsigned rs = INSTR (9, 5);
8404   unsigned rd = INSTR (4, 0);
8405 
8406   NYI_assert (30, 23, 0x3C);
8407   NYI_assert (20, 16, 0x19);
8408 
8409   if (INSTR (21, 21) != 1)
8410     /* Convert to fixed point.  */
8411     HALT_NYI;
8412 
8413   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8414   if (INSTR (31, 31))
8415     {
8416       /* Convert to unsigned 64-bit integer.  */
8417       if (INSTR (22, 22))
8418 	{
8419 	  double  d = aarch64_get_FP_double (cpu, rs);
8420 	  uint64_t value = (uint64_t) d;
8421 
8422 	  /* Do not raise an exception if we have reached ULONG_MAX.  */
8423 	  if (value != ((uint64_t) 1 << 63))
8424 	    RAISE_EXCEPTIONS (d, value, DOUBLE, ULONG);
8425 
8426 	  aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8427 	}
8428       else
8429 	{
8430 	  float  f = aarch64_get_FP_float (cpu, rs);
8431 	  uint64_t value = (uint64_t) f;
8432 
8433 	  /* Do not raise an exception if we have reached ULONG_MAX.  */
8434 	  if (value != ((uint64_t) 1 << 63))
8435 	    RAISE_EXCEPTIONS (f, value, FLOAT, ULONG);
8436 
8437 	  aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8438 	}
8439     }
8440   else
8441     {
8442       uint32_t value;
8443 
8444       /* Convert to unsigned 32-bit integer.  */
8445       if (INSTR (22, 22))
8446 	{
8447 	  double  d = aarch64_get_FP_double (cpu, rs);
8448 
8449 	  value = (uint32_t) d;
8450 	  /* Do not raise an exception if we have reached UINT_MAX.  */
8451 	  if (value != (1UL << 31))
8452 	    RAISE_EXCEPTIONS (d, value, DOUBLE, UINT);
8453 	}
8454       else
8455 	{
8456 	  float  f = aarch64_get_FP_float (cpu, rs);
8457 
8458 	  value = (uint32_t) f;
8459 	  /* Do not raise an exception if we have reached UINT_MAX.  */
8460 	  if (value != (1UL << 31))
8461 	    RAISE_EXCEPTIONS (f, value, FLOAT, UINT);
8462 	}
8463 
8464       aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8465     }
8466 }
8467 
8468 static void
8469 do_UCVTF (sim_cpu *cpu)
8470 {
8471   /* instr[31]    = size: 32-bit (0), 64-bit (1)
8472      instr[30,23] = 001 1110 0
8473      instr[22]    = type: single (0)/ double (1)
8474      instr[21]    = 0 ==> fixed-point (unimplemented), 1 ==> integer
8475      instr[20,16] = 0 0011
8476      instr[15,10] = precision
8477      instr[9,5]   = Rs
8478      instr[4,0]   = Rd.  */
8479 
8480   unsigned rs = INSTR (9, 5);
8481   unsigned rd = INSTR (4, 0);
8482 
8483   NYI_assert (30, 23, 0x3C);
8484   NYI_assert (20, 16, 0x03);
8485 
8486   if (INSTR (21, 21) != 1)
8487     HALT_NYI;
8488 
8489   /* FIXME: Add exception raising.  */
8490   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8491   if (INSTR (31, 31))
8492     {
8493       uint64_t value = aarch64_get_reg_u64 (cpu, rs, NO_SP);
8494 
8495       if (INSTR (22, 22))
8496 	aarch64_set_FP_double (cpu, rd, (double) value);
8497       else
8498 	aarch64_set_FP_float (cpu, rd, (float) value);
8499     }
8500   else
8501     {
8502       uint32_t value =  aarch64_get_reg_u32 (cpu, rs, NO_SP);
8503 
8504       if (INSTR (22, 22))
8505 	aarch64_set_FP_double (cpu, rd, (double) value);
8506       else
8507 	aarch64_set_FP_float (cpu, rd, (float) value);
8508     }
8509 }
8510 
8511 static void
8512 float_vector_move (sim_cpu *cpu)
8513 {
8514   /* instr[31,17] == 100 1111 0101 0111
8515      instr[16]    ==> direction 0=> to GR, 1=> from GR
8516      instr[15,10] == 0 (ow UNALLOC)
8517      instr[9,5]   ==> source
8518      instr[4,0]   ==> dest.  */
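  /* i.e. FMOV Xd, Vn.D[1] or FMOV Vd.D[1], Xn: a transfer to or from
     the upper 64 bits of the FP/SIMD register.  */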
8519 
8520   unsigned rn = INSTR (9, 5);
8521   unsigned rd = INSTR (4, 0);
8522 
8523   NYI_assert (31, 17, 0x4F57);
8524 
8525   if (INSTR (15, 10) != 0)
8526     HALT_UNALLOC;
8527 
8528   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8529   if (INSTR (16, 16))
8530     aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_reg_u64 (cpu, rn, NO_SP));
8531   else
8532     aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, rn, 1));
8533 }
8534 
8535 static void
8536 dexSimpleFPIntegerConvert (sim_cpu *cpu)
8537 {
8538   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
8539      instr[30]    = 0
8540      instr[29]    = S :  0 ==> OK, 1 ==> UNALLOC
8541      instr[28,25] = 1111
8542      instr[24]    = 0
8543      instr[23,22] = type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
8544      instr[21]    = 1
8545      instr[20,19] = rmode
8546      instr[18,16] = opcode
8547      instr[15,10] = 10 0000  */
8548 
8549   uint32_t rmode_opcode;
8550   uint32_t size_type;
8551   uint32_t type;
8552   uint32_t size;
8553   uint32_t S;
8554 
8555   if (INSTR (31, 17) == 0x4F57)
8556     {
8557       float_vector_move (cpu);
8558       return;
8559     }
8560 
8561   size = INSTR (31, 31);
8562   S = INSTR (29, 29);
8563   if (S != 0)
8564     HALT_UNALLOC;
8565 
8566   type = INSTR (23, 22);
8567   if (type > 1)
8568     HALT_UNALLOC;
8569 
8570   rmode_opcode = INSTR (20, 16);
8571   size_type = (size << 1) | type; /* 0==32f, 1==32d, 2==64f, 3==64d.  */
8572 
8573   switch (rmode_opcode)
8574     {
8575     case 2:			/* SCVTF.  */
8576       switch (size_type)
8577 	{
8578 	case 0: scvtf32 (cpu); return;
8579 	case 1: scvtd32 (cpu); return;
8580 	case 2: scvtf (cpu); return;
8581 	case 3: scvtd (cpu); return;
8582 	}
8583 
8584     case 6:			/* FMOV GR, Vec.  */
8585       switch (size_type)
8586 	{
8587 	case 0:  gfmovs (cpu); return;
8588 	case 3:  gfmovd (cpu); return;
8589 	default: HALT_UNALLOC;
8590 	}
8591 
8592     case 7:			/* FMOV vec, GR.  */
8593       switch (size_type)
8594 	{
8595 	case 0:  fgmovs (cpu); return;
8596 	case 3:  fgmovd (cpu); return;
8597 	default: HALT_UNALLOC;
8598 	}
8599 
8600     case 24:			/* FCVTZS.  */
8601       switch (size_type)
8602 	{
8603 	case 0: fcvtszs32 (cpu); return;
8604 	case 1: fcvtszd32 (cpu); return;
8605 	case 2: fcvtszs (cpu); return;
8606 	case 3: fcvtszd (cpu); return;
8607 	}
8608 
8609     case 25: do_fcvtzu (cpu); return;
8610     case 3:  do_UCVTF (cpu); return;
8611 
8612     case 0:	/* FCVTNS.  */
8613     case 1:	/* FCVTNU.  */
8614     case 4:	/* FCVTAS.  */
8615     case 5:	/* FCVTAU.  */
8616     case 8:	/* FCVTPS.  */
8617     case 9:	/* FCVTPU.  */
8618     case 16:	/* FCVTMS.  */
8619     case 17:	/* FCVTMU.  */
8620     default:
8621       HALT_NYI;
8622     }
8623 }
8624 
8625 static void
8626 set_flags_for_float_compare (sim_cpu *cpu, float fvalue1, float fvalue2)
8627 {
8628   uint32_t flags;
8629 
8630   /* FIXME: Add exception raising.  */
8631   if (isnan (fvalue1) || isnan (fvalue2))
8632     flags = C|V;
8633   else if (isinf (fvalue1) && isinf (fvalue2))
8634     {
8635       /* Subtracting two infinities may give a NaN.  Instead compare the
8636 	 signs; this relies on isinf returning -1/+1 (as GNU isinf does).  */
8637       int result = isinf (fvalue1) - isinf (fvalue2);
8638 
8639       if (result == 0)
8640 	flags = Z|C;
8641       else if (result < 0)
8642 	flags = N;
8643       else /* (result > 0).  */
8644 	flags = C;
8645     }
8646   else
8647     {
8648       float result = fvalue1 - fvalue2;
8649 
8650       if (result == 0.0)
8651 	flags = Z|C;
8652       else if (result < 0)
8653 	flags = N;
8654       else /* (result > 0).  */
8655 	flags = C;
8656     }
8657 
8658   aarch64_set_CPSR (cpu, flags);
8659 }
8660 
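/* Float compare -- Invalid Operation exception
   only on signaling NaNs.  */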
8661 static void
8662 fcmps (sim_cpu *cpu)
8663 {
8664   unsigned sm = INSTR (20, 16);
8665   unsigned sn = INSTR ( 9,  5);
8666 
8667   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8668   float fvalue2 = aarch64_get_FP_float (cpu, sm);
8669 
8670   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8671   set_flags_for_float_compare (cpu, fvalue1, fvalue2);
8672 }
8673 
8674 /* Float compare to zero -- Invalid Operation exception
8675    only on signaling NaNs.  */
8676 static void
8677 fcmpzs (sim_cpu *cpu)
8678 {
8679   unsigned sn = INSTR ( 9,  5);
8680   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8681 
8682   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8683   set_flags_for_float_compare (cpu, fvalue1, 0.0f);
8684 }
8685 
8686 /* Float compare -- Invalid Operation exception on all NaNs.  */
8687 static void
8688 fcmpes (sim_cpu *cpu)
8689 {
8690   unsigned sm = INSTR (20, 16);
8691   unsigned sn = INSTR ( 9,  5);
8692 
8693   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8694   float fvalue2 = aarch64_get_FP_float (cpu, sm);
8695 
8696   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8697   set_flags_for_float_compare (cpu, fvalue1, fvalue2);
8698 }
8699 
8700 /* Float compare to zero -- Invalid Operation exception on all NaNs.  */
8701 static void
8702 fcmpzes (sim_cpu *cpu)
8703 {
8704   unsigned sn = INSTR ( 9,  5);
8705   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8706 
8707   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8708   set_flags_for_float_compare (cpu, fvalue1, 0.0f);
8709 }
8710 
8711 static void
8712 set_flags_for_double_compare (sim_cpu *cpu, double dval1, double dval2)
8713 {
8714   uint32_t flags;
8715 
8716   /* FIXME: Add exception raising.  */
8717   if (isnan (dval1) || isnan (dval2))
8718     flags = C|V;
8719   else if (isinf (dval1) && isinf (dval2))
8720     {
8721       /* Subtracting two infinities may give a NaN.  Instead compare the
8722 	 signs; this relies on isinf returning -1/+1 (as GNU isinf does).  */
8723       int result = isinf (dval1) - isinf (dval2);
8724 
8725       if (result == 0)
8726 	flags = Z|C;
8727       else if (result < 0)
8728 	flags = N;
8729       else /* (result > 0).  */
8730 	flags = C;
8731     }
8732   else
8733     {
8734       double result = dval1 - dval2;
8735 
8736       if (result == 0.0)
8737 	flags = Z|C;
8738       else if (result < 0)
8739 	flags = N;
8740       else /* (result > 0).  */
8741 	flags = C;
8742     }
8743 
8744   aarch64_set_CPSR (cpu, flags);
8745 }
8746 
8747 /* Double compare -- Invalid Operation exception only on signaling NaNs.  */
8748 static void
8749 fcmpd (sim_cpu *cpu)
8750 {
8751   unsigned sm = INSTR (20, 16);
8752   unsigned sn = INSTR ( 9,  5);
8753 
8754   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8755   double dvalue2 = aarch64_get_FP_double (cpu, sm);
8756 
8757   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8758   set_flags_for_double_compare (cpu, dvalue1, dvalue2);
8759 }
8760 
8761 /* Double compare to zero -- Invalid Operation exception
8762    only on signaling NaNs.  */
8763 static void
8764 fcmpzd (sim_cpu *cpu)
8765 {
8766   unsigned sn = INSTR ( 9,  5);
8767   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8768 
8769   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8770   set_flags_for_double_compare (cpu, dvalue1, 0.0);
8771 }
8772 
8773 /* Double compare -- Invalid Operation exception on all NaNs.  */
8774 static void
8775 fcmped (sim_cpu *cpu)
8776 {
8777   unsigned sm = INSTR (20, 16);
8778   unsigned sn = INSTR ( 9,  5);
8779 
8780   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8781   double dvalue2 = aarch64_get_FP_double (cpu, sm);
8782 
8783   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8784   set_flags_for_double_compare (cpu, dvalue1, dvalue2);
8785 }
8786 
8787 /* Double compare to zero -- Invalid Operation exception on all NaNs.  */
8788 static void
8789 fcmpzed (sim_cpu *cpu)
8790 {
8791   unsigned sn = INSTR ( 9,  5);
8792   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8793 
8794   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8795   set_flags_for_double_compare (cpu, dvalue1, 0.0);
8796 }
8797 
8798 static void
8799 dexSimpleFPCompare (sim_cpu *cpu)
8800 {
8801   /* assert instr[28,25] == 1111
8802      instr[30] = 0, instr[24] = 0, instr[21] = 1, instr[13,10] = 1000
8803      instr[31] = M : 0 ==> OK, 1 ==> UNALLOC
8804      instr[29] ==> S :  0 ==> OK, 1 ==> UNALLOC
8805      instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
8806      instr[15,14] ==> op : 00 ==> OK, ow ==> UNALLOC
8807      instr[4,0] ==> opcode2 : 00000 ==> FCMP, 10000 ==> FCMPE,
8808                               01000 ==> FCMPZ, 11000 ==> FCMPEZ,
8809                               ow ==> UNALLOC  */
8810   uint32_t dispatch;
8811   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
8812   uint32_t type = INSTR (23, 22);
8813   uint32_t op = INSTR (15, 14);
8814   uint32_t op2_2_0 = INSTR (2, 0);
8815 
8816   if (op2_2_0 != 0)
8817     HALT_UNALLOC;
8818 
8819   if (M_S != 0)
8820     HALT_UNALLOC;
8821 
8822   if (type > 1)
8823     HALT_UNALLOC;
8824 
8825   if (op != 0)
8826     HALT_UNALLOC;
8827 
8828   /* dispatch on type and top 2 bits of opcode.  */
8829   dispatch = (type << 2) | INSTR (4, 3);
8830 
8831   switch (dispatch)
8832     {
8833     case 0: fcmps (cpu); return;
8834     case 1: fcmpzs (cpu); return;
8835     case 2: fcmpes (cpu); return;
8836     case 3: fcmpzes (cpu); return;
8837     case 4: fcmpd (cpu); return;
8838     case 5: fcmpzd (cpu); return;
8839     case 6: fcmped (cpu); return;
8840     case 7: fcmpzed (cpu); return;
8841     }
8842 }
8843 
8844 static void
8845 do_scalar_FADDP (sim_cpu *cpu)
8846 {
8847   /* instr [31,23] = 0111 1110 0
8848      instr [22]    = single(0)/double(1)
8849      instr [21,10] = 11 0000 1101 10
8850      instr [9,5]   = Fn
8851      instr [4,0]   = Fd.  */
8852 
8853   unsigned Fn = INSTR (9, 5);
8854   unsigned Fd = INSTR (4, 0);
8855 
8856   NYI_assert (31, 23, 0x0FC);
8857   NYI_assert (21, 10, 0xC36);
8858 
8859   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8860   if (INSTR (22, 22))
8861     {
8862       double val1 = aarch64_get_vec_double (cpu, Fn, 0);
8863       double val2 = aarch64_get_vec_double (cpu, Fn, 1);
8864 
8865       aarch64_set_FP_double (cpu, Fd, val1 + val2);
8866     }
8867   else
8868     {
8869       float val1 = aarch64_get_vec_float (cpu, Fn, 0);
8870       float val2 = aarch64_get_vec_float (cpu, Fn, 1);
8871 
8872       aarch64_set_FP_float (cpu, Fd, val1 + val2);
8873     }
8874 }
8875 
8876 /* Floating point absolute difference.  */
8877 
8878 static void
8879 do_scalar_FABD (sim_cpu *cpu)
8880 {
8881   /* instr [31,23] = 0111 1110 1
8882      instr [22]    = float(0)/double(1)
8883      instr [21]    = 1
8884      instr [20,16] = Rm
8885      instr [15,10] = 1101 01
8886      instr [9, 5]  = Rn
8887      instr [4, 0]  = Rd.  */
8888 
8889   unsigned rm = INSTR (20, 16);
8890   unsigned rn = INSTR (9, 5);
8891   unsigned rd = INSTR (4, 0);
8892 
8893   NYI_assert (31, 23, 0x0FD);
8894   NYI_assert (21, 21, 1);
8895   NYI_assert (15, 10, 0x35);
8896 
8897   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8898   if (INSTR (22, 22))
8899     aarch64_set_FP_double (cpu, rd,
8900 			   fabs (aarch64_get_FP_double (cpu, rn)
8901 				 - aarch64_get_FP_double (cpu, rm)));
8902   else
8903     aarch64_set_FP_float (cpu, rd,
8904 			  fabsf (aarch64_get_FP_float (cpu, rn)
8905 				 - aarch64_get_FP_float (cpu, rm)));
8906 }
8907 
8908 static void
8909 do_scalar_CMGT (sim_cpu *cpu)
8910 {
8911   /* instr [31,21] = 0111 1110 111
8912      instr [20,16] = Rm
8913      instr [15,10] = 00 1101
8914      instr [9, 5]  = Rn
8915      instr [4, 0]  = Rd.  */
8916 
8917   unsigned rm = INSTR (20, 16);
8918   unsigned rn = INSTR (9, 5);
8919   unsigned rd = INSTR (4, 0);
8920 
8921   NYI_assert (31, 21, 0x3F7);
8922   NYI_assert (15, 10, 0x0D);
8923 
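  /* Note U == 1 here, so this is the unsigned compare, aka CMHI.  */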
8924   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8925   aarch64_set_vec_u64 (cpu, rd, 0,
8926 		       aarch64_get_vec_u64 (cpu, rn, 0) >
8927 		       aarch64_get_vec_u64 (cpu, rm, 0) ? -1L : 0L);
8928 }
8929 
8930 static void
8931 do_scalar_USHR (sim_cpu *cpu)
8932 {
8933   /* instr [31,23] = 0111 1111 0
8934      instr [22,16] = shift amount
8935      instr [15,10] = 0000 01
8936      instr [9, 5]  = Rn
8937      instr [4, 0]  = Rd.  */
8938 
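  /* The shift is encoded as 128 - immh:immb; the 64-bit scalar form
     requires immh<3> (bit 22) set, giving shifts of 1..64.  */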
8939   unsigned amount = 128 - INSTR (22, 16);
8940   unsigned rn = INSTR (9, 5);
8941   unsigned rd = INSTR (4, 0);
8942 
8943   NYI_assert (31, 23, 0x0FE);
8944   NYI_assert (15, 10, 0x01);
8945 
8946   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8947   aarch64_set_vec_u64 (cpu, rd, 0,
8948 		       aarch64_get_vec_u64 (cpu, rn, 0) >> amount);
8949 }
8950 
8951 static void
8952 do_scalar_SSHL (sim_cpu *cpu)
8953 {
8954   /* instr [31,21] = 0101 1110 111
8955      instr [20,16] = Rm
8956      instr [15,10] = 0100 01
8957      instr [9, 5]  = Rn
8958      instr [4, 0]  = Rd.  */
8959 
8960   unsigned rm = INSTR (20, 16);
8961   unsigned rn = INSTR (9, 5);
8962   unsigned rd = INSTR (4, 0);
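  /* The shift count is the signed byte in the low element of Vm;
     a negative count shifts right instead of left.  */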
8963   signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);
8964 
8965   NYI_assert (31, 21, 0x2F7);
8966   NYI_assert (15, 10, 0x11);
8967 
8968   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8969   if (shift >= 0)
8970     aarch64_set_vec_s64 (cpu, rd, 0,
8971 			 aarch64_get_vec_s64 (cpu, rn, 0) << shift);
8972   else
8973     aarch64_set_vec_s64 (cpu, rd, 0,
8974 			 aarch64_get_vec_s64 (cpu, rn, 0) >> - shift);
8975 }
8976 
8977 /* Floating point scalar compare greater than or equal to 0.  */
8978 static void
8979 do_scalar_FCMGE_zero (sim_cpu *cpu)
8980 {
8981   /* instr [31,23] = 0111 1110 1
8982      instr [22,22] = size
8983      instr [21,16] = 1000 00
8984      instr [15,10] = 1100 10
8985      instr [9, 5]  = Rn
8986      instr [4, 0]  = Rd.  */
8987 
8988   unsigned size = INSTR (22, 22);
8989   unsigned rn = INSTR (9, 5);
8990   unsigned rd = INSTR (4, 0);
8991 
8992   NYI_assert (31, 23, 0x0FD);
8993   NYI_assert (21, 16, 0x20);
8994   NYI_assert (15, 10, 0x32);
8995 
8996   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8997   if (size)
8998     aarch64_set_vec_u64 (cpu, rd, 0,
8999 			 aarch64_get_vec_double (cpu, rn, 0) >= 0.0 ? -1 : 0);
9000   else
9001     aarch64_set_vec_u32 (cpu, rd, 0,
9002 			 aarch64_get_vec_float (cpu, rn, 0) >= 0.0 ? -1 : 0);
9003 }
9004 
9005 /* Floating point scalar compare less than or equal to 0.  */
9006 static void
9007 do_scalar_FCMLE_zero (sim_cpu *cpu)
9008 {
9009   /* instr [31,23] = 0111 1110 1
9010      instr [22,22] = size
9011      instr [21,16] = 1000 00
9012      instr [15,10] = 1101 10
9013      instr [9, 5]  = Rn
9014      instr [4, 0]  = Rd.  */
9015 
9016   unsigned size = INSTR (22, 22);
9017   unsigned rn = INSTR (9, 5);
9018   unsigned rd = INSTR (4, 0);
9019 
9020   NYI_assert (31, 23, 0x0FD);
9021   NYI_assert (21, 16, 0x20);
9022   NYI_assert (15, 10, 0x36);
9023 
9024   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9025   if (size)
9026     aarch64_set_vec_u64 (cpu, rd, 0,
9027 			 aarch64_get_vec_double (cpu, rn, 0) <= 0.0 ? -1 : 0);
9028   else
9029     aarch64_set_vec_u32 (cpu, rd, 0,
9030 			 aarch64_get_vec_float (cpu, rn, 0) <= 0.0 ? -1 : 0);
9031 }
9032 
9033 /* Floating point scalar compare greater than 0.  */
9034 static void
9035 do_scalar_FCMGT_zero (sim_cpu *cpu)
9036 {
9037   /* instr [31,23] = 0101 1110 1
9038      instr [22,22] = size
9039      instr [21,16] = 1000 00
9040      instr [15,10] = 1100 10
9041      instr [9, 5]  = Rn
9042      instr [4, 0]  = Rd.  */
9043 
9044   unsigned size = INSTR (22, 22);
9045   unsigned rn = INSTR (9, 5);
9046   unsigned rd = INSTR (4, 0);
9047 
9048   NYI_assert (31, 23, 0x0BD);
9049   NYI_assert (21, 16, 0x20);
9050   NYI_assert (15, 10, 0x32);
9051 
9052   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9053   if (size)
9054     aarch64_set_vec_u64 (cpu, rd, 0,
9055 			 aarch64_get_vec_double (cpu, rn, 0) > 0.0 ? -1 : 0);
9056   else
9057     aarch64_set_vec_u32 (cpu, rd, 0,
9058 			 aarch64_get_vec_float (cpu, rn, 0) > 0.0 ? -1 : 0);
9059 }
9060 
9061 /* Floating point scalar compare equal to 0.  */
9062 static void
9063 do_scalar_FCMEQ_zero (sim_cpu *cpu)
9064 {
9065   /* instr [31,23] = 0101 1110 1
9066      instr [22,22] = size
9067      instr [21,16] = 1000 00
9068      instr [15,10] = 1101 10
9069      instr [9, 5]  = Rn
9070      instr [4, 0]  = Rd.  */
9071 
9072   unsigned size = INSTR (22, 22);
9073   unsigned rn = INSTR (9, 5);
9074   unsigned rd = INSTR (4, 0);
9075 
9076   NYI_assert (31, 23, 0x0BD);
9077   NYI_assert (21, 16, 0x20);
9078   NYI_assert (15, 10, 0x36);
9079 
9080   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9081   if (size)
9082     aarch64_set_vec_u64 (cpu, rd, 0,
9083 			 aarch64_get_vec_double (cpu, rn, 0) == 0.0 ? -1 : 0);
9084   else
9085     aarch64_set_vec_u32 (cpu, rd, 0,
9086 			 aarch64_get_vec_float (cpu, rn, 0) == 0.0 ? -1 : 0);
9087 }
9088 
9089 /* Floating point scalar compare less than 0.  */
9090 static void
9091 do_scalar_FCMLT_zero (sim_cpu *cpu)
9092 {
9093   /* instr [31,23] = 0101 1110 1
9094      instr [22,22] = size
9095      instr [21,16] = 1000 00
9096      instr [15,10] = 1110 10
9097      instr [9, 5]  = Rn
9098      instr [4, 0]  = Rd.  */
9099 
9100   unsigned size = INSTR (22, 22);
9101   unsigned rn = INSTR (9, 5);
9102   unsigned rd = INSTR (4, 0);
9103 
9104   NYI_assert (31, 23, 0x0BD);
9105   NYI_assert (21, 16, 0x20);
9106   NYI_assert (15, 10, 0x3A);
9107 
9108   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9109   if (size)
9110     aarch64_set_vec_u64 (cpu, rd, 0,
9111 			 aarch64_get_vec_double (cpu, rn, 0) < 0.0 ? -1 : 0);
9112   else
9113     aarch64_set_vec_u32 (cpu, rd, 0,
9114 			 aarch64_get_vec_float (cpu, rn, 0) < 0.0 ? -1 : 0);
9115 }
9116 
9117 static void
9118 do_scalar_shift (sim_cpu *cpu)
9119 {
9120   /* instr [31,23] = 0101 1111 0
9121      instr [22,16] = shift amount
9122      instr [15,10] = 0101 01   [SHL]
9123      instr [15,10] = 0000 01   [SSHR]
9124      instr [9, 5]  = Rn
9125      instr [4, 0]  = Rd.  */
9126 
9127   unsigned rn = INSTR (9, 5);
9128   unsigned rd = INSTR (4, 0);
9129   unsigned amount;
9130 
9131   NYI_assert (31, 23, 0x0BE);
9132 
9133   if (INSTR (22, 22) == 0)
9134     HALT_UNALLOC;
9135 
9136   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
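  /* The 64-bit scalar forms require immh<3> (bit 22) set: SSHR encodes
     a shift of 128 - immh:immb (1..64), SHL a shift of immh:immb - 64
     (0..63).  */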
9137   switch (INSTR (15, 10))
9138     {
9139     case 0x01: /* SSHR */
9140       amount = 128 - INSTR (22, 16);
9141       aarch64_set_vec_s64 (cpu, rd, 0,
9142 			   aarch64_get_vec_s64 (cpu, rn, 0) >> amount);
9143       return;
9144     case 0x15: /* SHL */
9145       amount = INSTR (22, 16) - 64;
9146       aarch64_set_vec_u64 (cpu, rd, 0,
9147 			   aarch64_get_vec_u64 (cpu, rn, 0) << amount);
9148       return;
9149     default:
9150       HALT_NYI;
9151     }
9152 }
9153 
9154 /* FCMEQ FCMGT FCMGE.  */
9155 static void
9156 do_scalar_FCM (sim_cpu *cpu)
9157 {
9158   /* instr [31,30] = 01
9159      instr [29]    = U
9160      instr [28,24] = 1 1110
9161      instr [23]    = E
9162      instr [22]    = size
9163      instr [21]    = 1
9164      instr [20,16] = Rm
9165      instr [15,12] = 1110
9166      instr [11]    = AC
9167      instr [10]    = 1
9168      instr [9, 5]  = Rn
9169      instr [4, 0]  = Rd.  */
9170 
9171   unsigned rm = INSTR (20, 16);
9172   unsigned rn = INSTR (9, 5);
9173   unsigned rd = INSTR (4, 0);
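  /* Dispatch on E:U:ac.  000 ==> FCMEQ, 010 ==> FCMGE, 011 ==> FACGE,
     110 ==> FCMGT, 111 ==> FACGT; other combinations are UNALLOC.  */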
9174   unsigned EUac = (INSTR (23, 23) << 2) | (INSTR (29, 29) << 1) | INSTR (11, 11);
9175   unsigned result;
9176   float val1;
9177   float val2;
9178 
9179   NYI_assert (31, 30, 1);
9180   NYI_assert (28, 24, 0x1E);
9181   NYI_assert (21, 21, 1);
9182   NYI_assert (15, 12, 0xE);
9183   NYI_assert (10, 10, 1);
9184 
9185   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9186   if (INSTR (22, 22))
9187     {
9188       double val1 = aarch64_get_FP_double (cpu, rn);
9189       double val2 = aarch64_get_FP_double (cpu, rm);
9190 
9191       switch (EUac)
9192 	{
9193 	case 0: /* 000 */
9194 	  result = val1 == val2;
9195 	  break;
9196 
9197 	case 3: /* 011 */
9198 	  val1 = fabs (val1);
9199 	  val2 = fabs (val2);
9200 	  /* Fall through. */
9201 	case 2: /* 010 */
9202 	  result = val1 >= val2;
9203 	  break;
9204 
9205 	case 7: /* 111 */
9206 	  val1 = fabs (val1);
9207 	  val2 = fabs (val2);
9208 	  /* Fall through. */
9209 	case 6: /* 110 */
9210 	  result = val1 > val2;
9211 	  break;
9212 
9213 	default:
9214 	  HALT_UNALLOC;
9215 	}
9216 
9217       aarch64_set_vec_u64 (cpu, rd, 0, result ? -1ULL : 0);
9218       return;
9219     }
9220 
9221   val1 = aarch64_get_FP_float (cpu, rn);
9222   val2 = aarch64_get_FP_float (cpu, rm);
9223 
9224   switch (EUac)
9225     {
9226     case 0: /* 000 */
9227       result = val1 == val2;
9228       break;
9229 
9230     case 3: /* 011 */
9231       val1 = fabsf (val1);
9232       val2 = fabsf (val2);
9233       /* Fall through. */
9234     case 2: /* 010 */
9235       result = val1 >= val2;
9236       break;
9237 
9238     case 7: /* 111 */
9239       val1 = fabsf (val1);
9240       val2 = fabsf (val2);
9241       /* Fall through. */
9242     case 6: /* 110 */
9243       result = val1 > val2;
9244       break;
9245 
9246     default:
9247       HALT_UNALLOC;
9248     }
9249 
9250   aarch64_set_vec_u32 (cpu, rd, 0, result ? -1 : 0);
9251 }
9252 
9253 /* An alias of DUP.  */
9254 static void
9255 do_scalar_MOV (sim_cpu *cpu)
9256 {
9257   /* instr [31,21] = 0101 1110 000
9258      instr [20,16] = imm5
9259      instr [15,10] = 0000 01
9260      instr [9, 5]  = Rn
9261      instr [4, 0]  = Rd.  */
9262 
9263   unsigned rn = INSTR (9, 5);
9264   unsigned rd = INSTR (4, 0);
9265   unsigned index;
9266 
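  /* The lowest set bit of imm5 gives the element size (byte, half,
     word or doubleword); the bits above it give the element index.  */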
9267   NYI_assert (31, 21, 0x2F0);
9268   NYI_assert (15, 10, 0x01);
9269 
9270   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9271   if (INSTR (16, 16))
9272     {
9273       /* 8-bit.  */
9274       index = INSTR (20, 17);
9275       aarch64_set_vec_u8
9276 	(cpu, rd, 0, aarch64_get_vec_u8 (cpu, rn, index));
9277     }
9278   else if (INSTR (17, 17))
9279     {
9280       /* 16-bit.  */
9281       index = INSTR (20, 18);
9282       aarch64_set_vec_u16
9283 	(cpu, rd, 0, aarch64_get_vec_u16 (cpu, rn, index));
9284     }
9285   else if (INSTR (18, 18))
9286     {
9287       /* 32-bit.  */
9288       index = INSTR (20, 19);
9289       aarch64_set_vec_u32
9290 	(cpu, rd, 0, aarch64_get_vec_u32 (cpu, rn, index));
9291     }
9292   else if (INSTR (19, 19))
9293     {
9294       /* 64-bit.  */
9295       index = INSTR (20, 20);
9296       aarch64_set_vec_u64
9297 	(cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, index));
9298     }
9299   else
9300     HALT_UNALLOC;
9301 }
9302 
9303 static void
9304 do_scalar_NEG (sim_cpu *cpu)
9305 {
9306   /* instr [31,10] = 0111 1110 1110 0000 1011 10
9307      instr [9, 5]  = Rn
9308      instr [4, 0]  = Rd.  */
9309 
9310   unsigned rn = INSTR (9, 5);
9311   unsigned rd = INSTR (4, 0);
9312 
9313   NYI_assert (31, 10, 0x1FB82E);
9314 
9315   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9316   aarch64_set_vec_u64 (cpu, rd, 0, - aarch64_get_vec_u64 (cpu, rn, 0));
9317 }
9318 
9319 static void
9320 do_scalar_USHL (sim_cpu *cpu)
9321 {
9322   /* instr [31,21] = 0111 1110 111
9323      instr [20,16] = Rm
9324      instr [15,10] = 0100 01
9325      instr [9, 5]  = Rn
9326      instr [4, 0]  = Rd.  */
9327 
9328   unsigned rm = INSTR (20, 16);
9329   unsigned rn = INSTR (9, 5);
9330   unsigned rd = INSTR (4, 0);
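  /* The shift count is the signed byte in the low element of Vm;
     a negative count shifts right instead of left.  */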
9331   signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);
9332 
9333   NYI_assert (31, 21, 0x3F7);
9334   NYI_assert (15, 10, 0x11);
9335 
9336   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9337   if (shift >= 0)
9338     aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) << shift);
9339   else
9340     aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) >> - shift);
9341 }
9342 
9343 static void
9344 do_double_add (sim_cpu *cpu)
9345 {
9346   /* instr [31,21] = 0101 1110 111
9347      instr [20,16] = Fn
9348      instr [15,10] = 1000 01
9349      instr [9,5]   = Fm
9350      instr [4,0]   = Fd.  */
9351   unsigned Fd;
9352   unsigned Fm;
9353   unsigned Fn;
9354   double val1;
9355   double val2;
9356 
9357   NYI_assert (31, 21, 0x2F7);
9358   NYI_assert (15, 10, 0x21);
9359 
9360   Fd = INSTR (4, 0);
9361   Fm = INSTR (9, 5);
9362   Fn = INSTR (20, 16);
9363 
9364   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9365   val1 = aarch64_get_FP_double (cpu, Fm);
9366   val2 = aarch64_get_FP_double (cpu, Fn);
9367 
9368   aarch64_set_FP_double (cpu, Fd, val1 + val2);
9369 }
9370 
9371 static void
9372 do_scalar_UCVTF (sim_cpu *cpu)
9373 {
9374   /* instr [31,23] = 0111 1110 0
9375      instr [22]    = single(0)/double(1)
9376      instr [21,10] = 10 0001 1101 10
9377      instr [9,5]   = rn
9378      instr [4,0]   = rd.  */
9379 
9380   unsigned rn = INSTR (9, 5);
9381   unsigned rd = INSTR (4, 0);
9382 
9383   NYI_assert (31, 23, 0x0FC);
9384   NYI_assert (21, 10, 0x876);
9385 
9386   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9387   if (INSTR (22, 22))
9388     {
9389       uint64_t val = aarch64_get_vec_u64 (cpu, rn, 0);
9390 
9391       aarch64_set_vec_double (cpu, rd, 0, (double) val);
9392     }
9393   else
9394     {
9395       uint32_t val = aarch64_get_vec_u32 (cpu, rn, 0);
9396 
9397       aarch64_set_vec_float (cpu, rd, 0, (float) val);
9398     }
9399 }
9400 
9401 static void
9402 do_scalar_vec (sim_cpu *cpu)
9403 {
9404   /* instr [30] = 1.  */
9405   /* instr [28,25] = 1111.  */
9406   switch (INSTR (31, 23))
9407     {
9408     case 0xBC:
9409       switch (INSTR (15, 10))
9410 	{
9411 	case 0x01: do_scalar_MOV (cpu); return;
9412 	case 0x39: do_scalar_FCM (cpu); return;
9413 	case 0x3B: do_scalar_FCM (cpu); return;
9414 	}
9415       break;
9416 
9417     case 0xBE: do_scalar_shift (cpu); return;
9418 
9419     case 0xFC:
9420       switch (INSTR (15, 10))
9421 	{
9422 	case 0x36:
9423 	  switch (INSTR (21, 16))
9424 	    {
9425 	    case 0x30: do_scalar_FADDP (cpu); return;
9426 	    case 0x21: do_scalar_UCVTF (cpu); return;
9427 	    }
9428 	  HALT_NYI;
9429 	case 0x39: do_scalar_FCM (cpu); return;
9430 	case 0x3B: do_scalar_FCM (cpu); return;
9431 	}
9432       break;
9433 
9434     case 0xFD:
9435       switch (INSTR (15, 10))
9436 	{
9437 	case 0x0D: do_scalar_CMGT (cpu); return;
9438 	case 0x11: do_scalar_USHL (cpu); return;
9439 	case 0x2E: do_scalar_NEG (cpu); return;
9440 	case 0x32: do_scalar_FCMGE_zero (cpu); return;
9441 	case 0x35: do_scalar_FABD (cpu); return;
9442 	case 0x36: do_scalar_FCMLE_zero (cpu); return;
9443 	case 0x39: do_scalar_FCM (cpu); return;
9444 	case 0x3B: do_scalar_FCM (cpu); return;
9445 	default:
9446 	  HALT_NYI;
9447 	}
9448 
9449     case 0xFE: do_scalar_USHR (cpu); return;
9450 
9451     case 0xBD:
9452       switch (INSTR (15, 10))
9453 	{
9454 	case 0x21: do_double_add (cpu); return;
9455 	case 0x11: do_scalar_SSHL (cpu); return;
9456 	case 0x32: do_scalar_FCMGT_zero (cpu); return;
9457 	case 0x36: do_scalar_FCMEQ_zero (cpu); return;
9458 	case 0x3A: do_scalar_FCMLT_zero (cpu); return;
9459 	default:
9460 	  HALT_NYI;
9461 	}
9462 
9463     default:
9464       HALT_NYI;
9465     }
9466 }
9467 
9468 static void
9469 dexAdvSIMD1 (sim_cpu *cpu)
9470 {
9471   /* instr [28,25] = 1 111.  */
9472 
9473   /* We are currently only interested in the basic
9474      scalar fp routines which all have bit 30 = 0.  */
9475   if (INSTR (30, 30))
9476     do_scalar_vec (cpu);
9477 
9478   /* instr[24] is set for FP data processing 3-source and clear for
9479      all other basic scalar fp instruction groups.  */
9480   else if (INSTR (24, 24))
9481     dexSimpleFPDataProc3Source (cpu);
9482 
9483   /* instr[21] is clear for floating <-> fixed conversions and set for
9484      all other basic scalar fp instruction groups.  */
9485   else if (!INSTR (21, 21))
9486     dexSimpleFPFixedConvert (cpu);
9487 
9488   /* instr[11,10] : 01 ==> cond compare, 10 ==> Data Proc 2 Source
9489      11 ==> cond select,  00 ==> other.  */
9490   else
9491     switch (INSTR (11, 10))
9492       {
9493       case 1: dexSimpleFPCondCompare (cpu); return;
9494       case 2: dexSimpleFPDataProc2Source (cpu); return;
9495       case 3: dexSimpleFPCondSelect (cpu); return;
9496 
9497       default:
9498 	/* Now an ordered cascade of tests.
9499 	   FP immediate has instr [12] == 1.
9500 	   FP compare has   instr [13] == 1.
9501 	   FP Data Proc 1 Source has instr [14] == 1.
9502 	   FP floating <--> integer conversions has instr [15] == 0.  */
9503 	if (INSTR (12, 12))
9504 	  dexSimpleFPImmediate (cpu);
9505 
9506 	else if (INSTR (13, 13))
9507 	  dexSimpleFPCompare (cpu);
9508 
9509 	else if (INSTR (14, 14))
9510 	  dexSimpleFPDataProc1Source (cpu);
9511 
9512 	else if (!INSTR (15, 15))
9513 	  dexSimpleFPIntegerConvert (cpu);
9514 
9515 	else
9516 	  /* If we get here then instr[15] == 1 which means UNALLOC.  */
9517 	  HALT_UNALLOC;
9518       }
9519 }
9520 
9521 /* PC relative addressing.  */
9522 
9523 static void
9524 pcadr (sim_cpu *cpu)
9525 {
9526   /* instr[31] = op : 0 ==> ADR, 1 ==> ADRP
9527      instr[30,29] = immlo
9528      instr[23,5] = immhi, instr[4,0] = Rd.  */
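  /* Worked example (illustrative values, not from this file): for
     ADRP (op = 1) with PC = 0x400693 and immhi:immlo = 1, the PC is
     masked down to 0x400000 and the offset scaled to 1 << 12, so the
     destination receives 0x401000, the base of the next 4KB page.  */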
9529   uint64_t address;
9530   unsigned rd = INSTR (4, 0);
9531   uint32_t isPage = INSTR (31, 31);
9532   union { int64_t s64; uint64_t u64; } imm;
9533   uint64_t offset;
9534 
9535   imm.s64 = simm64 (aarch64_get_instr (cpu), 23, 5);
9536   offset = imm.u64;
9537   offset = (offset << 2) | INSTR (30, 29);
9538 
9539   address = aarch64_get_PC (cpu);
9540 
9541   if (isPage)
9542     {
9543       offset <<= 12;
9544       address &= ~0xfff;
9545     }
9546 
9547   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9548   aarch64_set_reg_u64 (cpu, rd, NO_SP, address + offset);
9549 }
9550 
9551 /* Specific decode and execute for group Data Processing Immediate.  */
9552 
9553 static void
9554 dexPCRelAddressing (sim_cpu *cpu)
9555 {
9556   /* assert instr[28,24] = 10000.  */
9557   pcadr (cpu);
9558 }
9559 
9560 /* Immediate logical.
9561    The bimm32/64 argument is constructed by replicating a 2, 4, 8,
9562    16, 32 or 64 bit sequence pulled out at decode and possibly
9563    inverting it.
9564 
9565    N.B. the output register (dest) can normally be Xn or SP;
9566    the exception occurs for flag setting instructions, which may
9567    only use Xn for the output (dest).  The input register can
9568    never be SP.  */
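
/* Illustrative encoding (a standard example, not from this file):
   N = 0, immr = 0, imms = 0b111100 selects a 2 bit element with a
   single bit set, i.e. the pattern 01 replicated across the register:
   bimm32 = 0x55555555, bimm64 = 0x5555555555555555.  */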
9569 
9570 /* 32 bit and immediate.  */
9571 static void
9572 and32 (sim_cpu *cpu, uint32_t bimm)
9573 {
9574   unsigned rn = INSTR (9, 5);
9575   unsigned rd = INSTR (4, 0);
9576 
9577   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9578   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9579 		       aarch64_get_reg_u32 (cpu, rn, NO_SP) & bimm);
9580 }
9581 
9582 /* 64 bit and immediate.  */
9583 static void
9584 and64 (sim_cpu *cpu, uint64_t bimm)
9585 {
9586   unsigned rn = INSTR (9, 5);
9587   unsigned rd = INSTR (4, 0);
9588 
9589   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9590   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9591 		       aarch64_get_reg_u64 (cpu, rn, NO_SP) & bimm);
9592 }
9593 
9594 /* 32 bit and immediate set flags.  */
9595 static void
9596 ands32 (sim_cpu *cpu, uint32_t bimm)
9597 {
9598   unsigned rn = INSTR (9, 5);
9599   unsigned rd = INSTR (4, 0);
9600 
9601   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9602   uint32_t value2 = bimm;
9603 
9604   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9605   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9606   set_flags_for_binop32 (cpu, value1 & value2);
9607 }
9608 
9609 /* 64 bit and immediate set flags.  */
9610 static void
9611 ands64 (sim_cpu *cpu, uint64_t bimm)
9612 {
9613   unsigned rn = INSTR (9, 5);
9614   unsigned rd = INSTR (4, 0);
9615 
9616   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9617   uint64_t value2 = bimm;
9618 
9619   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9620   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9621   set_flags_for_binop64 (cpu, value1 & value2);
9622 }
9623 
9624 /* 32 bit exclusive or immediate.  */
9625 static void
9626 eor32 (sim_cpu *cpu, uint32_t bimm)
9627 {
9628   unsigned rn = INSTR (9, 5);
9629   unsigned rd = INSTR (4, 0);
9630 
9631   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9632   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9633 		       aarch64_get_reg_u32 (cpu, rn, NO_SP) ^ bimm);
9634 }
9635 
9636 /* 64 bit exclusive or immediate.  */
9637 static void
9638 eor64 (sim_cpu *cpu, uint64_t bimm)
9639 {
9640   unsigned rn = INSTR (9, 5);
9641   unsigned rd = INSTR (4, 0);
9642 
9643   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9644   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9645 		       aarch64_get_reg_u64 (cpu, rn, NO_SP) ^ bimm);
9646 }
9647 
9648 /* 32 bit or immediate.  */
9649 static void
9650 orr32 (sim_cpu *cpu, uint32_t bimm)
9651 {
9652   unsigned rn = INSTR (9, 5);
9653   unsigned rd = INSTR (4, 0);
9654 
9655   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9656   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9657 		       aarch64_get_reg_u32 (cpu, rn, NO_SP) | bimm);
9658 }
9659 
9660 /* 64 bit or immediate.  */
9661 static void
9662 orr64 (sim_cpu *cpu, uint64_t bimm)
9663 {
9664   unsigned rn = INSTR (9, 5);
9665   unsigned rd = INSTR (4, 0);
9666 
9667   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9668   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9669 		       aarch64_get_reg_u64 (cpu, rn, NO_SP) | bimm);
9670 }
9671 
9672 /* Logical shifted register.
9673    These allow an optional LSL, ASR, LSR or ROR to the second source
9674    register with a count up to the register bit count.
9675    N.B. register args may not be SP.  */
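
/* For example (illustrative): AND w0, w1, w2, LSR #3 reaches
   and32_shift below with an LSR shift and count = 3, computing
   w1 & (w2 >> 3).  */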
9676 
9677 /* 32 bit AND shifted register.  */
9678 static void
9679 and32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9680 {
9681   unsigned rm = INSTR (20, 16);
9682   unsigned rn = INSTR (9, 5);
9683   unsigned rd = INSTR (4, 0);
9684 
9685   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9686   aarch64_set_reg_u64
9687     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9688      & shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9689 }
9690 
9691 /* 64 bit AND shifted register.  */
9692 static void
9693 and64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9694 {
9695   unsigned rm = INSTR (20, 16);
9696   unsigned rn = INSTR (9, 5);
9697   unsigned rd = INSTR (4, 0);
9698 
9699   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9700   aarch64_set_reg_u64
9701     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9702      & shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9703 }
9704 
9705 /* 32 bit AND shifted register setting flags.  */
9706 static void
9707 ands32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9708 {
9709   unsigned rm = INSTR (20, 16);
9710   unsigned rn = INSTR (9, 5);
9711   unsigned rd = INSTR (4, 0);
9712 
9713   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9714   uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
9715 			       shift, count);
9716 
9717   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9718   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9719   set_flags_for_binop32 (cpu, value1 & value2);
9720 }
9721 
9722 /* 64 bit AND shifted register setting flags.  */
9723 static void
9724 ands64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9725 {
9726   unsigned rm = INSTR (20, 16);
9727   unsigned rn = INSTR (9, 5);
9728   unsigned rd = INSTR (4, 0);
9729 
9730   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9731   uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
9732 			       shift, count);
9733 
9734   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9735   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9736   set_flags_for_binop64 (cpu, value1 & value2);
9737 }
9738 
9739 /* 32 bit BIC shifted register.  */
9740 static void
9741 bic32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9742 {
9743   unsigned rm = INSTR (20, 16);
9744   unsigned rn = INSTR (9, 5);
9745   unsigned rd = INSTR (4, 0);
9746 
9747   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9748   aarch64_set_reg_u64
9749     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9750      & ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9751 }
9752 
9753 /* 64 bit BIC shifted register.  */
9754 static void
9755 bic64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9756 {
9757   unsigned rm = INSTR (20, 16);
9758   unsigned rn = INSTR (9, 5);
9759   unsigned rd = INSTR (4, 0);
9760 
9761   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9762   aarch64_set_reg_u64
9763     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9764      & ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9765 }
9766 
9767 /* 32 bit BIC shifted register setting flags.  */
9768 static void
9769 bics32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9770 {
9771   unsigned rm = INSTR (20, 16);
9772   unsigned rn = INSTR (9, 5);
9773   unsigned rd = INSTR (4, 0);
9774 
9775   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9776   uint32_t value2 = ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
9777 				 shift, count);
9778 
9779   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9780   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9781   set_flags_for_binop32 (cpu, value1 & value2);
9782 }
9783 
9784 /* 64 bit BIC shifted register setting flags.  */
9785 static void
9786 bics64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9787 {
9788   unsigned rm = INSTR (20, 16);
9789   unsigned rn = INSTR (9, 5);
9790   unsigned rd = INSTR (4, 0);
9791 
9792   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9793   uint64_t value2 = ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
9794 				 shift, count);
9795 
9796   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9797   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9798   set_flags_for_binop64 (cpu, value1 & value2);
9799 }
9800 
9801 /* 32 bit EON shifted register.  */
9802 static void
9803 eon32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9804 {
9805   unsigned rm = INSTR (20, 16);
9806   unsigned rn = INSTR (9, 5);
9807   unsigned rd = INSTR (4, 0);
9808 
9809   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9810   aarch64_set_reg_u64
9811     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9812      ^ ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9813 }
9814 
9815 /* 64 bit EON shifted register.  */
9816 static void
9817 eon64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9818 {
9819   unsigned rm = INSTR (20, 16);
9820   unsigned rn = INSTR (9, 5);
9821   unsigned rd = INSTR (4, 0);
9822 
9823   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9824   aarch64_set_reg_u64
9825     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9826      ^ ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9827 }
9828 
9829 /* 32 bit EOR shifted register.  */
9830 static void
9831 eor32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9832 {
9833   unsigned rm = INSTR (20, 16);
9834   unsigned rn = INSTR (9, 5);
9835   unsigned rd = INSTR (4, 0);
9836 
9837   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9838   aarch64_set_reg_u64
9839     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9840      ^ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9841 }
9842 
9843 /* 64 bit EOR shifted register.  */
9844 static void
9845 eor64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9846 {
9847   unsigned rm = INSTR (20, 16);
9848   unsigned rn = INSTR (9, 5);
9849   unsigned rd = INSTR (4, 0);
9850 
9851   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9852   aarch64_set_reg_u64
9853     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9854      ^ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9855 }
9856 
9857 /* 32 bit ORR shifted register.  */
9858 static void
9859 orr32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9860 {
9861   unsigned rm = INSTR (20, 16);
9862   unsigned rn = INSTR (9, 5);
9863   unsigned rd = INSTR (4, 0);
9864 
9865   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9866   aarch64_set_reg_u64
9867     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9868      | shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9869 }
9870 
9871 /* 64 bit ORR shifted register.  */
9872 static void
9873 orr64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9874 {
9875   unsigned rm = INSTR (20, 16);
9876   unsigned rn = INSTR (9, 5);
9877   unsigned rd = INSTR (4, 0);
9878 
9879   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9880   aarch64_set_reg_u64
9881     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9882      | shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9883 }
9884 
9885 /* 32 bit ORN shifted register.  */
9886 static void
9887 orn32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9888 {
9889   unsigned rm = INSTR (20, 16);
9890   unsigned rn = INSTR (9, 5);
9891   unsigned rd = INSTR (4, 0);
9892 
9893   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9894   aarch64_set_reg_u64
9895     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9896      | ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9897 }
9898 
9899 /* 64 bit ORN shifted register.  */
9900 static void
9901 orn64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9902 {
9903   unsigned rm = INSTR (20, 16);
9904   unsigned rn = INSTR (9, 5);
9905   unsigned rd = INSTR (4, 0);
9906 
9907   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9908   aarch64_set_reg_u64
9909     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9910      | ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9911 }
9912 
9913 static void
9914 dexLogicalImmediate (sim_cpu *cpu)
9915 {
9916   /* assert instr[28,23] = 100100
9917      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
9918      instr[30,29] = op : 0 ==> AND, 1 ==> ORR, 2 ==> EOR, 3 ==> ANDS
9919      instr[22] = N : used to construct immediate mask
9920      instr[21,16] = immr
9921      instr[15,10] = imms
9922      instr[9,5] = Rn
9923      instr[4,0] = Rd  */
9924 
9925   /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
9926   uint32_t size = INSTR (31, 31);
9927   uint32_t N = INSTR (22, 22);
9928   /* uint32_t immr = INSTR (21, 16);.  */
9929   /* uint32_t imms = INSTR (15, 10);.  */
9930   uint32_t index = INSTR (22, 10);
9931   uint64_t bimm64 = LITable [index];
9932   uint32_t dispatch = INSTR (30, 29);
9933 
9934   if (~size & N)
9935     HALT_UNALLOC;
9936 
9937   if (!bimm64)
9938     HALT_UNALLOC;
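
  /* N.B. index is the 13 bit N:immr:imms field, so LITable holds 2^13
     entries; slots left zero when the table was built correspond to
     values that expand_logical_immediate rejects, hence the halt
     above.  */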
9939 
9940   if (size == 0)
9941     {
9942       uint32_t bimm = (uint32_t) bimm64;
9943 
9944       switch (dispatch)
9945 	{
9946 	case 0: and32 (cpu, bimm); return;
9947 	case 1: orr32 (cpu, bimm); return;
9948 	case 2: eor32 (cpu, bimm); return;
9949 	case 3: ands32 (cpu, bimm); return;
9950 	}
9951     }
9952   else
9953     {
9954       switch (dispatch)
9955 	{
9956 	case 0: and64 (cpu, bimm64); return;
9957 	case 1: orr64 (cpu, bimm64); return;
9958 	case 2: eor64 (cpu, bimm64); return;
9959 	case 3: ands64 (cpu, bimm64); return;
9960 	}
9961     }
9962   HALT_UNALLOC;
9963 }
9964 
9965 /* Immediate move.
9966    The uimm argument is a 16 bit value to be inserted into the
9967    target register; the pos argument locates the 16 bit word in the
9968    dest register, i.e. it is in {0, 1} for 32 bit and {0, 1, 2,
9969    3} for 64 bit.
9970    N.B. the register arg may not be SP, so it should be
9971    accessed using the setGZRegisterXXX accessors.  */
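
/* Worked example (illustrative, not from this file): the constant
   0xdeadbeef00 can be materialised by
       movz x0, #0xef00             -- movz64 (cpu, 0xef00, 0)
       movk x0, #0xadbe, lsl #16    -- movk64 (cpu, 0xadbe, 1)
       movk x0, #0xde,   lsl #32    -- movk64 (cpu, 0x00de, 2)
   each step inserting one 16 bit chunk at bit position pos * 16.  */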
9972 
9973 /* 32 bit move 16 bit immediate zero remaining shorts.  */
9974 static void
9975 movz32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9976 {
9977   unsigned rd = INSTR (4, 0);
9978 
9979   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9980   aarch64_set_reg_u64 (cpu, rd, NO_SP, val << (pos * 16));
9981 }
9982 
9983 /* 64 bit move 16 bit immediate zero remaining shorts.  */
9984 static void
9985 movz64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9986 {
9987   unsigned rd = INSTR (4, 0);
9988 
9989   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9990   aarch64_set_reg_u64 (cpu, rd, NO_SP, ((uint64_t) val) << (pos * 16));
9991 }
9992 
9993 /* 32 bit move 16 bit immediate negated.  */
9994 static void
9995 movn32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9996 {
9997   unsigned rd = INSTR (4, 0);
9998 
9999   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10000   aarch64_set_reg_u64 (cpu, rd, NO_SP, ((val << (pos * 16)) ^ 0xffffffffU));
10001 }
10002 
10003 /* 64 bit move 16 bit immediate negated.  */
10004 static void
10005 movn64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
10006 {
10007   unsigned rd = INSTR (4, 0);
10008 
10009   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10010   aarch64_set_reg_u64
10011     (cpu, rd, NO_SP, ((((uint64_t) val) << (pos * 16))
10012 		      ^ 0xffffffffffffffffULL));
10013 }
10014 
10015 /* 32 bit move 16 bit immediate keep remaining shorts.  */
10016 static void
10017 movk32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
10018 {
10019   unsigned rd = INSTR (4, 0);
10020   uint32_t current = aarch64_get_reg_u32 (cpu, rd, NO_SP);
10021   uint32_t value = val << (pos * 16);
10022   uint32_t mask = ~(0xffffU << (pos * 16));
10023 
10024   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10025   aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask)));
10026 }
10027 
10028 /* 64 bit move 16 bit immediate keep remaining shorts.  */
10029 static void
10030 movk64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
10031 {
10032   unsigned rd = INSTR (4, 0);
10033   uint64_t current = aarch64_get_reg_u64 (cpu, rd, NO_SP);
10034   uint64_t value = (uint64_t) val << (pos * 16);
10035   uint64_t mask = ~(0xffffULL << (pos * 16));
10036 
10037   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10038   aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask)));
10039 }
10040 
10041 static void
10042 dexMoveWideImmediate (sim_cpu *cpu)
10043 {
10044   /* assert instr[28:23] = 100101
10045      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
10046      instr[30,29] = op : 0 ==> MOVN, 1 ==> UNALLOC, 2 ==> MOVZ, 3 ==> MOVK
10047      instr[22,21] = shift : 00 == LSL#0, 01 = LSL#16, 10 = LSL#32, 11 = LSL#48
10048      instr[20,5] = uimm16
10049      instr[4,0] = Rd  */
10050 
10051   /* N.B. the (multiple of 16) shift is applied by the called routine,
10052      we just pass the multiplier.  */
10053 
10054   uint32_t imm;
10055   uint32_t size = INSTR (31, 31);
10056   uint32_t op = INSTR (30, 29);
10057   uint32_t shift = INSTR (22, 21);
10058 
10059   /* 32 bit can only shift 0 or 1 lot of 16.
10060      Anything else is an unallocated instruction.  */
10061   if (size == 0 && (shift > 1))
10062     HALT_UNALLOC;
10063 
10064   if (op == 1)
10065     HALT_UNALLOC;
10066 
10067   imm = INSTR (20, 5);
10068 
10069   if (size == 0)
10070     {
10071       if (op == 0)
10072 	movn32 (cpu, imm, shift);
10073       else if (op == 2)
10074 	movz32 (cpu, imm, shift);
10075       else
10076 	movk32 (cpu, imm, shift);
10077     }
10078   else
10079     {
10080       if (op == 0)
10081 	movn64 (cpu, imm, shift);
10082       else if (op == 2)
10083 	movz64 (cpu, imm, shift);
10084       else
10085 	movk64 (cpu, imm, shift);
10086     }
10087 }
10088 
10089 /* Bitfield operations.
10090    These take a pair of bit positions r and s which are in {0..31}
10091    or {0..63} depending on the instruction word size.
10092    N.B. register args may not be SP.  */
10093 
10094 /* OK, we start with ubfm which just needs to pick
10095    some bits out of the source, zero the rest, and write
10096    the result to dest.  Just need two logical shifts.  */
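
/* Worked example (illustrative): UBFX w0, w1, #8, #8 decodes to UBFM
   with r = 8, s = 15.  Since r <= s, ubfm32 shifts left by
   31 - 15 = 16 (bit 15 reaches bit 31), then right by 31 + 8 - 15 = 24,
   leaving Wn<15:8> in Wd<7:0> with all other bits zero.  */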
10097 
10098 /* 32 bit bitfield move, left and right of affected zeroed
10099    if r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
10100 static void
10101 ubfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
10102 {
10103   unsigned rd;
10104   unsigned rn = INSTR (9, 5);
10105   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
10106 
10107   /* Pick either s+1-r or s+1 consecutive bits out of the original word.  */
10108   if (r <= s)
10109     {
10110       /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
10111          We want only bits s:xxx:r at the bottom of the word
10112          so we LSL bit s up to bit 31 i.e. by 31 - s
10113          and then we LSR to bring bit 31 down to bit s - r
10114 	 i.e. by 31 + r - s.  */
10115       value <<= 31 - s;
10116       value >>= 31 + r - s;
10117     }
10118   else
10119     {
10120       /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0
10121          We want only bits s:xxx:0 starting at bit 31-(r-1)
10122          so we LSL bit s up to bit 31 i.e. by 31 - s
10123          and then we LSR to bring bit 31 down to 31-(r-1)+s
10124 	 i.e. by r - (s + 1).  */
10125       value <<= 31 - s;
10126       value >>= r - (s + 1);
10127     }
10128 
10129   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10130   rd = INSTR (4, 0);
10131   aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
10132 }
10133 
10134 /* 64 bit bitfield move, left and right of affected zeroed
10135    if r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>.  */
10136 static void
10137 ubfm (sim_cpu *cpu, uint32_t r, uint32_t s)
10138 {
10139   unsigned rd;
10140   unsigned rn = INSTR (9, 5);
10141   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
10142 
10143   if (r <= s)
10144     {
10145       /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
10146          We want only bits s:xxx:r at the bottom of the word.
10147          So we LSL bit s up to bit 63 i.e. by 63 - s
10148          and then we LSR to bring bit 63 down to bit s - r
10149 	 i.e. by 63 + r - s.  */
10150       value <<= 63 - s;
10151       value >>= 63 + r - s;
10152     }
10153   else
10154     {
10155       /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0.
10156          We want only bits s:xxx:0 starting at bit 63-(r-1).
10157          So we LSL bit s up to bit 63 i.e. by 63 - s
10158          and then we LSR to bring bit 63 down to 63-(r-1)+s
10159 	 i.e. by r - (s + 1).  */
10160       value <<= 63 - s;
10161       value >>= r - (s + 1);
10162     }
10163 
10164   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10165   rd = INSTR (4, 0);
10166   aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
10167 }
10168 
10169 /* The signed versions need to insert sign bits
10170    on the left of the inserted bit field, so we do
10171    much the same as the unsigned version except we
10172    use an arithmetic shift right -- this just means
10173    we need to operate on signed values.  */
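
/* N.B. the signed helpers rely on >> of a negative signed value
   behaving as an arithmetic shift.  ISO C leaves this
   implementation-defined, though the compilers normally used to build
   the simulator (e.g. GCC) do shift arithmetically.  */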
10174 
10175 /* 32 bit bitfield move, left of affected sign-extended, right zeroed.  */
10176 /* If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
10177 static void
10178 sbfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
10179 {
10180   unsigned rd;
10181   unsigned rn = INSTR (9, 5);
10182   /* as per ubfm32 but use an ASR instead of an LSR.  */
10183   int32_t value = aarch64_get_reg_s32 (cpu, rn, NO_SP);
10184 
10185   if (r <= s)
10186     {
10187       value <<= 31 - s;
10188       value >>= 31 + r - s;
10189     }
10190   else
10191     {
10192       value <<= 31 - s;
10193       value >>= r - (s + 1);
10194     }
10195 
10196   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10197   rd = INSTR (4, 0);
10198   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
10199 }
10200 
10201 /* 64 bit bitfield move, left of affected sign-extended, right zeroed.  */
10202 /* If r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>.  */
10203 static void
10204 sbfm (sim_cpu *cpu, uint32_t r, uint32_t s)
10205 {
10206   unsigned rd;
10207   unsigned rn = INSTR (9, 5);
10208   /* as per ubfm but use an ASR instead of an LSR.  */
10209   int64_t value = aarch64_get_reg_s64 (cpu, rn, NO_SP);
10210 
10211   if (r <= s)
10212     {
10213       value <<= 63 - s;
10214       value >>= 63 + r - s;
10215     }
10216   else
10217     {
10218       value <<= 63 - s;
10219       value >>= r - (s + 1);
10220     }
10221 
10222   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10223   rd = INSTR (4, 0);
10224   aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
10225 }
10226 
10227 /* Finally, these versions leave non-affected bits
10228    as is, so we need to generate the bits as per
10229    ubfm and also generate a mask to pick the
10230    bits from the original and computed values.  */
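
/* Worked example (illustrative): BFI w0, w1, #8, #8 decodes to BFM
   with r = 24, s = 7.  As r > s, the value is shifted left by
   31 - 7 = 24 and back right by 24 - 8 = 16, a net shift of 8 left,
   and the mask becomes 0xff00: Wn<7:0> replaces Wd<15:8> while every
   other bit of Wd is preserved.  */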
10231 
10232 /* 32 bit bitfield move, non-affected bits left as is.
10233    If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
10234 static void
10235 bfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
10236 {
10237   unsigned rn = INSTR (9, 5);
10238   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
10239   uint32_t mask = -1;
10240   unsigned rd;
10241   uint32_t value2;
10242 
10243   /* Pick either s+1-r or s+1 consecutive bits out of the original word.  */
10244   if (r <= s)
10245     {
10246       /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
10247          We want only bits s:xxx:r at the bottom of the word
10248          so we LSL bit s up to bit 31 i.e. by 31 - s
10249          and then we LSR to bring bit 31 down to bit s - r
10250 	 i.e. by 31 + r - s.  */
10251       value <<= 31 - s;
10252       value >>= 31 + r - s;
10253       /* the mask must include the same bits.  */
10254       mask <<= 31 - s;
10255       mask >>= 31 + r - s;
10256     }
10257   else
10258     {
10259       /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0.
10260          We want only bits s:xxx:0 starting at bit 31-(r-1)
10261          so we LSL bit s up to bit 31 i.e. by 31 - s
10262          and then we LSR to bring bit 31 down to 31-(r-1)+s
10263 	 i.e. by r - (s + 1).  */
10264       value <<= 31 - s;
10265       value >>= r - (s + 1);
10266       /* The mask must include the same bits.  */
10267       mask <<= 31 - s;
10268       mask >>= r - (s + 1);
10269     }
10270 
10271   rd = INSTR (4, 0);
10272   value2 = aarch64_get_reg_u32 (cpu, rd, NO_SP);
10273 
10274   value2 &= ~mask;
10275   value2 |= value;
10276 
10277   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10278   aarch64_set_reg_u64 (cpu, rd, NO_SP, value2);
10280 }
10281 
10282 /* 64 bit bitfield move, non-affected bits left as is.
10283    If r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>.  */
10284 static void
10285 bfm (sim_cpu *cpu, uint32_t r, uint32_t s)
10286 {
10287   unsigned rd;
10288   unsigned rn = INSTR (9, 5);
10289   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
10290   uint64_t mask = 0xffffffffffffffffULL;
10291 
10292   if (r <= s)
10293     {
10294       /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
10295          We want only bits s:xxx:r at the bottom of the word
10296          so we LSL bit s up to bit 63 i.e. by 63 - s
10297          and then we LSR to bring bit 63 down to bit s - r
10298 	 i.e. by 63 + r - s.  */
10299       value <<= 63 - s;
10300       value >>= 63 + r - s;
10301       /* The mask must include the same bits.  */
10302       mask <<= 63 - s;
10303       mask >>= 63 + r - s;
10304     }
10305   else
10306     {
10307       /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0
10308          We want only bits s:xxx:0 starting at bit 63-(r-1)
10309          so we LSL bit s up to bit 63 i.e. by 63 - s
10310          and then we LSR to bring bit 63 down to 63-(r-1)+s
10311 	 i.e. by r - (s + 1).  */
10312       value <<= 63 - s;
10313       value >>= r - (s + 1);
10314       /* The mask must include the same bits.  */
10315       mask <<= 63 - s;
10316       mask >>= r - (s + 1);
10317     }
10318 
10319   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10320   rd = INSTR (4, 0);
10321   aarch64_set_reg_u64
10322     (cpu, rd, NO_SP, (aarch64_get_reg_u64 (cpu, rd, NO_SP) & ~mask) | value);
10323 }
10324 
10325 static void
10326 dexBitfieldImmediate (sim_cpu *cpu)
10327 {
10328   /* assert instr[28:23] = 100110
10329      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
10330      instr[30,29] = op : 0 ==> SBFM, 1 ==> BFM, 2 ==> UBFM, 3 ==> UNALLOC
10331      instr[22] = N : must be 0 for 32 bit, 1 for 64 bit ow UNALLOC
10332      instr[21,16] = immr : 0xxxxx for 32 bit, xxxxxx for 64 bit
10333      instr[15,10] = imms :  0xxxxx for 32 bit, xxxxxx for 64 bit
10334      instr[9,5] = Rn
10335      instr[4,0] = Rd  */
10336 
10337   /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
10338   uint32_t dispatch;
10339   uint32_t imms;
10340   uint32_t size = INSTR (31, 31);
10341   uint32_t N = INSTR (22, 22);
10342   /* 32 bit operations must have immr[5] = 0 and imms[5] = 0
10343      or else we have an UNALLOC.  */
10344   uint32_t immr = INSTR (21, 16);
10345 
10346   if (~size & N)
10347     HALT_UNALLOC;
10348 
10349   if (!size && uimm (immr, 5, 5))
10350     HALT_UNALLOC;
10351 
10352   imms = INSTR (15, 10);
10353   if (!size && uimm (imms, 5, 5))
10354     HALT_UNALLOC;
10355 
10356   /* Switch on combined size and op.  */
10357   dispatch = INSTR (31, 29);
10358   switch (dispatch)
10359     {
10360     case 0: sbfm32 (cpu, immr, imms); return;
10361     case 1: bfm32 (cpu, immr, imms); return;
10362     case 2: ubfm32 (cpu, immr, imms); return;
10363     case 4: sbfm (cpu, immr, imms); return;
10364     case 5: bfm (cpu, immr, imms); return;
10365     case 6: ubfm (cpu, immr, imms); return;
10366     default: HALT_UNALLOC;
10367     }
10368 }
10369 
10370 static void
10371 do_EXTR_32 (sim_cpu *cpu)
10372 {
10373   /* instr[31:21] = 00010011100
10374      instr[20,16] = Rm
10375      instr[15,10] = imms :  0xxxxx for 32 bit
10376      instr[9,5]   = Rn
10377      instr[4,0]   = Rd  */
10378   unsigned rm   = INSTR (20, 16);
10379   unsigned imms = INSTR (15, 10) & 31;
10380   unsigned rn   = INSTR ( 9,  5);
10381   unsigned rd   = INSTR ( 4,  0);
10382   uint64_t val1;
10383   uint64_t val2;
10384 
10385   val1 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
10386   val1 >>= imms;
10387   val2 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
        /* Mask back down to 32 bits: imms may be zero (shifting the
           64 bit val2 by 32 is still defined) and the top word of the
           64 bit destination must remain zero.  */
10388   val2 = (val2 << (32 - imms)) & 0xffffffffULL;
10389 
10390   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10391   aarch64_set_reg_u64 (cpu, rd, NO_SP, val1 | val2);
10392 }
10393 
10394 static void
10395 do_EXTR_64 (sim_cpu *cpu)
10396 {
10397   /* instr[31:21] = 10010011100
10398      instr[20,16] = Rm
10399      instr[15,10] = imms
10400      instr[9,5]   = Rn
10401      instr[4,0]   = Rd  */
10402   unsigned rm   = INSTR (20, 16);
10403   unsigned imms = INSTR (15, 10) & 63;
10404   unsigned rn   = INSTR ( 9,  5);
10405   unsigned rd   = INSTR ( 4,  0);
10406   uint64_t val;
10407 
10408   val = aarch64_get_reg_u64 (cpu, rm, NO_SP);
10409   val >>= imms;
        /* A shift by 64 would be undefined in C; with imms == 0 the
           result is simply Rm.  */
10410   if (imms > 0)
          val |= (aarch64_get_reg_u64 (cpu, rn, NO_SP) << (64 - imms));
10411 
10412   aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
10413 }
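
/* N.B. when Rn == Rm the extract degenerates to a rotate:
   ROR x0, x1, #n is encoded as EXTR x0, x1, x1, #n.  */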
10414 
10415 static void
10416 dexExtractImmediate (sim_cpu *cpu)
10417 {
10418   /* assert instr[28:23] = 100111
10419      instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
10420      instr[30,29] = op21 : 0 ==> EXTR, 1,2,3 ==> UNALLOC
10421      instr[22]    = N : must be 0 for 32 bit, 1 for 64 bit or UNALLOC
10422      instr[21]    = op0 : must be 0 or UNALLOC
10423      instr[20,16] = Rm
10424      instr[15,10] = imms :  0xxxxx for 32 bit, xxxxxx for 64 bit
10425      instr[9,5]   = Rn
10426      instr[4,0]   = Rd  */
10427 
10428   /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
10429   /* 64 bit operations must have N = 1 or else we have an UNALLOC.  */
10430   uint32_t dispatch;
10431   uint32_t size = INSTR (31, 31);
10432   uint32_t N = INSTR (22, 22);
10433   /* 32 bit operations must have imms[5] = 0
10434      or else we have an UNALLOC.  */
10435   uint32_t imms = INSTR (15, 10);
10436 
10437   if (size ^ N)
10438     HALT_UNALLOC;
10439 
10440   if (!size && uimm (imms, 5, 5))
10441     HALT_UNALLOC;
10442 
10443   /* Switch on combined size and op.  */
10444   dispatch = INSTR (31, 29);
10445 
10446   if (dispatch == 0)
10447     do_EXTR_32 (cpu);
10448 
10449   else if (dispatch == 4)
10450     do_EXTR_64 (cpu);
10451 
10452   else if (dispatch == 1)
10453     HALT_NYI;
10454   else
10455     HALT_UNALLOC;
10456 }
10457 
10458 static void
10459 dexDPImm (sim_cpu *cpu)
10460 {
10461   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
10462      assert  group == GROUP_DPIMM_1000 || group == GROUP_DPIMM_1001
10463      bits [25,23] of a DPImm are the secondary dispatch vector.  */
10464   uint32_t group2 = dispatchDPImm (aarch64_get_instr (cpu));
10465 
10466   switch (group2)
10467     {
10468     case DPIMM_PCADR_000:
10469     case DPIMM_PCADR_001:
10470       dexPCRelAddressing (cpu);
10471       return;
10472 
10473     case DPIMM_ADDSUB_010:
10474     case DPIMM_ADDSUB_011:
10475       dexAddSubtractImmediate (cpu);
10476       return;
10477 
10478     case DPIMM_LOG_100:
10479       dexLogicalImmediate (cpu);
10480       return;
10481 
10482     case DPIMM_MOV_101:
10483       dexMoveWideImmediate (cpu);
10484       return;
10485 
10486     case DPIMM_BITF_110:
10487       dexBitfieldImmediate (cpu);
10488       return;
10489 
10490     case DPIMM_EXTR_111:
10491       dexExtractImmediate (cpu);
10492       return;
10493 
10494     default:
10495       /* Should never reach here.  */
10496       HALT_NYI;
10497     }
10498 }
10499 
10500 static void
10501 dexLoadUnscaledImmediate (sim_cpu *cpu)
10502 {
10503   /* instr[29,24] == 111_00
10504      instr[21] == 0
10505      instr[11,10] == 00
10506      instr[31,30] = size
10507      instr[26] = V
10508      instr[23,22] = opc
10509      instr[20,12] = simm9
10510      instr[9,5] = rn may be SP.  */
10511   /* unsigned rt = INSTR (4, 0);  */
10512   uint32_t V = INSTR (26, 26);
10513   uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10514   int32_t imm = simm32 (aarch64_get_instr (cpu), 20, 12);
10515 
10516   if (!V)
10517     {
10518       /* GReg operations.  */
10519       switch (dispatch)
10520 	{
10521 	case 0:	 sturb (cpu, imm); return;
10522 	case 1:	 ldurb32 (cpu, imm); return;
10523 	case 2:	 ldursb64 (cpu, imm); return;
10524 	case 3:	 ldursb32 (cpu, imm); return;
10525 	case 4:	 sturh (cpu, imm); return;
10526 	case 5:	 ldurh32 (cpu, imm); return;
10527 	case 6:	 ldursh64 (cpu, imm); return;
10528 	case 7:	 ldursh32 (cpu, imm); return;
10529 	case 8:	 stur32 (cpu, imm); return;
10530 	case 9:	 ldur32 (cpu, imm); return;
10531 	case 10: ldursw (cpu, imm); return;
10532 	case 12: stur64 (cpu, imm); return;
10533 	case 13: ldur64 (cpu, imm); return;
10534 
10535 	case 14:
10536 	  /* PRFUM NYI.  */
10537 	  HALT_NYI;
10538 
10539 	default:
10540 	case 11:
10541 	case 15:
10542 	  HALT_UNALLOC;
10543 	}
10544     }
10545 
10546   /* FReg operations.  */
10547   switch (dispatch)
10548     {
10549     case 2:  fsturq (cpu, imm); return;
10550     case 3:  fldurq (cpu, imm); return;
10551     case 8:  fsturs (cpu, imm); return;
10552     case 9:  fldurs (cpu, imm); return;
10553     case 12: fsturd (cpu, imm); return;
10554     case 13: fldurd (cpu, imm); return;
10555 
10556     case 0: /* STUR 8 bit FP.  */
10557     case 1: /* LDUR 8 bit FP.  */
10558     case 4: /* STUR 16 bit FP.  */
10559     case 5: /* LDUR 16 bit FP.  */
10560       HALT_NYI;
10561 
10562     default:
10563     case 6:
10564     case 7:
10565     case 10:
10566     case 11:
10567     case 14:
10568     case 15:
10569       HALT_UNALLOC;
10570     }
10571 }
10572 
10573 /*  N.B. A preliminary note regarding all the ldrs<x>32
10574     instructions
10575 
10576    The signed value loaded by these instructions is cast to unsigned
10577    before being assigned via aarch64_set_reg_u64 (cpu, N, ...), i.e. to
10578    the 64 bit element of the GReg union.  This performs a 32 bit sign
10579    extension (as required) but avoids 64 bit sign extension, thus
10580    ensuring that the top half of the register word is zero.  This is
10581    what the spec demands when a 32 bit load occurs.  */
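
/* For instance, loading the byte 0x80 via ldrsb32_abs should leave
   0x00000000ffffff80 in the destination: sign extended to 32 bits,
   zero above.  */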
10582 
10583 /* 32 bit load sign-extended byte scaled unsigned 12 bit.  */
10584 static void
10585 ldrsb32_abs (sim_cpu *cpu, uint32_t offset)
10586 {
10587   unsigned int rn = INSTR (9, 5);
10588   unsigned int rt = INSTR (4, 0);
10589 
10590   /* The target register may not be SP but the source may be;
10591      there is no scaling required for a byte load.  */
10592   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
10593   aarch64_set_reg_u64 (cpu, rt, NO_SP,
10594 		       (int64_t) aarch64_get_mem_s8 (cpu, address));
10595 }
10596 
10597 /* 32 bit load sign-extended byte scaled or unscaled zero-
10598    or sign-extended 32-bit register offset.  */
10599 static void
10600 ldrsb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10601 {
10602   unsigned int rm = INSTR (20, 16);
10603   unsigned int rn = INSTR (9, 5);
10604   unsigned int rt = INSTR (4, 0);
10605 
10606   /* rn may reference SP, rm and rt must reference ZR.  */
10607 
10608   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10609   int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10610 				 extension);
10611 
10612   /* There is no scaling required for a byte load.  */
10613   aarch64_set_reg_u64
10614     (cpu, rt, NO_SP, (int64_t) aarch64_get_mem_s8 (cpu, address
10615 						   + displacement));
10616 }
10617 
10618 /* 32 bit load sign-extended byte unscaled signed 9 bit with
10619    pre- or post-writeback.  */
10620 static void
10621 ldrsb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10622 {
10623   uint64_t address;
10624   unsigned int rn = INSTR (9, 5);
10625   unsigned int rt = INSTR (4, 0);
10626 
10627   if (rn == rt && wb != NoWriteBack)
10628     HALT_UNALLOC;
10629 
10630   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10631 
10632   if (wb == Pre)
10633     address += offset;
10634 
10635   aarch64_set_reg_u64 (cpu, rt, NO_SP,
10636 		       (int64_t) aarch64_get_mem_s8 (cpu, address));
10637 
10638   if (wb == Post)
10639     address += offset;
10640 
10641   if (wb != NoWriteBack)
10642     aarch64_set_reg_u64 (cpu, rn, NO_SP, address);
10643 }
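
/* N.B. the pattern above recurs in all the writeback helpers below:
   Pre adds the offset before the access, Post adds it afterwards, and
   the updated address is written back to the base register only when
   wb != NoWriteBack.  */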
10644 
10645 /* 8 bit store scaled.  */
10646 static void
10647 fstrb_abs (sim_cpu *cpu, uint32_t offset)
10648 {
10649   unsigned st = INSTR (4, 0);
10650   unsigned rn = INSTR (9, 5);
10651 
10652   aarch64_set_mem_u8 (cpu,
10653 		      aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
10654 		      aarch64_get_vec_u8 (cpu, st, 0));
10655 }
10656 
10657 /* 8 bit store scaled or unscaled zero- or
10658    sign-extended 8-bit register offset.  */
10659 static void
10660 fstrb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10661 {
10662   unsigned rm = INSTR (20, 16);
10663   unsigned rn = INSTR (9, 5);
10664   unsigned st = INSTR (4, 0);
10665 
10666   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10667   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10668 			       extension);
        /* A byte access is never scaled, but the extended register
           offset still applies whether or not scaling was requested --
           cf. ldrsb32_scale_ext above.  */
10669   uint64_t  displacement = extended;
10670 
10671   aarch64_set_mem_u8
10672     (cpu, address + displacement, aarch64_get_vec_u8 (cpu, st, 0));
10673 }
10674 
10675 /* 16 bit store scaled.  */
10676 static void
10677 fstrh_abs (sim_cpu *cpu, uint32_t offset)
10678 {
10679   unsigned st = INSTR (4, 0);
10680   unsigned rn = INSTR (9, 5);
10681 
10682   aarch64_set_mem_u16
10683     (cpu,
10684      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16),
10685      aarch64_get_vec_u16 (cpu, st, 0));
10686 }
10687 
10688 /* 16 bit store scaled or unscaled zero-
10689    or sign-extended 16-bit register offset.  */
10690 static void
10691 fstrh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10692 {
10693   unsigned rm = INSTR (20, 16);
10694   unsigned rn = INSTR (9, 5);
10695   unsigned st = INSTR (4, 0);
10696 
10697   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10698   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10699 			       extension);
10700   uint64_t  displacement = OPT_SCALE (extended, 16, scaling);
10701 
10702   aarch64_set_mem_u16
10703     (cpu, address + displacement, aarch64_get_vec_u16 (cpu, st, 0));
10704 }
10705 
10706 /* 32 bit store scaled unsigned 12 bit.  */
10707 static void
10708 fstrs_abs (sim_cpu *cpu, uint32_t offset)
10709 {
10710   unsigned st = INSTR (4, 0);
10711   unsigned rn = INSTR (9, 5);
10712 
10713   aarch64_set_mem_u32
10714     (cpu,
10715      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32),
10716      aarch64_get_vec_u32 (cpu, st, 0));
10717 }
10718 
10719 /* 32 bit store unscaled signed 9 bit with pre- or post-writeback.  */
10720 static void
10721 fstrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10722 {
10723   unsigned rn = INSTR (9, 5);
10724   unsigned st = INSTR (4, 0);
10725 
10726   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10727 
10728   if (wb != Post)
10729     address += offset;
10730 
10731   aarch64_set_mem_u32 (cpu, address, aarch64_get_vec_u32 (cpu, st, 0));
10732 
10733   if (wb == Post)
10734     address += offset;
10735 
10736   if (wb != NoWriteBack)
10737     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10738 }
10739 
10740 /* 32 bit store scaled or unscaled zero-
10741    or sign-extended 32-bit register offset.  */
10742 static void
10743 fstrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10744 {
10745   unsigned rm = INSTR (20, 16);
10746   unsigned rn = INSTR (9, 5);
10747   unsigned st = INSTR (4, 0);
10748 
10749   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10750   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10751 			       extension);
10752   uint64_t  displacement = OPT_SCALE (extended, 32, scaling);
10753 
10754   aarch64_set_mem_u32
10755     (cpu, address + displacement, aarch64_get_vec_u32 (cpu, st, 0));
10756 }
10757 
10758 /* 64 bit store scaled unsigned 12 bit.  */
10759 static void
10760 fstrd_abs (sim_cpu *cpu, uint32_t offset)
10761 {
10762   unsigned st = INSTR (4, 0);
10763   unsigned rn = INSTR (9, 5);
10764 
10765   aarch64_set_mem_u64
10766     (cpu,
10767      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64),
10768      aarch64_get_vec_u64 (cpu, st, 0));
10769 }
10770 
10771 /* 64 bit store unscaled signed 9 bit with pre- or post-writeback.  */
10772 static void
10773 fstrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10774 {
10775   unsigned rn = INSTR (9, 5);
10776   unsigned st = INSTR (4, 0);
10777 
10778   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10779 
10780   if (wb != Post)
10781     address += offset;
10782 
10783   aarch64_set_mem_u64 (cpu, address, aarch64_get_vec_u64 (cpu, st, 0));
10784 
10785   if (wb == Post)
10786     address += offset;
10787 
10788   if (wb != NoWriteBack)
10789     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10790 }
10791 
10792 /* 64 bit store scaled or unscaled zero-
10793    or sign-extended 32-bit register offset.  */
10794 static void
10795 fstrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10796 {
10797   unsigned rm = INSTR (20, 16);
10798   unsigned rn = INSTR (9, 5);
10799   unsigned st = INSTR (4, 0);
10800 
10801   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10802   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10803 			       extension);
10804   uint64_t  displacement = OPT_SCALE (extended, 64, scaling);
10805 
10806   aarch64_set_mem_u64
10807     (cpu, address + displacement, aarch64_get_vec_u64 (cpu, st, 0));
10808 }
10809 
10810 /* 128 bit store scaled unsigned 12 bit.  */
10811 static void
10812 fstrq_abs (sim_cpu *cpu, uint32_t offset)
10813 {
10814   FRegister a;
10815   unsigned st = INSTR (4, 0);
10816   unsigned rn = INSTR (9, 5);
10817   uint64_t addr;
10818 
10819   aarch64_get_FP_long_double (cpu, st, & a);
10820 
10821   addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128);
10822   aarch64_set_mem_long_double (cpu, addr, a);
10823 }
10824 
10825 /* 128 bit store unscaled signed 9 bit with pre- or post-writeback.  */
10826 static void
10827 fstrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10828 {
10829   FRegister a;
10830   unsigned rn = INSTR (9, 5);
10831   unsigned st = INSTR (4, 0);
10832   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10833 
10834   if (wb != Post)
10835     address += offset;
10836 
10837   aarch64_get_FP_long_double (cpu, st, & a);
10838   aarch64_set_mem_long_double (cpu, address, a);
10839 
10840   if (wb == Post)
10841     address += offset;
10842 
10843   if (wb != NoWriteBack)
10844     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10845 }
10846 
10847 /* 128 bit store scaled or unscaled zero-
10848    or sign-extended 32-bit register offset.  */
10849 static void
10850 fstrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10851 {
10852   unsigned rm = INSTR (20, 16);
10853   unsigned rn = INSTR (9, 5);
10854   unsigned st = INSTR (4, 0);
10855 
10856   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10857   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10858 			       extension);
10859   uint64_t  displacement = OPT_SCALE (extended, 128, scaling);
10860 
10861   FRegister a;
10862 
10863   aarch64_get_FP_long_double (cpu, st, & a);
10864   aarch64_set_mem_long_double (cpu, address + displacement, a);
10865 }
10866 
10867 static void
10868 dexLoadImmediatePrePost (sim_cpu *cpu)
10869 {
10870   /* instr[31,30] = size
10871      instr[29,27] = 111
10872      instr[26]    = V
10873      instr[25,24] = 00
10874      instr[23,22] = opc
10875      instr[21]    = 0
10876      instr[20,12] = simm9
10877      instr[11]    = wb : 0 ==> Post, 1 ==> Pre
10878      instr[10]    = 0
10879      instr[9,5]   = Rn may be SP.
10880      instr[4,0]   = Rt */
10881 
10882   uint32_t  V        = INSTR (26, 26);
10883   uint32_t  dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10884   int32_t   imm      = simm32 (aarch64_get_instr (cpu), 20, 12);
10885   WriteBack wb       = INSTR (11, 11);
10886 
10887   if (!V)
10888     {
10889       /* GReg operations.  */
10890       switch (dispatch)
10891 	{
10892 	case 0:	 strb_wb (cpu, imm, wb); return;
10893 	case 1:	 ldrb32_wb (cpu, imm, wb); return;
10894 	case 2:	 ldrsb_wb (cpu, imm, wb); return;
10895 	case 3:	 ldrsb32_wb (cpu, imm, wb); return;
10896 	case 4:	 strh_wb (cpu, imm, wb); return;
10897 	case 5:	 ldrh32_wb (cpu, imm, wb); return;
10898 	case 6:	 ldrsh64_wb (cpu, imm, wb); return;
10899 	case 7:	 ldrsh32_wb (cpu, imm, wb); return;
10900 	case 8:	 str32_wb (cpu, imm, wb); return;
10901 	case 9:	 ldr32_wb (cpu, imm, wb); return;
10902 	case 10: ldrsw_wb (cpu, imm, wb); return;
10903 	case 12: str_wb (cpu, imm, wb); return;
10904 	case 13: ldr_wb (cpu, imm, wb); return;
10905 
10906 	default:
10907 	case 11:
10908 	case 14:
10909 	case 15:
10910 	  HALT_UNALLOC;
10911 	}
10912     }
10913 
10914   /* FReg operations.  */
10915   switch (dispatch)
10916     {
10917     case 2:  fstrq_wb (cpu, imm, wb); return;
10918     case 3:  fldrq_wb (cpu, imm, wb); return;
10919     case 8:  fstrs_wb (cpu, imm, wb); return;
10920     case 9:  fldrs_wb (cpu, imm, wb); return;
10921     case 12: fstrd_wb (cpu, imm, wb); return;
10922     case 13: fldrd_wb (cpu, imm, wb); return;
10923 
10924     case 0:	  /* STUR 8 bit FP.  */
10925     case 1:	  /* LDUR 8 bit FP.  */
10926     case 4:	  /* STUR 16 bit FP.  */
10927     case 5:	  /* LDUR 16 bit FP.  */
10928       HALT_NYI;
10929 
10930     default:
10931     case 6:
10932     case 7:
10933     case 10:
10934     case 11:
10935     case 14:
10936     case 15:
10937       HALT_UNALLOC;
10938     }
10939 }
10940 
10941 static void
10942 dexLoadRegisterOffset (sim_cpu *cpu)
10943 {
10944   /* instr[31,30] = size
10945      instr[29,27] = 111
10946      instr[26]    = V
10947      instr[25,24] = 00
10948      instr[23,22] = opc
10949      instr[21]    = 1
10950      instr[20,16] = rm
10951      instr[15,13] = option : 010 ==> UXTW, 011 ==> UXTX/LSL,
10952                              110 ==> SXTW, 111 ==> SXTX,
10953                              ow ==> RESERVED
10954      instr[12]    = scaled
10955      instr[11,10] = 10
10956      instr[9,5]   = rn
10957      instr[4,0]   = rt.  */
10958 
10959   uint32_t  V = INSTR (26, 26);
10960   uint32_t  dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10961   Scaling   scale = INSTR (12, 12);
10962   Extension extensionType = INSTR (15, 13);
10963 
10964   /* Check for illegal extension types.  */
10965   if (uimm (extensionType, 1, 1) == 0)
10966     HALT_UNALLOC;
10967 
10968   if (extensionType == UXTX || extensionType == SXTX)
10969     extensionType = NoExtension;
10970 
10971   if (!V)
10972     {
10973       /* GReg operations.  */
10974       switch (dispatch)
10975 	{
10976 	case 0:	 strb_scale_ext (cpu, scale, extensionType); return;
10977 	case 1:	 ldrb32_scale_ext (cpu, scale, extensionType); return;
10978 	case 2:	 ldrsb_scale_ext (cpu, scale, extensionType); return;
10979 	case 3:	 ldrsb32_scale_ext (cpu, scale, extensionType); return;
10980 	case 4:	 strh_scale_ext (cpu, scale, extensionType); return;
10981 	case 5:	 ldrh32_scale_ext (cpu, scale, extensionType); return;
10982 	case 6:	 ldrsh_scale_ext (cpu, scale, extensionType); return;
10983 	case 7:	 ldrsh32_scale_ext (cpu, scale, extensionType); return;
10984 	case 8:	 str32_scale_ext (cpu, scale, extensionType); return;
10985 	case 9:	 ldr32_scale_ext (cpu, scale, extensionType); return;
10986 	case 10: ldrsw_scale_ext (cpu, scale, extensionType); return;
10987 	case 12: str_scale_ext (cpu, scale, extensionType); return;
10988 	case 13: ldr_scale_ext (cpu, scale, extensionType); return;
10989 	case 14: prfm_scale_ext (cpu, scale, extensionType); return;
10990 
10991 	default:
10992 	case 11:
10993 	case 15:
10994 	  HALT_UNALLOC;
10995 	}
10996     }
10997 
10998   /* FReg operations.  */
10999   switch (dispatch)
11000     {
11001     case 1: /* LDUR 8 bit FP.  */
11002       HALT_NYI;
11003     case 3:  fldrq_scale_ext (cpu, scale, extensionType); return;
11004     case 5: /* LDUR 16 bit FP.  */
11005       HALT_NYI;
11006     case 9:  fldrs_scale_ext (cpu, scale, extensionType); return;
11007     case 13: fldrd_scale_ext (cpu, scale, extensionType); return;
11008 
11009     case 0:  fstrb_scale_ext (cpu, scale, extensionType); return;
11010     case 2:  fstrq_scale_ext (cpu, scale, extensionType); return;
11011     case 4:  fstrh_scale_ext (cpu, scale, extensionType); return;
11012     case 8:  fstrs_scale_ext (cpu, scale, extensionType); return;
11013     case 12: fstrd_scale_ext (cpu, scale, extensionType); return;
11014 
11015     default:
11016     case 6:
11017     case 7:
11018     case 10:
11019     case 11:
11020     case 14:
11021     case 15:
11022       HALT_UNALLOC;
11023     }
11024 }
11025 
11026 static void
11027 dexLoadUnsignedImmediate (sim_cpu *cpu)
11028 {
11029   /* instr[29,24] == 111_01
11030      instr[31,30] = size
11031      instr[26]    = V
11032      instr[23,22] = opc
11033      instr[21,10] = uimm12 : unsigned immediate offset
11034      instr[9,5]   = rn may be SP.
11035      instr[4,0]   = rt.  */
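
  /* N.B. uimm12 is an element-sized offset: the *_abs helpers scale it
     by the access size (SCALE); byte accesses use it unscaled.  */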
11036 
11037   uint32_t V = INSTR (26,26);
11038   uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
11039   uint32_t imm = INSTR (21, 10);
11040 
11041   if (!V)
11042     {
11043       /* GReg operations.  */
11044       switch (dispatch)
11045 	{
11046 	case 0:  strb_abs (cpu, imm); return;
11047 	case 1:  ldrb32_abs (cpu, imm); return;
11048 	case 2:  ldrsb_abs (cpu, imm); return;
11049 	case 3:  ldrsb32_abs (cpu, imm); return;
11050 	case 4:  strh_abs (cpu, imm); return;
11051 	case 5:  ldrh32_abs (cpu, imm); return;
11052 	case 6:  ldrsh_abs (cpu, imm); return;
11053 	case 7:  ldrsh32_abs (cpu, imm); return;
11054 	case 8:  str32_abs (cpu, imm); return;
11055 	case 9:  ldr32_abs (cpu, imm); return;
11056 	case 10: ldrsw_abs (cpu, imm); return;
11057 	case 12: str_abs (cpu, imm); return;
11058 	case 13: ldr_abs (cpu, imm); return;
11059 	case 14: prfm_abs (cpu, imm); return;
11060 
11061 	default:
11062 	case 11:
11063 	case 15:
11064 	  HALT_UNALLOC;
11065 	}
11066     }
11067 
11068   /* FReg operations.  */
11069   switch (dispatch)
11070     {
11071     case 0:  fstrb_abs (cpu, imm); return;
11072     case 4:  fstrh_abs (cpu, imm); return;
11073     case 8:  fstrs_abs (cpu, imm); return;
11074     case 12: fstrd_abs (cpu, imm); return;
11075     case 2:  fstrq_abs (cpu, imm); return;
11076 
11077     case 1:  fldrb_abs (cpu, imm); return;
11078     case 5:  fldrh_abs (cpu, imm); return;
11079     case 9:  fldrs_abs (cpu, imm); return;
11080     case 13: fldrd_abs (cpu, imm); return;
11081     case 3:  fldrq_abs (cpu, imm); return;
11082 
11083     default:
11084     case 6:
11085     case 7:
11086     case 10:
11087     case 11:
11088     case 14:
11089     case 15:
11090       HALT_UNALLOC;
11091     }
11092 }
11093 
11094 static void
11095 dexLoadExclusive (sim_cpu *cpu)
11096 {
11097   /* assert instr[29:24] = 001000;
11098      instr[31,30] = size
11099      instr[23] = 0 if exclusive
11100      instr[22] = L : 1 if load, 0 if store
11101      instr[21] = 1 if pair
11102      instr[20,16] = Rs
11103      instr[15] = o0 : 1 if ordered
11104      instr[14,10] = Rt2
11105      instr[9,5] = Rn
11106      instr[4,0] = Rt.  */
11107 
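  /* Only the non-pair forms are handled here: instr[21] set (the pair
     variants) falls through to HALT_NYI.  */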
11108   switch (INSTR (22, 21))
11109     {
11110     case 2:   ldxr (cpu); return;
11111     case 0:   stxr (cpu); return;
11112     default:  HALT_NYI;
11113     }
11114 }
11115 
11116 static void
11117 dexLoadOther (sim_cpu *cpu)
11118 {
11119   uint32_t dispatch;
11120 
11121   /* instr[29,25] = 111_0
11122      instr[24] == 0 ==> dispatch, 1 ==> ldst reg unsigned immediate
11123      instr[21] and instr[11,10] form the secondary dispatch.  */
11124   if (INSTR (24, 24))
11125     {
11126       dexLoadUnsignedImmediate (cpu);
11127       return;
11128     }
11129 
11130   dispatch = ((INSTR (21, 21) << 2) | INSTR (11, 10));
11131   switch (dispatch)
11132     {
11133     case 0: dexLoadUnscaledImmediate (cpu); return;
11134     case 1: dexLoadImmediatePrePost (cpu); return;
11135     case 3: dexLoadImmediatePrePost (cpu); return;
11136     case 6: dexLoadRegisterOffset (cpu); return;
11137 
11138     default:
11139     case 2:
11140     case 4:
11141     case 5:
11142     case 7:
11143       HALT_NYI;
11144     }
11145 }
11146 
11147 static void
11148 store_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
11149 {
11150   unsigned rn = INSTR (14, 10);
11151   unsigned rd = INSTR (9, 5);
11152   unsigned rm = INSTR (4, 0);
11153   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11154 
11155   if ((rn == rd || rm == rd) && wb != NoWriteBack)
11156     HALT_UNALLOC; /* Writeback onto a transfer register is UNPREDICTABLE.  */
11157 
11158   offset <<= 2;
11159 
11160   if (wb != Post)
11161     address += offset;
11162 
11163   aarch64_set_mem_u32 (cpu, address,
11164 		       aarch64_get_reg_u32 (cpu, rm, NO_SP));
11165   aarch64_set_mem_u32 (cpu, address + 4,
11166 		       aarch64_get_reg_u32 (cpu, rn, NO_SP));
11167 
11168   if (wb == Post)
11169     address += offset;
11170 
11171   if (wb != NoWriteBack)
11172     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11173 }
11174 
11175 static void
11176 store_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
11177 {
11178   unsigned rn = INSTR (14, 10);
11179   unsigned rd = INSTR (9, 5);
11180   unsigned rm = INSTR (4, 0);
11181   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11182 
11183   if ((rn == rd || rm == rd) && wb != NoWriteBack)
11184     HALT_UNALLOC; /* Writeback onto a transfer register is UNPREDICTABLE.  */
11185 
11186   offset <<= 3;
11187 
11188   if (wb != Post)
11189     address += offset;
11190 
11191   aarch64_set_mem_u64 (cpu, address,
11192 		       aarch64_get_reg_u64 (cpu, rm, NO_SP));
11193   aarch64_set_mem_u64 (cpu, address + 8,
11194 		       aarch64_get_reg_u64 (cpu, rn, NO_SP));
11195 
11196   if (wb == Post)
11197     address += offset;
11198 
11199   if (wb != NoWriteBack)
11200     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11201 }
11202 
11203 static void
11204 load_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
11205 {
11206   unsigned rn = INSTR (14, 10);
11207   unsigned rd = INSTR (9, 5);
11208   unsigned rm = INSTR (4, 0);
11209   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11210 
11211   /* Rt == Rt2 is unpredictable for a load pair; treat it as unalloc to make sure we don't do it.  */
11212   if (rn == rm)
11213     HALT_UNALLOC;
11214 
11215   offset <<= 2;
11216 
11217   if (wb != Post)
11218     address += offset;
11219 
11220   aarch64_set_reg_u64 (cpu, rm, SP_OK, aarch64_get_mem_u32 (cpu, address));
11221   aarch64_set_reg_u64 (cpu, rn, SP_OK, aarch64_get_mem_u32 (cpu, address + 4));
11222 
11223   if (wb == Post)
11224     address += offset;
11225 
11226   if (wb != NoWriteBack)
11227     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11228 }
11229 
11230 static void
11231 load_pair_s32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
11232 {
11233   unsigned rn = INSTR (14, 10);
11234   unsigned rd = INSTR (9, 5);
11235   unsigned rm = INSTR (4, 0);
11236   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11237 
11238   /* Rt == Rt2 is unpredictable for a load pair; treat it as unalloc to make sure we don't do it.  */
11239   if (rn == rm)
11240     HALT_UNALLOC;
11241 
11242   offset <<= 2;
11243 
11244   if (wb != Post)
11245     address += offset;
11246 
11247   aarch64_set_reg_s64 (cpu, rm, SP_OK, aarch64_get_mem_s32 (cpu, address));
11248   aarch64_set_reg_s64 (cpu, rn, SP_OK, aarch64_get_mem_s32 (cpu, address + 4));
11249 
11250   if (wb == Post)
11251     address += offset;
11252 
11253   if (wb != NoWriteBack)
11254     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11255 }
11256 
11257 static void
11258 load_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
11259 {
11260   unsigned rn = INSTR (14, 10);
11261   unsigned rd = INSTR (9, 5);
11262   unsigned rm = INSTR (4, 0);
11263   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11264 
11265   /* Rt == Rt2 is unpredictable for a load pair; treat it as unalloc to make sure we don't do it.  */
11266   if (rn == rm)
11267     HALT_UNALLOC;
11268 
11269   offset <<= 3;
11270 
11271   if (wb != Post)
11272     address += offset;
11273 
11274   aarch64_set_reg_u64 (cpu, rm, SP_OK, aarch64_get_mem_u64 (cpu, address));
11275   aarch64_set_reg_u64 (cpu, rn, SP_OK, aarch64_get_mem_u64 (cpu, address + 8));
11276 
11277   if (wb == Post)
11278     address += offset;
11279 
11280   if (wb != NoWriteBack)
11281     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11282 }
11283 
11284 static void
11285 dex_load_store_pair_gr (sim_cpu *cpu)
11286 {
11287   /* instr[31,30] = size (10=> 64-bit, 01=> signed 32-bit, 00=> 32-bit)
11288      instr[29,25] = instruction encoding: 101_0 (the _ is the V bit)
11289      instr[26]    = V : 1 if fp, 0 if gp
11290      instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
11291      instr[22]    = load/store (1=> load)
11292      instr[21,15] = signed, scaled, offset
11293      instr[14,10] = Rn
11294      instr[ 9, 5] = Rd
11295      instr[ 4, 0] = Rm.  */
11296 
11297   uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22));
11298   int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
11299 
11300   switch (dispatch)
11301     {
11302     case 2: store_pair_u32 (cpu, offset, Post); return;
11303     case 3: load_pair_u32  (cpu, offset, Post); return;
11304     case 4: store_pair_u32 (cpu, offset, NoWriteBack); return;
11305     case 5: load_pair_u32  (cpu, offset, NoWriteBack); return;
11306     case 6: store_pair_u32 (cpu, offset, Pre); return;
11307     case 7: load_pair_u32  (cpu, offset, Pre); return;
11308 
11309     case 11: load_pair_s32  (cpu, offset, Post); return;
11310     case 13: load_pair_s32  (cpu, offset, NoWriteBack); return;
11311     case 15: load_pair_s32  (cpu, offset, Pre); return;
11312 
11313     case 18: store_pair_u64 (cpu, offset, Post); return;
11314     case 19: load_pair_u64  (cpu, offset, Post); return;
11315     case 20: store_pair_u64 (cpu, offset, NoWriteBack); return;
11316     case 21: load_pair_u64  (cpu, offset, NoWriteBack); return;
11317     case 22: store_pair_u64 (cpu, offset, Pre); return;
11318     case 23: load_pair_u64  (cpu, offset, Pre); return;
11319 
11320     default:
11321       HALT_UNALLOC;
11322     }
11323 }
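
/* Dispatch example (editorial note): LDP X1, X2, [SP], #16 has
   instr[31,30] = 10 and instr[24,22] = 011 (post-index, load), so
   dispatch = (2 << 3) | 3 = 19 and load_pair_u64 runs with Post
   writeback; its imm7 field holds 16 / 8 = 2, which the callee scales
   back up with offset <<= 3.  */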
11324 
11325 static void
11326 store_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
11327 {
11328   unsigned rn = INSTR (14, 10);
11329   unsigned rd = INSTR (9, 5);
11330   unsigned rm = INSTR (4, 0);
11331   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11332 
11333   offset <<= 2;
11334 
11335   if (wb != Post)
11336     address += offset;
11337 
11338   aarch64_set_mem_u32 (cpu, address,     aarch64_get_vec_u32 (cpu, rm, 0));
11339   aarch64_set_mem_u32 (cpu, address + 4, aarch64_get_vec_u32 (cpu, rn, 0));
11340 
11341   if (wb == Post)
11342     address += offset;
11343 
11344   if (wb != NoWriteBack)
11345     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11346 }
11347 
11348 static void
11349 store_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11350 {
11351   unsigned rn = INSTR (14, 10);
11352   unsigned rd = INSTR (9, 5);
11353   unsigned rm = INSTR (4, 0);
11354   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11355 
11356   offset <<= 3;
11357 
11358   if (wb != Post)
11359     address += offset;
11360 
11361   aarch64_set_mem_u64 (cpu, address,     aarch64_get_vec_u64 (cpu, rm, 0));
11362   aarch64_set_mem_u64 (cpu, address + 8, aarch64_get_vec_u64 (cpu, rn, 0));
11363 
11364   if (wb == Post)
11365     address += offset;
11366 
11367   if (wb != NoWriteBack)
11368     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11369 }
11370 
11371 static void
11372 store_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11373 {
11374   FRegister a;
11375   unsigned rn = INSTR (14, 10);
11376   unsigned rd = INSTR (9, 5);
11377   unsigned rm = INSTR (4, 0);
11378   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11379 
11380   offset <<= 4;
11381 
11382   if (wb != Post)
11383     address += offset;
11384 
11385   aarch64_get_FP_long_double (cpu, rm, & a);
11386   aarch64_set_mem_long_double (cpu, address, a);
11387   aarch64_get_FP_long_double (cpu, rn, & a);
11388   aarch64_set_mem_long_double (cpu, address + 16, a);
11389 
11390   if (wb == Post)
11391     address += offset;
11392 
11393   if (wb != NoWriteBack)
11394     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11395 }
11396 
11397 static void
11398 load_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
11399 {
11400   unsigned rn = INSTR (14, 10);
11401   unsigned rd = INSTR (9, 5);
11402   unsigned rm = INSTR (4, 0);
11403   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11404 
11405   if (rm == rn)
11406     HALT_UNALLOC;
11407 
11408   offset <<= 2;
11409 
11410   if (wb != Post)
11411     address += offset;
11412 
11413   aarch64_set_vec_u32 (cpu, rm, 0, aarch64_get_mem_u32 (cpu, address));
11414   aarch64_set_vec_u32 (cpu, rn, 0, aarch64_get_mem_u32 (cpu, address + 4));
11415 
11416   if (wb == Post)
11417     address += offset;
11418 
11419   if (wb != NoWriteBack)
11420     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11421 }
11422 
11423 static void
11424 load_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11425 {
11426   unsigned rn = INSTR (14, 10);
11427   unsigned rd = INSTR (9, 5);
11428   unsigned rm = INSTR (4, 0);
11429   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11430 
11431   if (rm == rn)
11432     HALT_UNALLOC;
11433 
11434   offset <<= 3;
11435 
11436   if (wb != Post)
11437     address += offset;
11438 
11439   aarch64_set_vec_u64 (cpu, rm, 0, aarch64_get_mem_u64 (cpu, address));
11440   aarch64_set_vec_u64 (cpu, rn, 0, aarch64_get_mem_u64 (cpu, address + 8));
11441 
11442   if (wb == Post)
11443     address += offset;
11444 
11445   if (wb != NoWriteBack)
11446     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11447 }
11448 
11449 static void
11450 load_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11451 {
11452   FRegister a;
11453   unsigned rn = INSTR (14, 10);
11454   unsigned rd = INSTR (9, 5);
11455   unsigned rm = INSTR (4, 0);
11456   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11457 
11458   if (rm == rn)
11459     HALT_UNALLOC;
11460 
11461   offset <<= 4;
11462 
11463   if (wb != Post)
11464     address += offset;
11465 
11466   aarch64_get_mem_long_double (cpu, address, & a);
11467   aarch64_set_FP_long_double (cpu, rm, a);
11468   aarch64_get_mem_long_double (cpu, address + 16, & a);
11469   aarch64_set_FP_long_double (cpu, rn, a);
11470 
11471   if (wb == Post)
11472     address += offset;
11473 
11474   if (wb != NoWriteBack)
11475     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11476 }
11477 
11478 static void
11479 dex_load_store_pair_fp (sim_cpu *cpu)
11480 {
11481   /* instr[31,30] = size (10=> 128-bit, 01=> 64-bit, 00=> 32-bit)
11482      instr[29,25] = instruction encoding
11483      instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
11484      instr[22]    = load/store (1=> load)
11485      instr[21,15] = signed, scaled, offset
11486      instr[14,10] = Rn
11487      instr[ 9, 5] = Rd
11488      instr[ 4, 0] = Rm  */
11489 
11490   uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22));
11491   int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
11492 
11493   switch (dispatch)
11494     {
11495     case 2: store_pair_float (cpu, offset, Post); return;
11496     case 3: load_pair_float  (cpu, offset, Post); return;
11497     case 4: store_pair_float (cpu, offset, NoWriteBack); return;
11498     case 5: load_pair_float  (cpu, offset, NoWriteBack); return;
11499     case 6: store_pair_float (cpu, offset, Pre); return;
11500     case 7: load_pair_float  (cpu, offset, Pre); return;
11501 
11502     case 10: store_pair_double (cpu, offset, Post); return;
11503     case 11: load_pair_double  (cpu, offset, Post); return;
11504     case 12: store_pair_double (cpu, offset, NoWriteBack); return;
11505     case 13: load_pair_double  (cpu, offset, NoWriteBack); return;
11506     case 14: store_pair_double (cpu, offset, Pre); return;
11507     case 15: load_pair_double  (cpu, offset, Pre); return;
11508 
11509     case 18: store_pair_long_double (cpu, offset, Post); return;
11510     case 19: load_pair_long_double  (cpu, offset, Post); return;
11511     case 20: store_pair_long_double (cpu, offset, NoWriteBack); return;
11512     case 21: load_pair_long_double  (cpu, offset, NoWriteBack); return;
11513     case 22: store_pair_long_double (cpu, offset, Pre); return;
11514     case 23: load_pair_long_double  (cpu, offset, Pre); return;
11515 
11516     default:
11517       HALT_UNALLOC;
11518     }
11519 }
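
/* Editorial example: LDP Q0, Q1, [X0, #32] has instr[31,30] = 10 and
   instr[24,22] = 101 (offset mode, load), giving dispatch = 21 and a
   call to load_pair_long_double with NoWriteBack; its imm7 field holds
   32 / 16 = 2, scaled back with offset <<= 4 because each Q register
   covers 16 bytes.  */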
11520 
11521 static inline unsigned
11522 vec_reg (unsigned v, unsigned o)
11523 {
11524   return (v + o) & 0x1F; /* Only 32 vector registers - wrap modulo 32.  */
11525 }
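
/* For instance (editorial note): LD4 starting at V30 uses registers
   V30, V31, V0, V1, since vec_reg (30, 2) == 0 and vec_reg (30, 3) == 1,
   matching the architected wrap-around of a vector register list.  */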
11526 
11527 /* Load multiple N-element structures to N consecutive registers.  */
11528 static void
11529 vec_load (sim_cpu *cpu, uint64_t address, unsigned N)
11530 {
11531   int      all  = INSTR (30, 30);
11532   unsigned size = INSTR (11, 10);
11533   unsigned vd   = INSTR (4, 0);
11534   unsigned i;
11535 
11536   switch (size)
11537     {
11538     case 0: /* 8-bit operations.  */
11539       if (all)
11540 	for (i = 0; i < (16 * N); i++)
11541 	  aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15,
11542 			      aarch64_get_mem_u8 (cpu, address + i));
11543       else
11544 	for (i = 0; i < (8 * N); i++)
11545 	  aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7,
11546 			      aarch64_get_mem_u8 (cpu, address + i));
11547       return;
11548 
11549     case 1: /* 16-bit operations.  */
11550       if (all)
11551 	for (i = 0; i < (8 * N); i++)
11552 	  aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7,
11553 			       aarch64_get_mem_u16 (cpu, address + i * 2));
11554       else
11555 	for (i = 0; i < (4 * N); i++)
11556 	  aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3,
11557 			       aarch64_get_mem_u16 (cpu, address + i * 2));
11558       return;
11559 
11560     case 2: /* 32-bit operations.  */
11561       if (all)
11562 	for (i = 0; i < (4 * N); i++)
11563 	  aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3,
11564 			       aarch64_get_mem_u32 (cpu, address + i * 4));
11565       else
11566 	for (i = 0; i < (2 * N); i++)
11567 	  aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1,
11568 			       aarch64_get_mem_u32 (cpu, address + i * 4));
11569       return;
11570 
11571     case 3: /* 64-bit operations.  */
11572       if (all)
11573 	for (i = 0; i < (2 * N); i++)
11574 	  aarch64_set_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1,
11575 			       aarch64_get_mem_u64 (cpu, address + i * 8));
11576       else
11577 	for (i = 0; i < N; i++)
11578 	  aarch64_set_vec_u64 (cpu, vec_reg (vd, i), 0,
11579 			       aarch64_get_mem_u64 (cpu, address + i * 8));
11580       return;
11581     }
11582 }
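
/* Illustration (not in the original source): for N = 2 with all = 1
   and size = 0 the loop above copies 32 consecutive bytes so that
   mem[0..15] fill vd.b[0..15] and mem[16..31] fill (vd+1).b[0..15],
   a purely linear layout.  A real LD2 would instead send even-numbered
   bytes to vd and odd-numbered bytes to vd+1; see the FIXME comments
   on LD1_2 and friends below.  */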
11583 
11584 /* LD4: load multiple 4-element structures to four consecutive registers.  */
11585 static void
11586 LD4 (sim_cpu *cpu, uint64_t address)
11587 {
11588   vec_load (cpu, address, 4);
11589 }
11590 
11591 /* LD3: load multiple 3-element structures to three consecutive registers.  */
11592 static void
11593 LD3 (sim_cpu *cpu, uint64_t address)
11594 {
11595   vec_load (cpu, address, 3);
11596 }
11597 
11598 /* LD2: load multiple 2-element structures to two consecutive registers.  */
11599 static void
11600 LD2 (sim_cpu *cpu, uint64_t address)
11601 {
11602   vec_load (cpu, address, 2);
11603 }
11604 
11605 /* Load multiple 1-element structures into one register.  */
11606 static void
11607 LD1_1 (sim_cpu *cpu, uint64_t address)
11608 {
11609   int      all  = INSTR (30, 30);
11610   unsigned size = INSTR (11, 10);
11611   unsigned vd   = INSTR (4, 0);
11612   unsigned i;
11613 
11614   switch (size)
11615     {
11616     case 0:
11617       /* LD1 {Vd.16b}, addr, #16 */
11618       /* LD1 {Vd.8b}, addr, #8 */
11619       for (i = 0; i < (all ? 16 : 8); i++)
11620 	aarch64_set_vec_u8 (cpu, vd, i,
11621 			    aarch64_get_mem_u8 (cpu, address + i));
11622       return;
11623 
11624     case 1:
11625       /* LD1 {Vd.8h}, addr, #16 */
11626       /* LD1 {Vd.4h}, addr, #8 */
11627       for (i = 0; i < (all ? 8 : 4); i++)
11628 	aarch64_set_vec_u16 (cpu, vd, i,
11629 			     aarch64_get_mem_u16 (cpu, address + i * 2));
11630       return;
11631 
11632     case 2:
11633       /* LD1 {Vd.4s}, addr, #16 */
11634       /* LD1 {Vd.2s}, addr, #8 */
11635       for (i = 0; i < (all ? 4 : 2); i++)
11636 	aarch64_set_vec_u32 (cpu, vd, i,
11637 			     aarch64_get_mem_u32 (cpu, address + i * 4));
11638       return;
11639 
11640     case 3:
11641       /* LD1 {Vd.2d}, addr, #16 */
11642       /* LD1 {Vd.1d}, addr, #8 */
11643       for (i = 0; i < (all ? 2 : 1); i++)
11644 	aarch64_set_vec_u64 (cpu, vd, i,
11645 			     aarch64_get_mem_u64 (cpu, address + i * 8));
11646       return;
11647     }
11648 }
11649 
11650 /* Load multiple 1-element structures into two registers.  */
11651 static void
11652 LD1_2 (sim_cpu *cpu, uint64_t address)
11653 {
11654   /* FIXME: This is the same algorithm as the LD2 version, but LD2
11655      should interleave elements across the two registers while LD1
11656      loads them linearly; vec_load only implements the linear layout.  */
11657   vec_load (cpu, address, 2);
11658 }
11659 
11660 /* Load multiple 1-element structures into three registers.  */
11661 static void
11662 LD1_3 (sim_cpu *cpu, uint64_t address)
11663 {
11664   /* FIXME: This is the same algorithm as the LD3 version, but LD3
11665      should interleave elements across the three registers while LD1
11666      loads them linearly; vec_load only implements the linear layout.  */
11667   vec_load (cpu, address, 3);
11668 }
11669 
11670 /* Load multiple 1-element structures into four registers.  */
11671 static void
11672 LD1_4 (sim_cpu *cpu, uint64_t address)
11673 {
11674   /* FIXME: This is the same algorithm as the LD4 version, but LD4
11675      should interleave elements across the four registers while LD1
11676      loads them linearly; vec_load only implements the linear layout.  */
11677   vec_load (cpu, address, 4);
11678 }
11679 
11680 /* Store multiple N-element structures to N consecutive registers.  */
11681 static void
11682 vec_store (sim_cpu *cpu, uint64_t address, unsigned N)
11683 {
11684   int      all  = INSTR (30, 30);
11685   unsigned size = INSTR (11, 10);
11686   unsigned vd   = INSTR (4, 0);
11687   unsigned i;
11688 
11689   switch (size)
11690     {
11691     case 0: /* 8-bit operations.  */
11692       if (all)
11693 	for (i = 0; i < (16 * N); i++)
11694 	  aarch64_set_mem_u8
11695 	    (cpu, address + i,
11696 	     aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15));
11697       else
11698 	for (i = 0; i < (8 * N); i++)
11699 	  aarch64_set_mem_u8
11700 	    (cpu, address + i,
11701 	     aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7));
11702       return;
11703 
11704     case 1: /* 16-bit operations.  */
11705       if (all)
11706 	for (i = 0; i < (8 * N); i++)
11707 	  aarch64_set_mem_u16
11708 	    (cpu, address + i * 2,
11709 	     aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7));
11710       else
11711 	for (i = 0; i < (4 * N); i++)
11712 	  aarch64_set_mem_u16
11713 	    (cpu, address + i * 2,
11714 	     aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3));
11715       return;
11716 
11717     case 2: /* 32-bit operations.  */
11718       if (all)
11719 	for (i = 0; i < (4 * N); i++)
11720 	  aarch64_set_mem_u32
11721 	    (cpu, address + i * 4,
11722 	     aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3));
11723       else
11724 	for (i = 0; i < (2 * N); i++)
11725 	  aarch64_set_mem_u32
11726 	    (cpu, address + i * 4,
11727 	     aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1));
11728       return;
11729 
11730     case 3: /* 64-bit operations.  */
11731       if (all)
11732 	for (i = 0; i < (2 * N); i++)
11733 	  aarch64_set_mem_u64
11734 	    (cpu, address + i * 8,
11735 	     aarch64_get_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1));
11736       else
11737 	for (i = 0; i < N; i++)
11738 	  aarch64_set_mem_u64
11739 	    (cpu, address + i * 8,
11740 	     aarch64_get_vec_u64 (cpu, vec_reg (vd, i), 0));
11741       return;
11742     }
11743 }
11744 
11745 /* Store multiple 4-element structures to four consecutive registers.  */
11746 static void
11747 ST4 (sim_cpu *cpu, uint64_t address)
11748 {
11749   vec_store (cpu, address, 4);
11750 }
11751 
11752 /* Store multiple 3-element structures to three consecutive registers.  */
11753 static void
11754 ST3 (sim_cpu *cpu, uint64_t address)
11755 {
11756   vec_store (cpu, address, 3);
11757 }
11758 
11759 /* Store multiple 2-element structures to two consecutive registers.  */
11760 static void
11761 ST2 (sim_cpu *cpu, uint64_t address)
11762 {
11763   vec_store (cpu, address, 2);
11764 }
11765 
11766 /* Store multiple 1-element structures into one register.  */
11767 static void
11768 ST1_1 (sim_cpu *cpu, uint64_t address)
11769 {
11770   int      all  = INSTR (30, 30);
11771   unsigned size = INSTR (11, 10);
11772   unsigned vd   = INSTR (4, 0);
11773   unsigned i;
11774 
11775   switch (size)
11776     {
11777     case 0:
11778       for (i = 0; i < (all ? 16 : 8); i++)
11779 	aarch64_set_mem_u8 (cpu, address + i,
11780 			    aarch64_get_vec_u8 (cpu, vd, i));
11781       return;
11782 
11783     case 1:
11784       for (i = 0; i < (all ? 8 : 4); i++)
11785 	aarch64_set_mem_u16 (cpu, address + i * 2,
11786 			     aarch64_get_vec_u16 (cpu, vd, i));
11787       return;
11788 
11789     case 2:
11790       for (i = 0; i < (all ? 4 : 2); i++)
11791 	aarch64_set_mem_u32 (cpu, address + i * 4,
11792 			     aarch64_get_vec_u32 (cpu, vd, i));
11793       return;
11794 
11795     case 3:
11796       for (i = 0; i < (all ? 2 : 1); i++)
11797 	aarch64_set_mem_u64 (cpu, address + i * 8,
11798 			     aarch64_get_vec_u64 (cpu, vd, i));
11799       return;
11800     }
11801 }
11802 
11803 /* Store multiple 1-element structures into two registers.  */
11804 static void
11805 ST1_2 (sim_cpu *cpu, uint64_t address)
11806 {
11807   /* FIXME: This is the same algorithm as the ST2 version, but ST2
11808      should interleave elements from the two registers in memory
11809      while ST1 stores them linearly; vec_store is only linear.  */
11810   vec_store (cpu, address, 2);
11811 }
11812 
11813 /* Store multiple 1-element structures into three registers.  */
11814 static void
11815 ST1_3 (sim_cpu *cpu, uint64_t address)
11816 {
11817   /* FIXME: This is the same algorithm as the ST3 version, but ST3
11818      should interleave elements from the three registers in memory
11819      while ST1 stores them linearly; vec_store is only linear.  */
11820   vec_store (cpu, address, 3);
11821 }
11822 
11823 /* Store multiple 1-element structures into four registers.  */
11824 static void
11825 ST1_4 (sim_cpu *cpu, uint64_t address)
11826 {
11827   /* FIXME: This is the same algorithm as the ST4 version, but ST4
11828      should interleave elements from the four registers in memory
11829      while ST1 stores them linearly; vec_store is only linear.  */
11830   vec_store (cpu, address, 4);
11831 }
11832 
11833 #define LDn_STn_SINGLE_LANE_AND_SIZE()				\
11834   do								\
11835     {								\
11836       switch (INSTR (15, 14))					\
11837 	{							\
11838 	case 0:							\
11839 	  lane = (full << 3) | (s << 2) | size;			\
11840 	  size = 0;						\
11841 	  break;						\
11842 								\
11843 	case 1:							\
11844 	  if ((size & 1) == 1)					\
11845 	    HALT_UNALLOC;					\
11846 	  lane = (full << 2) | (s << 1) | (size >> 1);		\
11847 	  size = 1;						\
11848 	  break;						\
11849 								\
11850 	case 2:							\
11851 	  if ((size & 2) == 2)					\
11852 	    HALT_UNALLOC;					\
11853 								\
11854 	  if ((size & 1) == 0)					\
11855 	    {							\
11856 	      lane = (full << 1) | s;				\
11857 	      size = 2;						\
11858 	    }							\
11859 	  else							\
11860 	    {							\
11861 	      if (s)						\
11862 		HALT_UNALLOC;					\
11863 	      lane = full;					\
11864 	      size = 3;						\
11865 	    }							\
11866 	  break;						\
11867 								\
11868 	default:						\
11869 	  HALT_UNALLOC;						\
11870 	}							\
11871     }								\
11872   while (0)
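
/* Worked example of the macro (editorial note): for
   LD1 {V0.S}[3], [X0] the encoding has INSTR (15, 14) == 2 with the
   size bits clear, and s == 1, full == 1, so the macro yields
   lane = (1 << 1) | 1 = 3 and size = 2, i.e. the 32-bit element in
   lane 3.  */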
11873 
11874 /* Load single structure into one lane of N registers.  */
11875 static void
11876 do_vec_LDn_single (sim_cpu *cpu, uint64_t address)
11877 {
11878   /* instr[31]    = 0
11879      instr[30]    = element selector 0=>half, 1=>all elements
11880      instr[29,24] = 00 1101
11881      instr[23]    = 0=>simple, 1=>post
11882      instr[22]    = 1
11883      instr[21]    = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
11884      instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11885                       11111 (immediate post inc)
11886      instr[15,13] = opcode
11887      instr[12]    = S, used for lane number
11888      instr[11,10] = size, also used for lane number
11889      instr[9,5]   = address
11890      instr[4,0]   = Vd  */
11891 
11892   unsigned full = INSTR (30, 30);
11893   unsigned vd = INSTR (4, 0);
11894   unsigned size = INSTR (11, 10);
11895   unsigned s = INSTR (12, 12);
11896   int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11897   int lane = 0;
11898   int i;
11899 
11900   NYI_assert (29, 24, 0x0D);
11901   NYI_assert (22, 22, 1);
11902 
11903   /* Compute the lane number first (using size), and then compute size.  */
11904   LDn_STn_SINGLE_LANE_AND_SIZE ();
11905 
11906   for (i = 0; i < nregs; i++)
11907     switch (size)
11908       {
11909       case 0:
11910 	{
11911 	  uint8_t val = aarch64_get_mem_u8 (cpu, address + i);
11912 	  aarch64_set_vec_u8 (cpu, vd + i, lane, val);
11913 	  break;
11914 	}
11915 
11916       case 1:
11917 	{
11918 	  uint16_t val = aarch64_get_mem_u16 (cpu, address + (i * 2));
11919 	  aarch64_set_vec_u16 (cpu, vd + i, lane, val);
11920 	  break;
11921 	}
11922 
11923       case 2:
11924 	{
11925 	  uint32_t val = aarch64_get_mem_u32 (cpu, address + (i * 4));
11926 	  aarch64_set_vec_u32 (cpu, vd + i, lane, val);
11927 	  break;
11928 	}
11929 
11930       case 3:
11931 	{
11932 	  uint64_t val = aarch64_get_mem_u64 (cpu, address + (i * 8));
11933 	  aarch64_set_vec_u64 (cpu, vd + i, lane, val);
11934 	  break;
11935 	}
11936       }
11937 }
11938 
11939 /* Store single structure from one lane from N registers.  */
11940 static void
11941 do_vec_STn_single (sim_cpu *cpu, uint64_t address)
11942 {
11943   /* instr[31]    = 0
11944      instr[30]    = element selector 0=>half, 1=>all elements
11945      instr[29,24] = 00 1101
11946      instr[23]    = 0=>simple, 1=>post
11947      instr[22]    = 0
11948      instr[21]    = width: ST1-or-ST3 (0) / ST2-or-ST4 (1)
11949      instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11950                       11111 (immediate post inc)
11951      instr[15,13] = opcode
11952      instr[12]    = S, used for lane number
11953      instr[11,10] = size, also used for lane number
11954      instr[9,5]   = address
11955      instr[4,0]   = Vd  */
11956 
11957   unsigned full = INSTR (30, 30);
11958   unsigned vd = INSTR (4, 0);
11959   unsigned size = INSTR (11, 10);
11960   unsigned s = INSTR (12, 12);
11961   int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11962   int lane = 0;
11963   int i;
11964 
11965   NYI_assert (29, 24, 0x0D);
11966   NYI_assert (22, 22, 0);
11967 
11968   /* Compute the lane number first (using size), and then compute size.  */
11969   LDn_STn_SINGLE_LANE_AND_SIZE ();
11970 
11971   for (i = 0; i < nregs; i++)
11972     switch (size)
11973       {
11974       case 0:
11975 	{
11976 	  uint8_t val = aarch64_get_vec_u8 (cpu, vd + i, lane);
11977 	  aarch64_set_mem_u8 (cpu, address + i, val);
11978 	  break;
11979 	}
11980 
11981       case 1:
11982 	{
11983 	  uint16_t val = aarch64_get_vec_u16 (cpu, vd + i, lane);
11984 	  aarch64_set_mem_u16 (cpu, address + (i * 2), val);
11985 	  break;
11986 	}
11987 
11988       case 2:
11989 	{
11990 	  uint32_t val = aarch64_get_vec_u32 (cpu, vd + i, lane);
11991 	  aarch64_set_mem_u32 (cpu, address + (i * 4), val);
11992 	  break;
11993 	}
11994 
11995       case 3:
11996 	{
11997 	  uint64_t val = aarch64_get_vec_u64 (cpu, vd + i, lane);
11998 	  aarch64_set_mem_u64 (cpu, address + (i * 8), val);
11999 	  break;
12000 	}
12001       }
12002 }
12003 
12004 /* Load single structure into all lanes of N registers.  */
12005 static void
12006 do_vec_LDnR (sim_cpu *cpu, uint64_t address)
12007 {
12008   /* instr[31]    = 0
12009      instr[30]    = element selector 0=>half, 1=>all elements
12010      instr[29,24] = 00 1101
12011      instr[23]    = 0=>simple, 1=>post
12012      instr[22]    = 1
12013      instr[21]    = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1)
12014      instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
12015                       11111 (immediate post inc)
12016      instr[15,14] = 11
12017      instr[13]    = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1)
12018      instr[12]    = 0
12019      instr[11,10] = element size 00=> byte(b), 01=> half(h),
12020                                  10=> word(s), 11=> double(d)
12021      instr[9,5]   = address
12022      instr[4,0]   = Vd  */
12023 
12024   unsigned full = INSTR (30, 30);
12025   unsigned vd = INSTR (4, 0);
12026   unsigned size = INSTR (11, 10);
12027   int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
12028   int i, n;
12029 
12030   NYI_assert (29, 24, 0x0D);
12031   NYI_assert (22, 22, 1);
12032   NYI_assert (15, 14, 3);
12033   NYI_assert (12, 12, 0);
12034 
12035   for (n = 0; n < nregs; n++)
12036     switch (size)
12037       {
12038       case 0:
12039 	{
12040 	  uint8_t val = aarch64_get_mem_u8 (cpu, address + n);
12041 	  for (i = 0; i < (full ? 16 : 8); i++)
12042 	    aarch64_set_vec_u8 (cpu, vd + n, i, val);
12043 	  break;
12044 	}
12045 
12046       case 1:
12047 	{
12048 	  uint16_t val = aarch64_get_mem_u16 (cpu, address + (n * 2));
12049 	  for (i = 0; i < (full ? 8 : 4); i++)
12050 	    aarch64_set_vec_u16 (cpu, vd + n, i, val);
12051 	  break;
12052 	}
12053 
12054       case 2:
12055 	{
12056 	  uint32_t val = aarch64_get_mem_u32 (cpu, address + (n * 4));
12057 	  for (i = 0; i < (full ? 4 : 2); i++)
12058 	    aarch64_set_vec_u32 (cpu, vd + n, i, val);
12059 	  break;
12060 	}
12061 
12062       case 3:
12063 	{
12064 	  uint64_t val = aarch64_get_mem_u64 (cpu, address + (n * 8));
12065 	  for (i = 0; i < (full ? 2 : 1); i++)
12066 	    aarch64_set_vec_u64 (cpu, vd + n, i, val);
12067 	  break;
12068 	}
12069 
12070       default:
12071 	HALT_UNALLOC;
12072       }
12073 }
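
/* Editorial example: LD1R {V0.8H}, [X0] has nregs = 1, size = 1 and
   full = 1, so one 16-bit value is read and replicated into lanes 0-7
   of V0.  */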
12074 
12075 static void
12076 do_vec_load_store (sim_cpu *cpu)
12077 {
12078   /* {LD|ST}<N>   {Vd..Vd+N}, vaddr
12079 
12080      instr[31]    = 0
12081      instr[30]    = element selector 0=>half, 1=>all elements
12082      instr[29,25] = 00110
12083      instr[24]    = 0=>multiple struct, 1=>single struct
12084      instr[23]    = 0=>simple, 1=>post
12085      instr[22]    = 0=>store, 1=>load
12086      instr[21]    = 0 (LDn) / small(0)-large(1) selector (LDnR)
12087      instr[20,16] = 00000 (simple), Vinc (reg-post-inc, no SP),
12088                     11111 (immediate post inc)
12089      instr[15,12] = elements and destinations, e.g. for load:
12090                      0000=>LD4 => load multiple 4-element to
12091 		     four consecutive registers
12092                      0100=>LD3 => load multiple 3-element to
12093 		     three consecutive registers
12094                      1000=>LD2 => load multiple 2-element to
12095 		     two consecutive registers
12096                      0010=>LD1 => load multiple 1-element to
12097 		     four consecutive registers
12098                      0110=>LD1 => load multiple 1-element to
12099 		     three consecutive registers
12100                      1010=>LD1 => load multiple 1-element to
12101 		     two consecutive registers
12102                      0111=>LD1 => load multiple 1-element to
12103 		     one register
12104                      1100=>LD1R,LD2R
12105                      1110=>LD3R,LD4R
12106      instr[11,10] = element size 00=> byte(b), 01=> half(h),
12107                                  10=> word(s), 11=> double(d)
12108      instr[9,5]   = Vn, can be SP
12109      instr[4,0]   = Vd  */
12110 
12111   int single;
12112   int post;
12113   int load;
12114   unsigned vn;
12115   uint64_t address;
12116   int type;
12117 
12118   if (INSTR (31, 31) != 0 || INSTR (29, 25) != 0x06)
12119     HALT_NYI;
12120 
12121   single = INSTR (24, 24);
12122   post = INSTR (23, 23);
12123   load = INSTR (22, 22);
12124   type = INSTR (15, 12);
12125   vn = INSTR (9, 5);
12126   address = aarch64_get_reg_u64 (cpu, vn, SP_OK);
12127 
12128   if (! single && INSTR (21, 21) != 0)
12129     HALT_UNALLOC;
12130 
12131   if (post)
12132     {
12133       unsigned vm = INSTR (20, 16);
12134 
12135       if (vm == R31)
12136 	{
12137 	  unsigned sizeof_operation;
12138 
12139 	  if (single)
12140 	    {
12141 	      if ((type >= 0) && (type <= 11))
12142 		{
12143 		  int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
12144 		  switch (INSTR (15, 14))
12145 		    {
12146 		    case 0:
12147 		      sizeof_operation = nregs * 1;
12148 		      break;
12149 		    case 1:
12150 		      sizeof_operation = nregs * 2;
12151 		      break;
12152 		    case 2:
12153 		      if (INSTR (10, 10) == 0)
12154 			sizeof_operation = nregs * 4;
12155 		      else
12156 			sizeof_operation = nregs * 8;
12157 		      break;
12158 		    default:
12159 		      HALT_UNALLOC;
12160 		    }
12161 		}
12162 	      else if (type == 0xC)
12163 		{
12164 		  sizeof_operation = INSTR (21, 21) ? 2 : 1;
12165 		  sizeof_operation <<= INSTR (11, 10);
12166 		}
12167 	      else if (type == 0xE)
12168 		{
12169 		  sizeof_operation = INSTR (21, 21) ? 4 : 3;
12170 		  sizeof_operation <<= INSTR (11, 10);
12171 		}
12172 	      else
12173 		HALT_UNALLOC;
12174 	    }
12175 	  else
12176 	    {
12177 	      switch (type)
12178 		{
12179 		case 0: sizeof_operation = 32; break;
12180 		case 4: sizeof_operation = 24; break;
12181 		case 8: sizeof_operation = 16; break;
12182 
12183 		case 7:
12184 		  /* One register, immediate offset variant.  */
12185 		  sizeof_operation = 8;
12186 		  break;
12187 
12188 		case 10:
12189 		  /* Two registers, immediate offset variant.  */
12190 		  sizeof_operation = 16;
12191 		  break;
12192 
12193 		case 6:
12194 		  /* Three registers, immediate offset variant.  */
12195 		  sizeof_operation = 24;
12196 		  break;
12197 
12198 		case 2:
12199 		  /* Four registers, immediate offset variant.  */
12200 		  sizeof_operation = 32;
12201 		  break;
12202 
12203 		default:
12204 		  HALT_UNALLOC;
12205 		}
12206 
12207 	      if (INSTR (30, 30))
12208 		sizeof_operation *= 2;
12209 	    }
12210 
12211 	  aarch64_set_reg_u64 (cpu, vn, SP_OK, address + sizeof_operation);
12212 	}
12213       else
12214 	aarch64_set_reg_u64 (cpu, vn, SP_OK,
12215 			     address + aarch64_get_reg_u64 (cpu, vm, NO_SP));
12216     }
12217   else
12218     {
12219       NYI_assert (20, 16, 0);
12220     }
12221 
12222   if (single)
12223     {
12224       if (load)
12225 	{
12226 	  if ((type >= 0) && (type <= 11))
12227 	    do_vec_LDn_single (cpu, address);
12228 	  else if ((type == 0xC) || (type == 0xE))
12229 	    do_vec_LDnR (cpu, address);
12230 	  else
12231 	    HALT_UNALLOC;
12232 	  return;
12233 	}
12234 
12235       /* Stores.  */
12236       if ((type >= 0) && (type <= 11))
12237 	{
12238 	  do_vec_STn_single (cpu, address);
12239 	  return;
12240 	}
12241 
12242       HALT_UNALLOC;
12243     }
12244 
12245   if (load)
12246     {
12247       switch (type)
12248 	{
12249 	case 0:  LD4 (cpu, address); return;
12250 	case 4:  LD3 (cpu, address); return;
12251 	case 8:  LD2 (cpu, address); return;
12252 	case 2:  LD1_4 (cpu, address); return;
12253 	case 6:  LD1_3 (cpu, address); return;
12254 	case 10: LD1_2 (cpu, address); return;
12255 	case 7:  LD1_1 (cpu, address); return;
12256 
12257 	default:
12258 	  HALT_UNALLOC;
12259 	}
12260     }
12261 
12262   /* Stores.  */
12263   switch (type)
12264     {
12265     case 0:  ST4 (cpu, address); return;
12266     case 4:  ST3 (cpu, address); return;
12267     case 8:  ST2 (cpu, address); return;
12268     case 2:  ST1_4 (cpu, address); return;
12269     case 6:  ST1_3 (cpu, address); return;
12270     case 10: ST1_2 (cpu, address); return;
12271     case 7:  ST1_1 (cpu, address); return;
12272     default:
12273       HALT_UNALLOC;
12274     }
12275 }
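
/* Post-increment example (editorial note): LD4 {V0.16B-V3.16B},
   [X0], #64 is a multiple-structure access with type = 0, so
   sizeof_operation starts at 32 and doubles to 64 because instr[30]
   selects the 128-bit form; X0 advances by the 64 bytes that one full
   LD4 pass consumes.  */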
12276 
12277 static void
12278 dexLdSt (sim_cpu *cpu)
12279 {
12280   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
12281      assert  group == GROUP_LDST_0100 || group == GROUP_LDST_0110 ||
12282              group == GROUP_LDST_1100 || group == GROUP_LDST_1110
12283      bits [29,28] and [26] of a LS are the secondary dispatch vector.  */
12284   uint32_t group2 = dispatchLS (aarch64_get_instr (cpu));
12285 
12286   switch (group2)
12287     {
12288     case LS_EXCL_000:
12289       dexLoadExclusive (cpu); return;
12290 
12291     case LS_LIT_010:
12292     case LS_LIT_011:
12293       dexLoadLiteral (cpu); return;
12294 
12295     case LS_OTHER_110:
12296     case LS_OTHER_111:
12297       dexLoadOther (cpu); return;
12298 
12299     case LS_ADVSIMD_001:
12300       do_vec_load_store (cpu); return;
12301 
12302     case LS_PAIR_100:
12303       dex_load_store_pair_gr (cpu); return;
12304 
12305     case LS_PAIR_101:
12306       dex_load_store_pair_fp (cpu); return;
12307 
12308     default:
12309       /* Should never reach here.  */
12310       HALT_NYI;
12311     }
12312 }
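
/* Editorial note: assuming dispatchLS concatenates bits [29,28] and
   [26] as the comment above says, LDR X0, [X1, #8] yields 11 and 0,
   i.e. LS_OTHER_110 and dexLoadOther, while LD1 {V0.16B}, [X0] yields
   00 and 1, i.e. LS_ADVSIMD_001 and do_vec_load_store.  */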
12313 
12314 /* Specific decode and execute for group Data Processing Register.  */
12315 
12316 static void
12317 dexLogicalShiftedRegister (sim_cpu *cpu)
12318 {
12319   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
12320      instr[30,29] = op
12321      instr[28,24] = 01010
12322      instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> ROR
12323      instr[21]    = N
12324      instr[20,16] = Rm
12325      instr[15,10] = count : must be 0xxxxx for 32 bit
12326      instr[9,5]   = Rn
12327      instr[4,0]   = Rd  */
12328 
12329   uint32_t size      = INSTR (31, 31);
12330   Shift    shiftType = INSTR (23, 22);
12331   uint32_t count     = INSTR (15, 10);
12332 
12333   /* 32 bit operations must have count[5] = 0.
12334      or else we have an UNALLOC.  */
12335   if (size == 0 && uimm (count, 5, 5))
12336     HALT_UNALLOC;
12337 
12338   /* Dispatch on size:op:N.  */
12339   switch ((INSTR (31, 29) << 1) | INSTR (21, 21))
12340     {
12341     case 0: and32_shift  (cpu, shiftType, count); return;
12342     case 1: bic32_shift  (cpu, shiftType, count); return;
12343     case 2: orr32_shift  (cpu, shiftType, count); return;
12344     case 3: orn32_shift  (cpu, shiftType, count); return;
12345     case 4: eor32_shift  (cpu, shiftType, count); return;
12346     case 5: eon32_shift  (cpu, shiftType, count); return;
12347     case 6: ands32_shift (cpu, shiftType, count); return;
12348     case 7: bics32_shift (cpu, shiftType, count); return;
12349     case 8: and64_shift  (cpu, shiftType, count); return;
12350     case 9: bic64_shift  (cpu, shiftType, count); return;
12351     case 10:orr64_shift  (cpu, shiftType, count); return;
12352     case 11:orn64_shift  (cpu, shiftType, count); return;
12353     case 12:eor64_shift  (cpu, shiftType, count); return;
12354     case 13:eon64_shift  (cpu, shiftType, count); return;
12355     case 14:ands64_shift (cpu, shiftType, count); return;
12356     case 15:bics64_shift (cpu, shiftType, count); return;
12357     }
12358 }
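
/* Dispatch example (editorial illustration): BIC X0, X1, X2 has
   size = 1, op = 00 and N = 1, so (INSTR (31, 29) << 1) | N is
   (4 << 1) | 1 = 9 and bic64_shift runs with shiftType = LSL and
   count = 0.  */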
12359 
12360 /* 32 bit conditional select.  */
12361 static void
12362 csel32 (sim_cpu *cpu, CondCode cc)
12363 {
12364   unsigned rm = INSTR (20, 16);
12365   unsigned rn = INSTR (9, 5);
12366   unsigned rd = INSTR (4, 0);
12367 
12368   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12369 		       testConditionCode (cpu, cc)
12370 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12371 		       : aarch64_get_reg_u32 (cpu, rm, NO_SP));
12372 }
12373 
12374 /* 64 bit conditional select.  */
12375 static void
12376 csel64 (sim_cpu *cpu, CondCode cc)
12377 {
12378   unsigned rm = INSTR (20, 16);
12379   unsigned rn = INSTR (9, 5);
12380   unsigned rd = INSTR (4, 0);
12381 
12382   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12383 		       testConditionCode (cpu, cc)
12384 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12385 		       : aarch64_get_reg_u64 (cpu, rm, NO_SP));
12386 }
12387 
12388 /* 32 bit conditional increment.  */
12389 static void
12390 csinc32 (sim_cpu *cpu, CondCode cc)
12391 {
12392   unsigned rm = INSTR (20, 16);
12393   unsigned rn = INSTR (9, 5);
12394   unsigned rd = INSTR (4, 0);
12395 
12396   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12397 		       testConditionCode (cpu, cc)
12398 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12399 		       : aarch64_get_reg_u32 (cpu, rm, NO_SP) + 1);
12400 }
12401 
12402 /* 64 bit conditional increment.  */
12403 static void
12404 csinc64 (sim_cpu *cpu, CondCode cc)
12405 {
12406   unsigned rm = INSTR (20, 16);
12407   unsigned rn = INSTR (9, 5);
12408   unsigned rd = INSTR (4, 0);
12409 
12410   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12411 		       testConditionCode (cpu, cc)
12412 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12413 		       : aarch64_get_reg_u64 (cpu, rm, NO_SP) + 1);
12414 }
12415 
12416 /* 32 bit conditional invert.  */
12417 static void
12418 csinv32 (sim_cpu *cpu, CondCode cc)
12419 {
12420   unsigned rm = INSTR (20, 16);
12421   unsigned rn = INSTR (9, 5);
12422   unsigned rd = INSTR (4, 0);
12423 
12424   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12425 		       testConditionCode (cpu, cc)
12426 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12427 		       : ~ aarch64_get_reg_u32 (cpu, rm, NO_SP));
12428 }
12429 
12430 /* 64 bit conditional invert.  */
12431 static void
12432 csinv64 (sim_cpu *cpu, CondCode cc)
12433 {
12434   unsigned rm = INSTR (20, 16);
12435   unsigned rn = INSTR (9, 5);
12436   unsigned rd = INSTR (4, 0);
12437 
12438   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12439 		       testConditionCode (cpu, cc)
12440 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12441 		       : ~ aarch64_get_reg_u64 (cpu, rm, NO_SP));
12442 }
12443 
12444 /* 32 bit conditional negate.  */
12445 static void
12446 csneg32 (sim_cpu *cpu, CondCode cc)
12447 {
12448   unsigned rm = INSTR (20, 16);
12449   unsigned rn = INSTR (9, 5);
12450   unsigned rd = INSTR (4, 0);
12451 
12452   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12453 		       testConditionCode (cpu, cc)
12454 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12455 		       : - aarch64_get_reg_u32 (cpu, rm, NO_SP));
12456 }
12457 
12458 /* 64 bit conditional negate.  */
12459 static void
12460 csneg64 (sim_cpu *cpu, CondCode cc)
12461 {
12462   unsigned rm = INSTR (20, 16);
12463   unsigned rn = INSTR (9, 5);
12464   unsigned rd = INSTR (4, 0);
12465 
12466   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12467 		       testConditionCode (cpu, cc)
12468 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12469 		       : - aarch64_get_reg_u64 (cpu, rm, NO_SP));
12470 }
12471 
12472 static void
12473 dexCondSelect (sim_cpu *cpu)
12474 {
12475   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
12476      instr[30],instr[11,10] = op : 000 ==> CSEL, 001 ==> CSINC,
12477                                    100 ==> CSINV, 101 ==> CSNEG,
12478                                    _1_ ==> UNALLOC
12479      instr[29]    = S : 0 ==> ok, 1 ==> UNALLOC
12480      instr[28,21] = 11010100
12481      instr[20,16] = Rm
12482      instr[15,12] = cond  */
12483 
12484   CondCode cc = INSTR (15, 12);
12485   uint32_t S = INSTR (29, 29);
12486   uint32_t op2 = INSTR (11, 10);
12487 
12488   if (S == 1)
12489     HALT_UNALLOC;
12490 
12491   if (op2 & 0x2)
12492     HALT_UNALLOC;
12493 
12494   switch ((INSTR (31, 30) << 1) | op2)
12495     {
12496     case 0: csel32  (cpu, cc); return;
12497     case 1: csinc32 (cpu, cc); return;
12498     case 2: csinv32 (cpu, cc); return;
12499     case 3: csneg32 (cpu, cc); return;
12500     case 4: csel64  (cpu, cc); return;
12501     case 5: csinc64 (cpu, cc); return;
12502     case 6: csinv64 (cpu, cc); return;
12503     case 7: csneg64 (cpu, cc); return;
12504     }
12505 }
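
/* Editorial example: CSINC W0, W1, W2, NE has size = 0, op = 0 and
   op2 = 01, so the switch above computes (0 << 1) | 1 = 1 and calls
   csinc32.  The CINC Wd, Wn, cond alias is this same encoding with
   Rm = Rn and the condition inverted.  */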
12506 
12507 /* Some helpers for counting leading 1 or 0 bits.  */
12508 
12509 /* Counts the number of leading bits which are the same
12510    in a 32 bit value in the range 1 to 32.  */
12511 static uint32_t
12512 leading32 (uint32_t value)
12513 {
12514   int32_t mask = 0xffff0000;
12515   uint32_t count = 16; /* Counts number of bits set in mask.  */
12516   uint32_t lo = 1;    /* Lower bound for number of sign bits.  */
12517   uint32_t hi = 32;   /* Upper bound for number of sign bits.  */
12518 
12519   while (lo + 1 < hi)
12520     {
12521       int32_t test = (value & mask);
12522 
12523       if (test == 0 || test == mask)
12524 	{
12525 	  lo = count;
12526 	  count = (lo + hi) / 2;
12527 	  mask >>= (count - lo);
12528 	}
12529       else
12530 	{
12531 	  hi = count;
12532 	  count = (lo + hi) / 2;
12533 	  mask <<= hi - count;
12534 	}
12535     }
12536 
12537   if (lo != hi)
12538     {
12539       int32_t test;
12540 
12541       mask >>= 1;
12542       test = (value & mask);
12543 
12544       if (test == 0 || test == mask)
12545 	count = hi;
12546       else
12547 	count = lo;
12548     }
12549 
12550   return count;
12551 }
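
/* For example (editorial note): leading32 (0xFFFFFFF0) returns 28,
   since bits 31 down to 4 all match the sign bit, and cls32 below then
   reports 27 after excluding the sign bit itself; for 0 or -1 the
   search returns the full width, 32.  */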
12552 
12553 /* Counts the number of leading bits which are the same
12554    in a 64 bit value in the range 1 to 64.  */
12555 static uint64_t
12556 leading64 (uint64_t value)
12557 {
12558   int64_t mask = 0xffffffff00000000LL;
12559   uint64_t count = 32; /* Counts number of bits set in mask.  */
12560   uint64_t lo = 1;     /* Lower bound for number of sign bits.  */
12561   uint64_t hi = 64;    /* Upper bound for number of sign bits.  */
12562 
12563   while (lo + 1 < hi)
12564     {
12565       int64_t test = (value & mask);
12566 
12567       if (test == 0 || test == mask)
12568 	{
12569 	  lo = count;
12570 	  count = (lo + hi) / 2;
12571 	  mask >>= (count - lo);
12572 	}
12573       else
12574 	{
12575 	  hi = count;
12576 	  count = (lo + hi) / 2;
12577 	  mask <<= hi - count;
12578 	}
12579     }
12580 
12581   if (lo != hi)
12582     {
12583       int64_t test;
12584 
12585       mask >>= 1;
12586       test = (value & mask);
12587 
12588       if (test == 0 || test == mask)
12589 	count = hi;
12590       else
12591 	count = lo;
12592     }
12593 
12594   return count;
12595 }
12596 
12597 /* Bit operations.  */
12598 /* N.B register args may not be SP.  */
12599 
12600 /* 32 bit count leading sign bits.  */
12601 static void
12602 cls32 (sim_cpu *cpu)
12603 {
12604   unsigned rn = INSTR (9, 5);
12605   unsigned rd = INSTR (4, 0);
12606 
12607   /* N.B. the result needs to exclude the leading bit.  */
12608   aarch64_set_reg_u64
12609     (cpu, rd, NO_SP, leading32 (aarch64_get_reg_u32 (cpu, rn, NO_SP)) - 1);
12610 }
12611 
12612 /* 64 bit count leading sign bits.  */
12613 static void
12614 cls64 (sim_cpu *cpu)
12615 {
12616   unsigned rn = INSTR (9, 5);
12617   unsigned rd = INSTR (4, 0);
12618 
12619   /* N.B. the result needs to exclude the leading bit.  */
12620   aarch64_set_reg_u64
12621     (cpu, rd, NO_SP, leading64 (aarch64_get_reg_u64 (cpu, rn, NO_SP)) - 1);
12622 }
12623 
12624 /* 32 bit count leading zero bits.  */
12625 static void
12626 clz32 (sim_cpu *cpu)
12627 {
12628   unsigned rn = INSTR (9, 5);
12629   unsigned rd = INSTR (4, 0);
12630   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12631 
12632   /* if the sign (top) bit is set then the count is 0.  */
12633   if (pick32 (value, 31, 31))
12634     aarch64_set_reg_u64 (cpu, rd, NO_SP, 0L);
12635   else
12636     aarch64_set_reg_u64 (cpu, rd, NO_SP, leading32 (value));
12637 }
12638 
12639 /* 64 bit count leading zero bits.  */
12640 static void
12641 clz64 (sim_cpu *cpu)
12642 {
12643   unsigned rn = INSTR (9, 5);
12644   unsigned rd = INSTR (4, 0);
12645   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12646 
12647   /* if the sign (top) bit is set then the count is 0.  */
12648   if (pick64 (value, 63, 63))
12649     aarch64_set_reg_u64 (cpu, rd, NO_SP, 0L);
12650   else
12651     aarch64_set_reg_u64 (cpu, rd, NO_SP, leading64 (value));
12652 }
12653 
12654 /* 32 bit reverse bits.  */
12655 static void
12656 rbit32 (sim_cpu *cpu)
12657 {
12658   unsigned rn = INSTR (9, 5);
12659   unsigned rd = INSTR (4, 0);
12660   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12661   uint32_t result = 0;
12662   int i;
12663 
12664   for (i = 0; i < 32; i++)
12665     {
12666       result <<= 1;
12667       result |= (value & 1);
12668       value >>= 1;
12669     }
12670   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12671 }
12672 
12673 /* 64 bit reverse bits.  */
12674 static void
12675 rbit64 (sim_cpu *cpu)
12676 {
12677   unsigned rn = INSTR (9, 5);
12678   unsigned rd = INSTR (4, 0);
12679   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12680   uint64_t result = 0;
12681   int i;
12682 
12683   for (i = 0; i < 64; i++)
12684     {
12685       result <<= 1;
12686       result |= (value & 1UL);
12687       value >>= 1;
12688     }
12689   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12690 }
12691 
12692 /* 32 bit reverse bytes.  */
12693 static void
12694 rev32 (sim_cpu *cpu)
12695 {
12696   unsigned rn = INSTR (9, 5);
12697   unsigned rd = INSTR (4, 0);
12698   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12699   uint32_t result = 0;
12700   int i;
12701 
12702   for (i = 0; i < 4; i++)
12703     {
12704       result <<= 8;
12705       result |= (value & 0xff);
12706       value >>= 8;
12707     }
12708   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12709 }
12710 
12711 /* 64 bit reverse bytes.  */
12712 static void
12713 rev64 (sim_cpu *cpu)
12714 {
12715   unsigned rn = INSTR (9, 5);
12716   unsigned rd = INSTR (4, 0);
12717   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12718   uint64_t result = 0;
12719   int i;
12720 
12721   for (i = 0; i < 8; i++)
12722     {
12723       result <<= 8;
12724       result |= (value & 0xffULL);
12725       value >>= 8;
12726     }
12727   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12728 }
12729 
12730 /* 32 bit reverse shorts.  */
12731 /* N.B. this reverses the order of the bytes in each half word.  */
12732 static void
12733 revh32 (sim_cpu *cpu)
12734 {
12735   unsigned rn = INSTR (9, 5);
12736   unsigned rd = INSTR (4, 0);
12737   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12738   uint32_t result = 0;
12739   int i;
12740 
12741   for (i = 0; i < 2; i++)
12742     {
12743       result <<= 8;
12744       result |= (value & 0x00ff00ff);
12745       value >>= 8;
12746     }
12747   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12748 }
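
/* A worked pass of the loop above (editorial): for value 0xAABBCCDD
   the first iteration keeps 0x00BB00DD, the second shifts that up a
   byte and ORs in 0x00AA00CC, giving 0xBBAADDCC - each halfword
   byte-swapped in place.  */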
12749 
12750 /* 64 bit reverse shorts.  */
12751 /* N.B. this reverses the order of the bytes in each half word.  */
12752 static void
12753 revh64 (sim_cpu *cpu)
12754 {
12755   unsigned rn = INSTR (9, 5);
12756   unsigned rd = INSTR (4, 0);
12757   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12758   uint64_t result = 0;
12759   int i;
12760 
12761   for (i = 0; i < 2; i++)
12762     {
12763       result <<= 8;
12764       result |= (value & 0x00ff00ff00ff00ffULL);
12765       value >>= 8;
12766     }
12767   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12768 }
12769 
12770 static void
12771 dexDataProc1Source (sim_cpu *cpu)
12772 {
12773   /* instr[30]    = 1
12774      instr[28,21] = 11010110
12775      instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
12776      instr[29]    = S : 0 ==> ok, 1 ==> UNALLOC
12777      instr[20,16] = opcode2 : 00000 ==> ok, ow ==> UNALLOC
12778      instr[15,10] = opcode : 000000 ==> RBIT, 000001 ==> REV16,
12779                              000010 ==> REV, 000011 ==> UNALLOC
12780                              000100 ==> CLZ, 000101 ==> CLS
12781                              ow ==> UNALLOC
12782      instr[9,5]   = rn : may not be SP
12783      instr[4,0]   = rd : may not be SP.  */
12784 
12785   uint32_t S = INSTR (29, 29);
12786   uint32_t opcode2 = INSTR (20, 16);
12787   uint32_t opcode = INSTR (15, 10);
12788   uint32_t dispatch = ((INSTR (31, 31) << 3) | opcode);
12789 
12790   if (S == 1)
12791     HALT_UNALLOC;
12792 
12793   if (opcode2 != 0)
12794     HALT_UNALLOC;
12795 
12796   if (opcode & 0x38)
12797     HALT_UNALLOC;
12798 
12799   switch (dispatch)
12800     {
12801     case 0: rbit32 (cpu); return;
12802     case 1: revh32 (cpu); return;
12803     case 2: rev32 (cpu); return;
12804     case 4: clz32 (cpu); return;
12805     case 5: cls32 (cpu); return;
12806     case 8: rbit64 (cpu); return;
12807     case 9: revh64 (cpu); return;
12808     case 10:rev32 (cpu); return; /* FIXME: the 64-bit REV32 should byte-swap each word; this drops the high word.  */
12809     case 11:rev64 (cpu); return;
12810     case 12:clz64 (cpu); return;
12811     case 13:cls64 (cpu); return;
12812     default: HALT_UNALLOC;
12813     }
12814 }
12815 
12816 /* Variable shift.
12817    Shifts by count supplied in register.
12818    N.B register args may not be SP.
12819    These all use the shifted auxiliary function for
12820    simplicity and clarity.  Writing the actual shift
12821    inline would avoid a branch and so be faster but
12822    would also necessitate getting signs right.  */
12823 
12824 /* 32 bit arithmetic shift right.  */
12825 static void
12826 asrv32 (sim_cpu *cpu)
12827 {
12828   unsigned rm = INSTR (20, 16);
12829   unsigned rn = INSTR (9, 5);
12830   unsigned rd = INSTR (4, 0);
12831 
12832   aarch64_set_reg_u64
12833     (cpu, rd, NO_SP,
12834      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), ASR,
12835 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12836 }
12837 
12838 /* 64 bit arithmetic shift right.  */
12839 static void
12840 asrv64 (sim_cpu *cpu)
12841 {
12842   unsigned rm = INSTR (20, 16);
12843   unsigned rn = INSTR (9, 5);
12844   unsigned rd = INSTR (4, 0);
12845 
12846   aarch64_set_reg_u64
12847     (cpu, rd, NO_SP,
12848      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), ASR,
12849 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12850 }
12851 
12852 /* 32 bit logical shift left.  */
12853 static void
12854 lslv32 (sim_cpu *cpu)
12855 {
12856   unsigned rm = INSTR (20, 16);
12857   unsigned rn = INSTR (9, 5);
12858   unsigned rd = INSTR (4, 0);
12859 
12860   aarch64_set_reg_u64
12861     (cpu, rd, NO_SP,
12862      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), LSL,
12863 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12864 }
12865 
12866 /* 64 bit logical shift left.  */
12867 static void
12868 lslv64 (sim_cpu *cpu)
12869 {
12870   unsigned rm = INSTR (20, 16);
12871   unsigned rn = INSTR (9, 5);
12872   unsigned rd = INSTR (4, 0);
12873 
12874   aarch64_set_reg_u64
12875     (cpu, rd, NO_SP,
12876      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), LSL,
12877 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12878 }
12879 
12880 /* 32 bit logical shift right.  */
12881 static void
12882 lsrv32 (sim_cpu *cpu)
12883 {
12884   unsigned rm = INSTR (20, 16);
12885   unsigned rn = INSTR (9, 5);
12886   unsigned rd = INSTR (4, 0);
12887 
12888   aarch64_set_reg_u64
12889     (cpu, rd, NO_SP,
12890      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), LSR,
12891 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12892 }
12893 
12894 /* 64 bit logical shift right.  */
12895 static void
12896 lsrv64 (sim_cpu *cpu)
12897 {
12898   unsigned rm = INSTR (20, 16);
12899   unsigned rn = INSTR (9, 5);
12900   unsigned rd = INSTR (4, 0);
12901 
12902   aarch64_set_reg_u64
12903     (cpu, rd, NO_SP,
12904      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), LSR,
12905 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12906 }
12907 
12908 /* 32 bit rotate right.  */
12909 static void
12910 rorv32 (sim_cpu *cpu)
12911 {
12912   unsigned rm = INSTR (20, 16);
12913   unsigned rn = INSTR (9, 5);
12914   unsigned rd = INSTR (4, 0);
12915 
12916   aarch64_set_reg_u64
12917     (cpu, rd, NO_SP,
12918      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), ROR,
12919 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12920 }
12921 
12922 /* 64 bit rotate right.  */
12923 static void
12924 rorv64 (sim_cpu *cpu)
12925 {
12926   unsigned rm = INSTR (20, 16);
12927   unsigned rn = INSTR (9, 5);
12928   unsigned rd = INSTR (4, 0);
12929 
12930   aarch64_set_reg_u64
12931     (cpu, rd, NO_SP,
12932      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), ROR,
12933 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12934 }
12935 
12936 
12937 /* Divide.  */
12938 
12939 /* 32 bit signed divide (SDIV).  */
12940 static void
12941 sdiv32 (sim_cpu *cpu)
12942 {
12943   unsigned rm = INSTR (20, 16);
12944   unsigned rn = INSTR (9, 5);
12945   unsigned rd = INSTR (4, 0);
12946   /* N.B. the pseudo-code does the divide using 64 bit data.  */
12947   /* TODO : check that this rounds towards zero as required.  */
12948   int64_t dividend = aarch64_get_reg_s32 (cpu, rn, NO_SP);
12949   int64_t divisor = aarch64_get_reg_s32 (cpu, rm, NO_SP);
12950 
12951   aarch64_set_reg_s64 (cpu, rd, NO_SP,
12952 		       divisor ? ((int32_t) (dividend / divisor)) : 0);
12953 }
12954 
12955 /* 64 bit signed divide (SDIV).  */
12956 static void
12957 sdiv64 (sim_cpu *cpu)
12958 {
12959   unsigned rm = INSTR (20, 16);
12960   unsigned rn = INSTR (9, 5);
12961   unsigned rd = INSTR (4, 0);
12962 
12963   /* TODO : check that this rounds towards zero as required.  */
12964   int64_t divisor = aarch64_get_reg_s64 (cpu, rm, NO_SP);
12965 
12966   aarch64_set_reg_s64
12967     (cpu, rd, NO_SP,
12968      divisor ? (aarch64_get_reg_s64 (cpu, rn, NO_SP) / divisor) : 0);
12969 }
12970 
12971 /* 32 bit unsigned divide.  */
12972 static void
12973 udiv32 (sim_cpu *cpu)
12974 {
12975   unsigned rm = INSTR (20, 16);
12976   unsigned rn = INSTR (9, 5);
12977   unsigned rd = INSTR (4, 0);
12978 
12979   /* N.B. the pseudo-code does the divide using 64 bit data.  */
12980   uint64_t dividend = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12981   uint64_t divisor  = aarch64_get_reg_u32 (cpu, rm, NO_SP);
12982 
12983   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12984 		       divisor ? (uint32_t) (dividend / divisor) : 0);
12985 }
12986 
12987 /* 64 bit unsigned divide.  */
12988 static void
12989 udiv64 (sim_cpu *cpu)
12990 {
12991   unsigned rm = INSTR (20, 16);
12992   unsigned rn = INSTR (9, 5);
12993   unsigned rd = INSTR (4, 0);
12994 
12995   /* TODO : check that this rounds towards zero as required.  */
12996   uint64_t divisor = aarch64_get_reg_u64 (cpu, rm, NO_SP);
12997 
12998   aarch64_set_reg_u64
12999     (cpu, rd, NO_SP,
13000      divisor ? (aarch64_get_reg_u64 (cpu, rn, NO_SP) / divisor) : 0);
13001 }
13002 
13003 static void
13004 dexDataProc2Source (sim_cpu *cpu)
13005 {
13006   /* assert instr[30] == 0
13007      instr[28,21] == 11010110
13008      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
13009      instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
13010      instr[15,10] = opcode : 000010 ==> UDIV, 000011 ==> SDIV,
13011                              001000 ==> LSLV, 001001 ==> LSRV
13012                              001010 ==> ASRV, 001011 ==> RORV
13013                              ow ==> UNALLOC.  */
13014 
13015   uint32_t dispatch;
13016   uint32_t S = INSTR (29, 29);
13017   uint32_t opcode = INSTR (15, 10);
13018 
13019   if (S == 1)
13020     HALT_UNALLOC;
13021 
13022   if (opcode & 0x34)
13023     HALT_UNALLOC;
13024 
13025   dispatch = (  (INSTR (31, 31) << 3)
13026 	      | (uimm (opcode, 3, 3) << 2)
13027 	      |  uimm (opcode, 1, 0));
13028   switch (dispatch)
13029     {
13030     case 2:  udiv32 (cpu); return;
13031     case 3:  sdiv32 (cpu); return;
13032     case 4:  lslv32 (cpu); return;
13033     case 5:  lsrv32 (cpu); return;
13034     case 6:  asrv32 (cpu); return;
13035     case 7:  rorv32 (cpu); return;
13036     case 10: udiv64 (cpu); return;
13037     case 11: sdiv64 (cpu); return;
13038     case 12: lslv64 (cpu); return;
13039     case 13: lsrv64 (cpu); return;
13040     case 14: asrv64 (cpu); return;
13041     case 15: rorv64 (cpu); return;
13042     default: HALT_UNALLOC;
13043     }
13044 }
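
/* Worked dispatch example (illustrative): a 64 bit UDIV has
   instr[31] == 1 and opcode == 000010, so dispatch is
   (1 << 3) | (0 << 2) | 2 = 10 and udiv64 is selected above.  */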
13045 
13046 
13047 /* Multiply.  */
13048 
13049 /* 32 bit multiply and add.  */
13050 static void
13051 madd32 (sim_cpu *cpu)
13052 {
13053   unsigned rm = INSTR (20, 16);
13054   unsigned ra = INSTR (14, 10);
13055   unsigned rn = INSTR (9, 5);
13056   unsigned rd = INSTR (4, 0);
13057 
13058   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13059   aarch64_set_reg_u64 (cpu, rd, NO_SP,
13060 		       aarch64_get_reg_u32 (cpu, ra, NO_SP)
13061 		       + aarch64_get_reg_u32 (cpu, rn, NO_SP)
13062 		       * aarch64_get_reg_u32 (cpu, rm, NO_SP));
13063 }
13064 
13065 /* 64 bit multiply and add.  */
13066 static void
13067 madd64 (sim_cpu *cpu)
13068 {
13069   unsigned rm = INSTR (20, 16);
13070   unsigned ra = INSTR (14, 10);
13071   unsigned rn = INSTR (9, 5);
13072   unsigned rd = INSTR (4, 0);
13073 
13074   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13075   aarch64_set_reg_u64 (cpu, rd, NO_SP,
13076 		       aarch64_get_reg_u64 (cpu, ra, NO_SP)
13077 		       + (aarch64_get_reg_u64 (cpu, rn, NO_SP)
13078 			  * aarch64_get_reg_u64 (cpu, rm, NO_SP)));
13079 }
13080 
13081 /* 32 bit multiply and sub.  */
13082 static void
13083 msub32 (sim_cpu *cpu)
13084 {
13085   unsigned rm = INSTR (20, 16);
13086   unsigned ra = INSTR (14, 10);
13087   unsigned rn = INSTR (9, 5);
13088   unsigned rd = INSTR (4, 0);
13089 
13090   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13091   aarch64_set_reg_u64 (cpu, rd, NO_SP,
13092 		       aarch64_get_reg_u32 (cpu, ra, NO_SP)
13093 		       - aarch64_get_reg_u32 (cpu, rn, NO_SP)
13094 		       * aarch64_get_reg_u32 (cpu, rm, NO_SP));
13095 }
13096 
13097 /* 64 bit multiply and sub.  */
13098 static void
13099 msub64 (sim_cpu *cpu)
13100 {
13101   unsigned rm = INSTR (20, 16);
13102   unsigned ra = INSTR (14, 10);
13103   unsigned rn = INSTR (9, 5);
13104   unsigned rd = INSTR (4, 0);
13105 
13106   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13107   aarch64_set_reg_u64 (cpu, rd, NO_SP,
13108 		       aarch64_get_reg_u64 (cpu, ra, NO_SP)
13109 		       - aarch64_get_reg_u64 (cpu, rn, NO_SP)
13110 		       * aarch64_get_reg_u64 (cpu, rm, NO_SP));
13111 }
13112 
13113 /* Signed multiply add long -- source, source2 : 32 bit, source3 : 64 bit.  */
13114 static void
13115 smaddl (sim_cpu *cpu)
13116 {
13117   unsigned rm = INSTR (20, 16);
13118   unsigned ra = INSTR (14, 10);
13119   unsigned rn = INSTR (9, 5);
13120   unsigned rd = INSTR (4, 0);
13121 
13122   /* N.B. we need to multiply the signed 32 bit values in rn, rm to
13123      obtain a 64 bit product.  */
13124   aarch64_set_reg_s64
13125     (cpu, rd, NO_SP,
13126      aarch64_get_reg_s64 (cpu, ra, NO_SP)
13127      + ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
13128      * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP)));
13129 }
13130 
13131 /* Signed multiply sub long -- source, source2 : 32 bit, source3 : 64 bit.  */
13132 static void
13133 smsubl (sim_cpu *cpu)
13134 {
13135   unsigned rm = INSTR (20, 16);
13136   unsigned ra = INSTR (14, 10);
13137   unsigned rn = INSTR (9, 5);
13138   unsigned rd = INSTR (4, 0);
13139 
13140   /* N.B. we need to multiply the signed 32 bit values in rn, rm to
13141      obtain a 64 bit product.  */
13142   aarch64_set_reg_s64
13143     (cpu, rd, NO_SP,
13144      aarch64_get_reg_s64 (cpu, ra, NO_SP)
13145      - ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
13146      * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP)));
13147 }
13148 
13149 /* Integer Multiply/Divide.  */
13150 
13151 /* First some macros and a helper function.  */
13152 /* Macros to test or access elements of 64 bit words.  */
13153 
13154 /* Mask used to access lo 32 bits of 64 bit unsigned int.  */
13155 #define LOW_WORD_MASK ((1ULL << 32) - 1)
13156 /* Return the lo 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int.  */
13157 #define lowWordToU64(_value_u64) ((_value_u64) & LOW_WORD_MASK)
13158 /* Return the hi 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int.  */
13159 #define highWordToU64(_value_u64) ((_value_u64) >> 32)
13160 
13161 /* Offset of sign bit in 64 bit signed integer.  */
13162 #define SIGN_SHIFT_U64 63
13163 /* The sign bit itself -- also identifies the minimum negative int value.  */
13164 #define SIGN_BIT_U64 (1UL << SIGN_SHIFT_U64)
13165 /* Return true if a 64 bit signed int presented as an unsigned int is the
13166    most negative value.  */
13167 #define isMinimumU64(_value_u64) ((_value_u64) == SIGN_BIT_U64)
13168 /* Return true (non-zero) if a 64 bit signed int presented as an unsigned
13169    int has its sign bit set.  */
13170 #define isSignSetU64(_value_u64) ((_value_u64) & SIGN_BIT_U64)
13171 /* Return 1L or -1L according to whether a 64 bit signed int presented as
13172    an unsigned int has its sign bit clear or set.  */
13173 #define signOfU64(_value_u64) (1L + (((_value_u64) >> SIGN_SHIFT_U64) * -2L))
13174 /* Clear the sign bit of a 64 bit signed int presented as an unsigned int.  */
13175 #define clearSignU64(_value_u64) ((_value_u64) &= ~SIGN_BIT_U64)
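
/* Worked example for the helpers above (illustrative value): for
   value = 0x8000000000000001, lowWordToU64 gives 0x1, highWordToU64
   gives 0x80000000, isSignSetU64 is non-zero, and signOfU64 evaluates
   to 1 + (1 * -2) = -1.  */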
13176 
13177 /* Multiply two 64 bit unsigned ints and return
13178    the hi 64 bits of the 128 bit product.  */
13179 
13180 static uint64_t
13181 mul64hi (uint64_t value1, uint64_t value2)
13182 {
13183   uint64_t resultmid1;
13184   uint64_t result;
13185   uint64_t value1_lo = lowWordToU64 (value1);
13186   uint64_t value1_hi = highWordToU64 (value1);
13187   uint64_t value2_lo = lowWordToU64 (value2);
13188   uint64_t value2_hi = highWordToU64 (value2);
13189 
13190   /* Cross-multiply and collect results.  */
13191   uint64_t xproductlo = value1_lo * value2_lo;
13192   uint64_t xproductmid1 = value1_lo * value2_hi;
13193   uint64_t xproductmid2 = value1_hi * value2_lo;
13194   uint64_t xproducthi = value1_hi * value2_hi;
13195   uint64_t carry = 0;
13196   /* Start accumulating 64 bit results.  */
13197   /* Drop bottom half of lowest cross-product.  */
13198   uint64_t resultmid = xproductlo >> 32;
13199   /* Add in middle products.  */
13200   resultmid = resultmid + xproductmid1;
13201 
13202   /* Check for overflow.  */
13203   if (resultmid < xproductmid1)
13204     /* Carry over 1 into top cross-product.  */
13205     carry++;
13206 
13207   resultmid1  = resultmid + xproductmid2;
13208 
13209   /* Check for overflow.  */
13210   if (resultmid1 < xproductmid2)
13211     /* Carry over 1 into top cross-product.  */
13212     carry++;
13213 
13214   /* Drop lowest 32 bits of middle cross-product.  */
13215   result = resultmid1 >> 32;
13216   /* Move carry bit to just above middle cross-product highest bit.  */
13217   carry = carry << 32;
13218 
13219   /* Add in the top cross-product and any carry.  */
13220   result += xproducthi + carry;
13221 
13222   return result;
13223 }
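
/* A minimal cross-check sketch for mul64hi -- illustrative only, with
   mul64hi_agrees a hypothetical helper; it assumes a host compiler
   providing unsigned __int128 (GCC or Clang on a 64 bit host):

     static int
     mul64hi_agrees (uint64_t a, uint64_t b)
     {
       unsigned __int128 wide = (unsigned __int128) a * b;
       return mul64hi (a, b) == (uint64_t) (wide >> 64);
     }

   For a = b = 0xFFFFFFFFFFFFFFFF both computations give a high half of
   0xFFFFFFFFFFFFFFFE.  */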
13224 
13225 /* Signed multiply high, source, source2 :
13226    64 bit, dest <-- high 64-bit of result.  */
13227 static void
13228 smulh (sim_cpu *cpu)
13229 {
13230   uint64_t uresult;
13231   int64_t  result;
13232   unsigned rm = INSTR (20, 16);
13233   unsigned rn = INSTR (9, 5);
13234   unsigned rd = INSTR (4, 0);
13235   GReg     ra = INSTR (14, 10);
13236   int64_t  value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
13237   int64_t  value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
13238   uint64_t uvalue1;
13239   uint64_t uvalue2;
13240   int  negate = 0;
13241 
13242   if (ra != R31)
13243     HALT_UNALLOC;
13244 
13245   /* Convert to unsigned and use the unsigned mul64hi routine,
13246      then fix the sign up afterwards.  */
13247   if (value1 < 0)
13248     {
13249       negate = !negate;
13250       uvalue1 = -value1;
13251     }
13252   else
13253     {
13254       uvalue1 = value1;
13255     }
13256 
13257   if (value2 < 0)
13258     {
13259       negate = !negate;
13260       uvalue2 = -value2;
13261     }
13262   else
13263     {
13264       uvalue2 = value2;
13265     }
13266 
13267   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13268 
13269   uresult = mul64hi (uvalue1, uvalue2);
13270   result = uresult;
13271 
13272   if (negate)
13273     {
13274       /* Multiply 128-bit result by -1, which means highpart gets inverted,
13275 	 and 1 is carried in only if the low part is 0.  */
13276       result = ~result;
13277       if ((uvalue1 * uvalue2) == 0)
13278 	result += 1;
13279     }
13280 
13281   aarch64_set_reg_s64 (cpu, rd, NO_SP, result);
13282 }
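
/* Worked example for the sign fix-up above (illustrative): with
   value1 = -1 and value2 = 1 we get uvalue1 = uvalue2 = 1, so mul64hi
   returns 0.  The low 64 bits of the product are 1, i.e. non-zero, so
   no 1 is added and result = ~0 = -1, the high half of the 128 bit
   value -1, as required.  */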
13283 
13284 /* Unsigned multiply add long -- source, source2 :
13285    32 bit, source3 : 64 bit.  */
13286 static void
13287 umaddl (sim_cpu *cpu)
13288 {
13289   unsigned rm = INSTR (20, 16);
13290   unsigned ra = INSTR (14, 10);
13291   unsigned rn = INSTR (9, 5);
13292   unsigned rd = INSTR (4, 0);
13293 
13294   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13295   /* N.B. we need to multiply the unsigned 32 bit values in rn, rm to
13296      obtain a 64 bit product.  */
13297   aarch64_set_reg_u64
13298     (cpu, rd, NO_SP,
13299      aarch64_get_reg_u64 (cpu, ra, NO_SP)
13300      + ((uint64_t) aarch64_get_reg_u32 (cpu, rn, NO_SP))
13301      * ((uint64_t) aarch64_get_reg_u32 (cpu, rm, NO_SP)));
13302 }
13303 
13304 /* Unsigned multiply sub long -- source, source2 : 32 bit, source3 : 64 bit.  */
13305 static void
13306 umsubl (sim_cpu *cpu)
13307 {
13308   unsigned rm = INSTR (20, 16);
13309   unsigned ra = INSTR (14, 10);
13310   unsigned rn = INSTR (9, 5);
13311   unsigned rd = INSTR (4, 0);
13312 
13313   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13314   /* N.B. we need to multiply the unsigned 32 bit values in rn, rm to
13315      obtain a 64 bit product.  */
13316   aarch64_set_reg_u64
13317     (cpu, rd, NO_SP,
13318      aarch64_get_reg_u64 (cpu, ra, NO_SP)
13319      - ((uint64_t) aarch64_get_reg_u32 (cpu, rn, NO_SP))
13320      * ((uint64_t) aarch64_get_reg_u32 (cpu, rm, NO_SP)));
13321 }
13322 
13323 /* Unsigned multiply high, source, source2 :
13324    64 bit, dest <-- high 64-bit of result.  */
13325 static void
13326 umulh (sim_cpu *cpu)
13327 {
13328   unsigned rm = INSTR (20, 16);
13329   unsigned rn = INSTR (9, 5);
13330   unsigned rd = INSTR (4, 0);
13331   GReg     ra = INSTR (14, 10);
13332 
13333   if (ra != R31)
13334     HALT_UNALLOC;
13335 
13336   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13337   aarch64_set_reg_u64 (cpu, rd, NO_SP,
13338 		       mul64hi (aarch64_get_reg_u64 (cpu, rn, NO_SP),
13339 				aarch64_get_reg_u64 (cpu, rm, NO_SP)));
13340 }
13341 
13342 static void
13343 dexDataProc3Source (sim_cpu *cpu)
13344 {
13345   /* assert instr[28,24] == 11011.  */
13346   /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit (for rd at least)
13347      instr[30,29] = op54 : 00 ==> ok, ow ==> UNALLOC
13348      instr[23,21] = op31 : 111 ==> UNALLOC, ow ==> ok
13349      instr[15] = o0 : 0/1 ==> ok
13350      instr[23,21:15] ==> op : 0000 ==> MADD, 0001 ==> MSUB,     (32/64 bit)
13351                               0010 ==> SMADDL, 0011 ==> SMSUBL, (64 bit only)
13352                               0100 ==> SMULH,                   (64 bit only)
13353                               1010 ==> UMADDL, 1011 ==> UMSUBL, (64 bit only)
13354                               1100 ==> UMULH                    (64 bit only)
13355                               ow ==> UNALLOC.  */
13356 
13357   uint32_t dispatch;
13358   uint32_t size = INSTR (31, 31);
13359   uint32_t op54 = INSTR (30, 29);
13360   uint32_t op31 = INSTR (23, 21);
13361   uint32_t o0 = INSTR (15, 15);
13362 
13363   if (op54 != 0)
13364     HALT_UNALLOC;
13365 
13366   if (size == 0)
13367     {
13368       if (op31 != 0)
13369 	HALT_UNALLOC;
13370 
13371       if (o0 == 0)
13372 	madd32 (cpu);
13373       else
13374 	msub32 (cpu);
13375       return;
13376     }
13377 
13378   dispatch = (op31 << 1) | o0;
13379 
13380   switch (dispatch)
13381     {
13382     case 0:  madd64 (cpu); return;
13383     case 1:  msub64 (cpu); return;
13384     case 2:  smaddl (cpu); return;
13385     case 3:  smsubl (cpu); return;
13386     case 4:  smulh (cpu); return;
13387     case 10: umaddl (cpu); return;
13388     case 11: umsubl (cpu); return;
13389     case 12: umulh (cpu); return;
13390     default: HALT_UNALLOC;
13391     }
13392 }
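
/* Worked dispatch example (illustrative): UMULH has size == 1,
   op31 == 110 and o0 == 0, so dispatch is (6 << 1) | 0 = 12 and umulh
   is selected above.  */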
13393 
13394 static void
13395 dexDPReg (sim_cpu *cpu)
13396 {
13397   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
13398      assert  group == GROUP_DPREG_0101 || group == GROUP_DPREG_1101
13399      bits [28:24:21] of a DPReg are the secondary dispatch vector.  */
13400   uint32_t group2 = dispatchDPReg (aarch64_get_instr (cpu));
13401 
13402   switch (group2)
13403     {
13404     case DPREG_LOG_000:
13405     case DPREG_LOG_001:
13406       dexLogicalShiftedRegister (cpu); return;
13407 
13408     case DPREG_ADDSHF_010:
13409       dexAddSubtractShiftedRegister (cpu); return;
13410 
13411     case DPREG_ADDEXT_011:
13412       dexAddSubtractExtendedRegister (cpu); return;
13413 
13414     case DPREG_ADDCOND_100:
13415       {
13416 	/* This set bundles a variety of different operations.  */
13417 	/* Check for:  */
13418 	/* 1) add/sub w carry.  */
13419 	uint32_t mask1 = 0x1FE00000U;
13420 	uint32_t val1  = 0x1A000000U;
13421 	/* 2) cond compare register/immediate.  */
13422 	uint32_t mask2 = 0x1FE00000U;
13423 	uint32_t val2  = 0x1A400000U;
13424 	/* 3) cond select.  */
13425 	uint32_t mask3 = 0x1FE00000U;
13426 	uint32_t val3  = 0x1A800000U;
13427 	/* 4) data proc 1/2 source.  */
13428 	uint32_t mask4 = 0x1FE00000U;
13429 	uint32_t val4  = 0x1AC00000U;
13430 
13431 	if ((aarch64_get_instr (cpu) & mask1) == val1)
13432 	  dexAddSubtractWithCarry (cpu);
13433 
13434 	else if ((aarch64_get_instr (cpu) & mask2) == val2)
13435 	  CondCompare (cpu);
13436 
13437 	else if ((aarch64_get_instr (cpu) & mask3) == val3)
13438 	  dexCondSelect (cpu);
13439 
13440 	else if ((aarch64_get_instr (cpu) & mask4) == val4)
13441 	  {
13442 	    /* Bit 30 is clear for data proc 2 source
13443 	       and set for data proc 1 source.  */
13444 	    if (aarch64_get_instr (cpu)  & (1U << 30))
13445 	      dexDataProc1Source (cpu);
13446 	    else
13447 	      dexDataProc2Source (cpu);
13448 	  }
13449 
13450 	else
13451 	  /* Should not reach here.  */
13452 	  HALT_NYI;
13453 
13454 	return;
13455       }
13456 
13457     case DPREG_3SRC_110:
13458       dexDataProc3Source (cpu); return;
13459 
13460     case DPREG_UNALLOC_101:
13461       HALT_UNALLOC;
13462 
13463     case DPREG_3SRC_111:
13464       dexDataProc3Source (cpu); return;
13465 
13466     default:
13467       /* Should never reach here.  */
13468       HALT_NYI;
13469     }
13470 }
13471 
13472 /* Unconditional Branch immediate.
13473    Offset is a PC-relative byte offset in the range +/- 128MiB.
13474    The offset is assumed to be raw from the decode, i.e. the decode
13475    is expected to scale it from a word offset to a byte offset.  */
13476 
13477 /* Unconditional branch.  */
13478 static void
13479 buc (sim_cpu *cpu, int32_t offset)
13480 {
13481   aarch64_set_next_PC_by_offset (cpu, offset);
13482 }
13483 
13484 static unsigned stack_depth = 0;
13485 
13486 /* Unconditional branch and link -- writes return PC to LR.  */
13487 static void
13488 bl (sim_cpu *cpu, int32_t offset)
13489 {
13490   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13491   aarch64_save_LR (cpu);
13492   aarch64_set_next_PC_by_offset (cpu, offset);
13493 
13494   if (TRACE_BRANCH_P (cpu))
13495     {
13496       ++ stack_depth;
13497       TRACE_BRANCH (cpu,
13498 		    " %*scall %" PRIx64 " [%s]"
13499 		    " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
13500 		    stack_depth, " ", aarch64_get_next_PC (cpu),
13501 		    aarch64_get_func (CPU_STATE (cpu),
13502 				      aarch64_get_next_PC (cpu)),
13503 		    aarch64_get_reg_u64 (cpu, 0, NO_SP),
13504 		    aarch64_get_reg_u64 (cpu, 1, NO_SP),
13505 		    aarch64_get_reg_u64 (cpu, 2, NO_SP)
13506 		    );
13507     }
13508 }
13509 
13510 /* Unconditional Branch register.
13511    Branch/return address is in source register.  */
13512 
13513 /* Unconditional branch.  */
13514 static void
13515 br (sim_cpu *cpu)
13516 {
13517   unsigned rn = INSTR (9, 5);
13518   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13519   aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13520 }
13521 
13522 /* Unconditional branch and link -- writes return PC to LR.  */
13523 static void
13524 blr (sim_cpu *cpu)
13525 {
13526   unsigned rn = INSTR (9, 5);
13527 
13528   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13529   /* The pseudo code in the spec says we update LR before fetching
13530      the value from rn.  */
13531   aarch64_save_LR (cpu);
13532   aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13533 
13534   if (TRACE_BRANCH_P (cpu))
13535     {
13536       ++ stack_depth;
13537       TRACE_BRANCH (cpu,
13538 		    " %*scall %" PRIx64 " [%s]"
13539 		    " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
13540 		    stack_depth, " ", aarch64_get_next_PC (cpu),
13541 		    aarch64_get_func (CPU_STATE (cpu),
13542 				      aarch64_get_next_PC (cpu)),
13543 		    aarch64_get_reg_u64 (cpu, 0, NO_SP),
13544 		    aarch64_get_reg_u64 (cpu, 1, NO_SP),
13545 		    aarch64_get_reg_u64 (cpu, 2, NO_SP)
13546 		    );
13547     }
13548 }
13549 
13550 /* Return -- the assembler will default the source to LR; this is
13551    functionally equivalent to br but, presumably, unlike br it
13552    side effects the branch predictor.  */
13553 static void
13554 ret (sim_cpu *cpu)
13555 {
13556   unsigned rn = INSTR (9, 5);
13557   aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13558 
13559   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13560   if (TRACE_BRANCH_P (cpu))
13561     {
13562       TRACE_BRANCH (cpu,
13563 		    " %*sreturn [result: %" PRIx64 "]",
13564 		    stack_depth, " ", aarch64_get_reg_u64 (cpu, 0, NO_SP));
13565       -- stack_depth;
13566     }
13567 }
13568 
13569 /* NOP -- we implement this and call it from the decode in case we
13570    want to intercept it later.  */
13571 
13572 static void
13573 nop (sim_cpu *cpu)
13574 {
13575   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13576 }
13577 
13578 /* Data synchronization barrier.  */
13579 
13580 static void
13581 dsb (sim_cpu *cpu)
13582 {
13583   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13584 }
13585 
13586 /* Data memory barrier.  */
13587 
13588 static void
13589 dmb (sim_cpu *cpu)
13590 {
13591   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13592 }
13593 
13594 /* Instruction synchronization barrier.  */
13595 
13596 static void
13597 isb (sim_cpu *cpu)
13598 {
13599   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13600 }
13601 
13602 static void
13603 dexBranchImmediate (sim_cpu *cpu)
13604 {
13605   /* assert instr[30,26] == 00101
13606      instr[31] ==> 0 == B, 1 == BL
13607      instr[25,0] == imm26 branch offset counted in words.  */
13608 
13609   uint32_t top = INSTR (31, 31);
13610   /* We have a 26 bit signed word offset which we need to pass to the
13611      execute routine as a signed byte offset.  */
13612   int32_t offset = simm32 (aarch64_get_instr (cpu), 25, 0) << 2;
13613 
13614   if (top)
13615     bl (cpu, offset);
13616   else
13617     buc (cpu, offset);
13618 }
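
/* Worked example (illustrative): an imm26 field of all ones sign
   extends to -1 word, so offset = -1 << 2 = -4 bytes, a branch to the
   preceding instruction; the full signed 26 bit word range scales to
   the +/- 128MiB byte range noted above.  */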
13619 
13620 /* Control Flow.  */
13621 
13622 /* Conditional branch.
13623 
13624    Offset is a PC-relative byte offset in the range +/- 1MiB; pos is
13625    a bit position in the range 0 .. 63.
13626 
13627    cc is a CondCode enum value as pulled out of the decode.
13628 
13629    N.B. any offset register (source) can only be Xn or Wn.  */
13630 
13631 static void
13632 bcc (sim_cpu *cpu, int32_t offset, CondCode cc)
13633 {
13634   /* The test returns TRUE if CC is met.  */
13635   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13636   if (testConditionCode (cpu, cc))
13637     aarch64_set_next_PC_by_offset (cpu, offset);
13638 }
13639 
13640 /* 32 bit branch on register non-zero.  */
13641 static void
13642 cbnz32 (sim_cpu *cpu, int32_t offset)
13643 {
13644   unsigned rt = INSTR (4, 0);
13645 
13646   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13647   if (aarch64_get_reg_u32 (cpu, rt, NO_SP) != 0)
13648     aarch64_set_next_PC_by_offset (cpu, offset);
13649 }
13650 
13651 /* 64 bit branch on register non-zero.  */
13652 static void
13653 cbnz (sim_cpu *cpu, int32_t offset)
13654 {
13655   unsigned rt = INSTR (4, 0);
13656 
13657   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13658   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) != 0)
13659     aarch64_set_next_PC_by_offset (cpu, offset);
13660 }
13661 
13662 /* 32 bit branch on register zero.  */
13663 static void
13664 cbz32 (sim_cpu *cpu, int32_t offset)
13665 {
13666   unsigned rt = INSTR (4, 0);
13667 
13668   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13669   if (aarch64_get_reg_u32 (cpu, rt, NO_SP) == 0)
13670     aarch64_set_next_PC_by_offset (cpu, offset);
13671 }
13672 
13673 /* 64 bit branch on register zero.  */
13674 static void
13675 cbz (sim_cpu *cpu, int32_t offset)
13676 {
13677   unsigned rt = INSTR (4, 0);
13678 
13679   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13680   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) == 0)
13681     aarch64_set_next_PC_by_offset (cpu, offset);
13682 }
13683 
13684 /* Branch on register bit test non-zero -- one size fits all.  */
13685 static void
13686 tbnz (sim_cpu *cpu, uint32_t  pos, int32_t offset)
13687 {
13688   unsigned rt = INSTR (4, 0);
13689 
13690   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13691   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos))
13692     aarch64_set_next_PC_by_offset (cpu, offset);
13693 }
13694 
13695 /* Branch on register bit test zero -- one size fits all.  */
13696 static void
13697 tbz (sim_cpu *cpu, uint32_t  pos, int32_t offset)
13698 {
13699   unsigned rt = INSTR (4, 0);
13700 
13701   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13702   if (!(aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos)))
13703     aarch64_set_next_PC_by_offset (cpu, offset);
13704 }
13705 
13706 static void
13707 dexCompareBranchImmediate (sim_cpu *cpu)
13708 {
13709   /* instr[30,25] = 01 1010
13710      instr[31]    = size : 0 ==> 32, 1 ==> 64
13711      instr[24]    = op : 0 ==> CBZ, 1 ==> CBNZ
13712      instr[23,5]  = simm19 branch offset counted in words
13713      instr[4,0]   = rt  */
13714 
13715   uint32_t size = INSTR (31, 31);
13716   uint32_t op   = INSTR (24, 24);
13717   int32_t offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
13718 
13719   if (size == 0)
13720     {
13721       if (op == 0)
13722 	cbz32 (cpu, offset);
13723       else
13724 	cbnz32 (cpu, offset);
13725     }
13726   else
13727     {
13728       if (op == 0)
13729 	cbz (cpu, offset);
13730       else
13731 	cbnz (cpu, offset);
13732     }
13733 }
13734 
13735 static void
13736 dexTestBranchImmediate (sim_cpu *cpu)
13737 {
13738   /* instr[31]    = b5 : bit 5 of test bit idx
13739      instr[30,25] = 01 1011
13740      instr[24]    = op : 0 ==> TBZ, 1 ==> TBNZ
13741      instr[23,19] = b40 : bits 4 to 0 of test bit idx
13742      instr[18,5]  = simm14 : signed offset counted in words
13743      instr[4,0]   = rt  */
13744 
13745   uint32_t pos = ((INSTR (31, 31) << 5) | INSTR (23, 19));
13746   int32_t offset = simm32 (aarch64_get_instr (cpu), 18, 5) << 2;
13747 
13748   NYI_assert (30, 25, 0x1b);
13749 
13750   if (INSTR (24, 24) == 0)
13751     tbz (cpu, pos, offset);
13752   else
13753     tbnz (cpu, pos, offset);
13754 }
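
/* Worked example (illustrative): b5 == 1 with b40 == 00010 gives
   pos = (1 << 5) | 2 = 34, so bit 34 of Xt is tested; with b5 == 0 the
   bit index stays below 32 and the test fits in a W register.  */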
13755 
13756 static void
13757 dexCondBranchImmediate (sim_cpu *cpu)
13758 {
13759   /* instr[31,25] = 010 1010
13760      instr[24]    = op1 : op1:op0 == 00 ==> B.cond, ow ==> UNALLOC
13761      instr[23,5]  = simm19 : signed offset counted in words
13762      instr[4]     = op0
13763      instr[3,0]   = cond  */
13764 
13765   int32_t offset;
13766   uint32_t op = ((INSTR (24, 24) << 1) | INSTR (4, 4));
13767 
13768   NYI_assert (31, 25, 0x2a);
13769 
13770   if (op != 0)
13771     HALT_UNALLOC;
13772 
13773   offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
13774 
13775   bcc (cpu, offset, INSTR (3, 0));
13776 }
13777 
13778 static void
13779 dexBranchRegister (sim_cpu *cpu)
13780 {
13781   /* instr[31,25] = 110 1011
13782      instr[24,21] = op : 0 ==> BR, 1 ==> BLR, 2 ==> RET, 4 ==> ERET, 5 ==> DRPS
13783      instr[20,16] = op2 : must be 11111
13784      instr[15,10] = op3 : must be 000000
13785      instr[4,0]   = op4 : must be 11111.  */
13786 
13787   uint32_t op = INSTR (24, 21);
13788   uint32_t op2 = INSTR (20, 16);
13789   uint32_t op3 = INSTR (15, 10);
13790   uint32_t op4 = INSTR (4, 0);
13791 
13792   NYI_assert (31, 25, 0x6b);
13793 
13794   if (op2 != 0x1F || op3 != 0 || op4 != 0)
13795     HALT_UNALLOC;
13796 
13797   if (op == 0)
13798     br (cpu);
13799 
13800   else if (op == 1)
13801     blr (cpu);
13802 
13803   else if (op == 2)
13804     ret (cpu);
13805 
13806   else
13807     {
13808       /* ERET and DRPS accept 0b11111 for rn = instr [4,0].  */
13809       /* anything else is unallocated.  */
13810       uint32_t rn = INSTR (4, 0);
13811 
13812       if (rn != 0x1f)
13813 	HALT_UNALLOC;
13814 
13815       if (op == 4 || op == 5)
13816 	HALT_NYI;
13817 
13818       HALT_UNALLOC;
13819     }
13820 }
13821 
13822 /* FIXME: We should get the Angel SWI values from ../../libgloss/aarch64/svc.h
13823    but this may not be available.  So instead we define the values we need
13824    here.  */
13825 #define AngelSVC_Reason_Open		0x01
13826 #define AngelSVC_Reason_Close		0x02
13827 #define AngelSVC_Reason_Write		0x05
13828 #define AngelSVC_Reason_Read		0x06
13829 #define AngelSVC_Reason_IsTTY		0x09
13830 #define AngelSVC_Reason_Seek		0x0A
13831 #define AngelSVC_Reason_FLen		0x0C
13832 #define AngelSVC_Reason_Remove		0x0E
13833 #define AngelSVC_Reason_Rename		0x0F
13834 #define AngelSVC_Reason_Clock		0x10
13835 #define AngelSVC_Reason_Time		0x11
13836 #define AngelSVC_Reason_System		0x12
13837 #define AngelSVC_Reason_Errno		0x13
13838 #define AngelSVC_Reason_GetCmdLine	0x15
13839 #define AngelSVC_Reason_HeapInfo	0x16
13840 #define AngelSVC_Reason_ReportException 0x18
13841 #define AngelSVC_Reason_Elapsed         0x30
13842 
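/* Calling convention sketch, for reference (this is what handle_halt
   below decodes): the target executes HLT #0xf000 with an AngelSVC
   reason code in W0 and, for most reasons, a parameter block pointer
   in X1.  For AngelSVC_Reason_Write the block holds three 64 bit words
   -- file descriptor, buffer address and byte count -- and the result
   is returned in X0.  */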
13843 
13844 static void
13845 handle_halt (sim_cpu *cpu, uint32_t val)
13846 {
13847   uint64_t result = 0;
13848 
13849   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13850   if (val != 0xf000)
13851     {
13852       TRACE_SYSCALL (cpu, " HLT [0x%x]", val);
13853       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13854 		       sim_stopped, SIM_SIGTRAP);
13855     }
13856 
13857   /* We have encountered an Angel SVC call.  See if we can process it.  */
13858   switch (aarch64_get_reg_u32 (cpu, 0, NO_SP))
13859     {
13860     case AngelSVC_Reason_HeapInfo:
13861       {
13862 	/* Get the values.  */
13863 	uint64_t stack_top = aarch64_get_stack_start (cpu);
13864 	uint64_t heap_base = aarch64_get_heap_start (cpu);
13865 
13866 	/* Get the pointer  */
13867 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13868 	ptr = aarch64_get_mem_u64 (cpu, ptr);
13869 
13870 	/* Fill in the memory block.  */
13871 	/* Start addr of heap.  */
13872 	aarch64_set_mem_u64 (cpu, ptr +  0, heap_base);
13873 	/* End addr of heap.  */
13874 	aarch64_set_mem_u64 (cpu, ptr +  8, stack_top);
13875 	/* Lowest stack addr.  */
13876 	aarch64_set_mem_u64 (cpu, ptr + 16, heap_base);
13877 	/* Initial stack addr.  */
13878 	aarch64_set_mem_u64 (cpu, ptr + 24, stack_top);
13879 
13880 	TRACE_SYSCALL (cpu, " AngelSVC: Get Heap Info");
13881       }
13882       break;
13883 
13884     case AngelSVC_Reason_Open:
13885       {
13886 	/* Get the pointer  */
13887 	/* uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);  */
13888 	/* FIXME: For now we just assume that we will only be asked
13889 	   to open the standard file descriptors.  */
13890 	static int fd = 0;
13891 	result = fd ++;
13892 
13893 	TRACE_SYSCALL (cpu, " AngelSVC: Open file %d", fd - 1);
13894       }
13895       break;
13896 
13897     case AngelSVC_Reason_Close:
13898       {
13899 	uint64_t fh = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13900 	TRACE_SYSCALL (cpu, " AngelSVC: Close file %d", (int) fh);
13901 	result = 0;
13902       }
13903       break;
13904 
13905     case AngelSVC_Reason_Errno:
13906       result = 0;
13907       TRACE_SYSCALL (cpu, " AngelSVC: Get Errno");
13908       break;
13909 
13910     case AngelSVC_Reason_Clock:
13911       result =
13912 #ifdef CLOCKS_PER_SEC
13913 	(CLOCKS_PER_SEC >= 100)
13914 	? (clock () / (CLOCKS_PER_SEC / 100))
13915 	: ((clock () * 100) / CLOCKS_PER_SEC)
13916 #else
13917 	/* Presume unix... clock() returns microseconds.  */
13918 	(clock () / 10000)
13919 #endif
13920 	;
13921       TRACE_SYSCALL (cpu, " AngelSVC: Get Clock");
13922       break;
13923 
13924     case AngelSVC_Reason_GetCmdLine:
13925       {
13926 	/* Get the pointer  */
13927 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13928 	ptr = aarch64_get_mem_u64 (cpu, ptr);
13929 
13930 	/* FIXME: No command line for now.  */
13931 	aarch64_set_mem_u64 (cpu, ptr, 0);
13932 	TRACE_SYSCALL (cpu, " AngelSVC: Get Command Line");
13933       }
13934       break;
13935 
13936     case AngelSVC_Reason_IsTTY:
13937       result = 1;
13938       TRACE_SYSCALL (cpu, " AngelSVC: IsTTY ?");
13939       break;
13940 
13941     case AngelSVC_Reason_Write:
13942       {
13943 	/* Get the pointer  */
13944 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13945 	/* Get the write control block.  */
13946 	uint64_t fd  = aarch64_get_mem_u64 (cpu, ptr);
13947 	uint64_t buf = aarch64_get_mem_u64 (cpu, ptr + 8);
13948 	uint64_t len = aarch64_get_mem_u64 (cpu, ptr + 16);
13949 
13950 	TRACE_SYSCALL (cpu, "write of %" PRIx64 " bytes from %"
13951 		       PRIx64 " on descriptor %" PRIx64,
13952 		       len, buf, fd);
13953 
13954 	if (len > 1280)
13955 	  {
13956 	    TRACE_SYSCALL (cpu,
13957 			   " AngelSVC: Write: Suspiciously long write: %ld",
13958 			   (long) len);
13959 	    sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13960 			     sim_stopped, SIM_SIGBUS);
13961 	  }
13962 	else if (fd == 1)
13963 	  {
13964 	    printf ("%.*s", (int) len, aarch64_get_mem_ptr (cpu, buf));
13965 	  }
13966 	else if (fd == 2)
13967 	  {
13968 	    TRACE (cpu, 0, "\n");
13969 	    sim_io_eprintf (CPU_STATE (cpu), "%.*s",
13970 			    (int) len, aarch64_get_mem_ptr (cpu, buf));
13971 	    TRACE (cpu, 0, "\n");
13972 	  }
13973 	else
13974 	  {
13975 	    TRACE_SYSCALL (cpu,
13976 			   " AngelSVC: Write: Unexpected file handle: %d",
13977 			   (int) fd);
13978 	    sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13979 			     sim_stopped, SIM_SIGABRT);
13980 	  }
13981       }
13982       break;
13983 
13984     case AngelSVC_Reason_ReportException:
13985       {
13986 	/* Get the pointer  */
13987 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13988 	/* ptr = aarch64_get_mem_u64 (cpu, ptr);  */
13989 	uint64_t type = aarch64_get_mem_u64 (cpu, ptr);
13990 	uint64_t state = aarch64_get_mem_u64 (cpu, ptr + 8);
13991 
13992 	TRACE_SYSCALL (cpu,
13993 		       "Angel Exception: type 0x%" PRIx64 " state %" PRIx64,
13994 		       type, state);
13995 
13996 	if (type == 0x20026)
13997 	  sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13998 			   sim_exited, state);
13999 	else
14000 	  sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
14001 			   sim_stopped, SIM_SIGINT);
14002       }
14003       break;
14004 
14005     case AngelSVC_Reason_Read:
14006     case AngelSVC_Reason_FLen:
14007     case AngelSVC_Reason_Seek:
14008     case AngelSVC_Reason_Remove:
14009     case AngelSVC_Reason_Time:
14010     case AngelSVC_Reason_System:
14011     case AngelSVC_Reason_Rename:
14012     case AngelSVC_Reason_Elapsed:
14013     default:
14014       TRACE_SYSCALL (cpu, " HLT [Unknown angel %x]",
14015 		     aarch64_get_reg_u32 (cpu, 0, NO_SP));
14016       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
14017 		       sim_stopped, SIM_SIGTRAP);
14018     }
14019 
14020   aarch64_set_reg_u64 (cpu, 0, NO_SP, result);
14021 }
14022 
14023 static void
14024 dexExcpnGen (sim_cpu *cpu)
14025 {
14026   /* instr[31:24] = 11010100
14027      instr[23,21] = opc : 000 ==> GEN EXCPN, 001 ==> BRK
14028                           010 ==> HLT,       101 ==> DBG GEN EXCPN
14029      instr[20,5]  = imm16
14030      instr[4,2]   = opc2 000 ==> OK, ow ==> UNALLOC
14031      instr[1,0]   = LL : discriminates opc  */
14032 
14033   uint32_t opc = INSTR (23, 21);
14034   uint32_t imm16 = INSTR (20, 5);
14035   uint32_t opc2 = INSTR (4, 2);
14036   uint32_t LL;
14037 
14038   NYI_assert (31, 24, 0xd4);
14039 
14040   if (opc2 != 0)
14041     HALT_UNALLOC;
14042 
14043   LL = INSTR (1, 0);
14044 
14045   /* We only implement HLT and BRK for now.  */
14046   if (opc == 1 && LL == 0)
14047     {
14048       TRACE_EVENTS (cpu, " BRK [0x%x]", imm16);
14049       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
14050 		       sim_exited, aarch64_get_reg_s32 (cpu, R0, SP_OK));
14051     }
14052 
14053   if (opc == 2 && LL == 0)
14054     handle_halt (cpu, imm16);
14055 
14056   else if (opc == 0 || opc == 5)
14057     HALT_NYI;
14058 
14059   else
14060     HALT_UNALLOC;
14061 }
14062 
14063 /* Stub for accessing system registers.  */
14064 
14065 static uint64_t
14066 system_get (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
14067 	    unsigned crm, unsigned op2)
14068 {
14069   if (crn == 0 && op1 == 3 && crm == 0 && op2 == 7)
14070     /* DCZID_EL0 - the Data Cache Zero ID register.
14071        We do not support DC ZVA at the moment, so
14072        we return a value with the disable bit set.
14073        We implement support for the DCZID register since
14074        it is used by the C library's memset function.  */
14075     return ((uint64_t) 1) << 4;
14076 
14077   if (crn == 0 && op1 == 3 && crm == 0 && op2 == 1)
14078     /* Cache Type Register.  */
14079     return 0x80008000UL;
14080 
14081   if (crn == 13 && op1 == 3 && crm == 0 && op2 == 2)
14082     /* TPIDR_EL0 - thread pointer id.  */
14083     return aarch64_get_thread_id (cpu);
14084 
14085   if (op1 == 3 && crm == 4 && op2 == 0)
14086     return aarch64_get_FPCR (cpu);
14087 
14088   if (op1 == 3 && crm == 4 && op2 == 1)
14089     return aarch64_get_FPSR (cpu);
14090 
14091   else if (op1 == 3 && crm == 2 && op2 == 0)
14092     return aarch64_get_CPSR (cpu);
14093 
14094   HALT_NYI;
14095 }
14096 
14097 static void
14098 system_set (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
14099 	    unsigned crm, unsigned op2, uint64_t val)
14100 {
14101   if (op1 == 3 && crm == 4 && op2 == 0)
14102     aarch64_set_FPCR (cpu, val);
14103 
14104   else if (op1 == 3 && crm == 4 && op2 == 1)
14105     aarch64_set_FPSR (cpu, val);
14106 
14107   else if (op1 == 3 && crm == 2 && op2 == 0)
14108     aarch64_set_CPSR (cpu, val);
14109 
14110   else
14111     HALT_NYI;
14112 }
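
/* Encoding example (illustrative): "mrs x0, fpcr" has op1 == 011,
   CRm == 0100 and op2 == 000, so system_get above returns the FPCR,
   and the matching "msr fpcr, x0" lands in the first arm of
   system_set.  */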
14113 
14114 static void
14115 do_mrs (sim_cpu *cpu)
14116 {
14117   /* instr[31:20] = 1101 0101 0001 1
14118      instr[19]    = op0
14119      instr[18,16] = op1
14120      instr[15,12] = CRn
14121      instr[11,8]  = CRm
14122      instr[7,5]   = op2
14123      instr[4,0]   = Rt  */
14124   unsigned sys_op0 = INSTR (19, 19) + 2;
14125   unsigned sys_op1 = INSTR (18, 16);
14126   unsigned sys_crn = INSTR (15, 12);
14127   unsigned sys_crm = INSTR (11, 8);
14128   unsigned sys_op2 = INSTR (7, 5);
14129   unsigned rt = INSTR (4, 0);
14130 
14131   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
14132   aarch64_set_reg_u64 (cpu, rt, NO_SP,
14133 		       system_get (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2));
14134 }
14135 
14136 static void
14137 do_MSR_immediate (sim_cpu *cpu)
14138 {
14139   /* instr[31:19] = 1101 0101 0000 0
14140      instr[18,16] = op1
14141      instr[15,12] = 0100
14142      instr[11,8]  = CRm
14143      instr[7,5]   = op2
14144      instr[4,0]   = 1 1111  */
14145 
14146   unsigned op1 = INSTR (18, 16);
14147   /*unsigned crm = INSTR (11, 8);*/
14148   unsigned op2 = INSTR (7, 5);
14149 
14150   NYI_assert (31, 19, 0x1AA0);
14151   NYI_assert (15, 12, 0x4);
14152   NYI_assert (4,  0,  0x1F);
14153 
14154   if (op1 == 0)
14155     {
14156       if (op2 == 5)
14157 	HALT_NYI; /* set SPSel.  */
14158       else
14159 	HALT_UNALLOC;
14160     }
14161   else if (op1 == 3)
14162     {
14163       if (op2 == 6)
14164 	HALT_NYI; /* set DAIFset.  */
14165       else if (op2 == 7)
14166 	HALT_NYI; /* set DAIFclr.  */
14167       else
14168 	HALT_UNALLOC;
14169     }
14170   else
14171     HALT_UNALLOC;
14172 }
14173 
14174 static void
14175 do_MSR_reg (sim_cpu *cpu)
14176 {
14177   /* instr[31:20] = 1101 0101 0001
14178      instr[19]    = op0
14179      instr[18,16] = op1
14180      instr[15,12] = CRn
14181      instr[11,8]  = CRm
14182      instr[7,5]   = op2
14183      instr[4,0]   = Rt  */
14184 
14185   unsigned sys_op0 = INSTR (19, 19) + 2;
14186   unsigned sys_op1 = INSTR (18, 16);
14187   unsigned sys_crn = INSTR (15, 12);
14188   unsigned sys_crm = INSTR (11, 8);
14189   unsigned sys_op2 = INSTR (7, 5);
14190   unsigned rt = INSTR (4, 0);
14191 
14192   NYI_assert (31, 20, 0xD51);
14193 
14194   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
14195   system_set (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2,
14196 	      aarch64_get_reg_u64 (cpu, rt, NO_SP));
14197 }
14198 
14199 static void
14200 do_SYS (sim_cpu *cpu)
14201 {
14202   /* instr[31,19] = 1101 0101 0000 1
14203      instr[18,16] = op1
14204      instr[15,12] = CRn
14205      instr[11,8]  = CRm
14206      instr[7,5]   = op2
14207      instr[4,0]   = Rt  */
14208   NYI_assert (31, 19, 0x1AA1);
14209 
14210   /* FIXME: For now we just silently accept system ops.  */
14211 }
14212 
14213 static void
14214 dexSystem (sim_cpu *cpu)
14215 {
14216   /* instr[31:22] = 1101 01010 0
14217      instr[21]    = L
14218      instr[20,19] = op0
14219      instr[18,16] = op1
14220      instr[15,12] = CRn
14221      instr[11,8]  = CRm
14222      instr[7,5]   = op2
14223      instr[4,0]   = uimm5  */
14224 
14225   /* We are interested in HINT, DSB, DMB and ISB.
14226 
14227      Hint #0 encodes NOOP (this is the only hint we care about):
14228      L == 0, op0 == 0, op1 = 011, CRn = 0010, Rt = 11111, CRm == 0000,
14229      op2 == 000.  The decode below also executes any hint with CRm !=
14230      0000, or with op2 == 000 or op2 > 101, as a NOP; other hints halt.
14231 
14232      DSB, DMB, ISB are data synchronization barrier, data memory barrier
14233      and instruction synchronization barrier, respectively, where
14234      L == 0, op0 == 0, op1 = 011, CRn = 0011, Rt = 11111,
14235      op2 : DSB ==> 100, DMB ==> 101, ISB ==> 110
14236      CRm<3:2> ==> domain, CRm<1:0> ==> types,
14237      domain : 00 ==> OuterShareable, 01 ==> Nonshareable,
14238               10 ==> InnerShareable, 11 ==> FullSystem
14239      types :  01 ==> Reads, 10 ==> Writes,
14240               11 ==> All, 00 ==> All (domain == FullSystem).  */
14241 
14242   unsigned rt = INSTR (4, 0);
14243 
14244   NYI_assert (31, 22, 0x354);
14245 
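  /* Worked example (illustrative): INSTR (21, 12) packs L, op0, op1
     and CRn.  For "dsb sy" these are 0, 00, 011 and 0011, i.e. 0x033,
     the barrier case below.  */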
14246   switch (INSTR (21, 12))
14247     {
14248     case 0x032:
14249       if (rt == 0x1F)
14250 	{
14251 	  /* NOP has CRm != 0000 OR.  */
14252 	  /*         (CRm == 0000 AND (op2 == 000 OR op2 > 101)).  */
14253 	  uint32_t crm = INSTR (11, 8);
14254 	  uint32_t op2 = INSTR (7, 5);
14255 
14256 	  if (crm != 0 || (op2 == 0 || op2 > 5))
14257 	    {
14258 	      /* Actually call nop method so we can reimplement it later.  */
14259 	      nop (cpu);
14260 	      return;
14261 	    }
14262 	}
14263       HALT_NYI;
14264 
14265     case 0x033:
14266       {
14267 	uint32_t op2 =  INSTR (7, 5);
14268 
14269 	switch (op2)
14270 	  {
14271 	  case 2: HALT_NYI;
14272 	  case 4: dsb (cpu); return;
14273 	  case 5: dmb (cpu); return;
14274 	  case 6: isb (cpu); return;
14275 	  default: HALT_UNALLOC;
14276 	  }
14277       }
14278 
14279     case 0x3B0:
14280     case 0x3B4:
14281     case 0x3BD:
14282       do_mrs (cpu);
14283       return;
14284 
14285     case 0x0B7:
14286       do_SYS (cpu); /* DC is an alias of SYS.  */
14287       return;
14288 
14289     default:
14290       if (INSTR (21, 20) == 0x1)
14291 	do_MSR_reg (cpu);
14292       else if (INSTR (21, 19) == 0 && INSTR (15, 12) == 0x4)
14293 	do_MSR_immediate (cpu);
14294       else
14295 	HALT_NYI;
14296       return;
14297     }
14298 }
14299 
14300 static void
14301 dexBr (sim_cpu *cpu)
14302 {
14303   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
14304      assert  group == GROUP_BREXSYS_1010 || group == GROUP_BREXSYS_1011
14305      bits [31,29] of a BrExSys are the secondary dispatch vector.  */
14306   uint32_t group2 = dispatchBrExSys (aarch64_get_instr (cpu));
14307 
14308   switch (group2)
14309     {
14310     case BR_IMM_000:
14311       return dexBranchImmediate (cpu);
14312 
14313     case BR_IMMCMP_001:
14314       /* Compare has bit 25 clear while test has it set.  */
14315       if (!INSTR (25, 25))
14316 	dexCompareBranchImmediate (cpu);
14317       else
14318 	dexTestBranchImmediate (cpu);
14319       return;
14320 
14321     case BR_IMMCOND_010:
14322       /* This is a conditional branch if bit 25 is clear otherwise
14323          unallocated.  */
14324       if (!INSTR (25, 25))
14325 	dexCondBranchImmediate (cpu);
14326       else
14327 	HALT_UNALLOC;
14328       return;
14329 
14330     case BR_UNALLOC_011:
14331       HALT_UNALLOC;
14332 
14333     case BR_IMM_100:
14334       dexBranchImmediate (cpu);
14335       return;
14336 
14337     case BR_IMMCMP_101:
14338       /* Compare has bit 25 clear while test has it set.  */
14339       if (!INSTR (25, 25))
14340 	dexCompareBranchImmediate (cpu);
14341       else
14342 	dexTestBranchImmediate (cpu);
14343       return;
14344 
14345     case BR_REG_110:
14346       /* Unconditional branch reg has bit 25 set.  */
14347       if (INSTR (25, 25))
14348 	dexBranchRegister (cpu);
14349 
14350       /* This includes Excpn Gen, System and unalloc operations.
14351          We need to decode the Excpn Gen operation BRK so we can plant
14352          debugger entry points.
14353          Excpn Gen operations have instr [24] = 0.
14354          We also need to decode at least one of the System operations,
14355          NOP, which is an alias for HINT #0.
14356          System operations have instr [24,22] = 100.  */
14357       else if (INSTR (24, 24) == 0)
14358 	dexExcpnGen (cpu);
14359 
14360       else if (INSTR (24, 22) == 4)
14361 	dexSystem (cpu);
14362 
14363       else
14364 	HALT_UNALLOC;
14365 
14366       return;
14367 
14368     case BR_UNALLOC_111:
14369       HALT_UNALLOC;
14370 
14371     default:
14372       /* Should never reach here.  */
14373       HALT_NYI;
14374     }
14375 }
14376 
14377 static void
14378 aarch64_decode_and_execute (sim_cpu *cpu, uint64_t pc)
14379 {
14380   /* We need to check if gdb wants to break in here.  */
14381   /* checkBreak (cpu);  */
14382 
14383   uint64_t group = dispatchGroup (aarch64_get_instr (cpu));
14384 
14385   switch (group)
14386     {
14387     case GROUP_PSEUDO_0000:   dexPseudo (cpu); break;
14388     case GROUP_LDST_0100:     dexLdSt (cpu); break;
14389     case GROUP_DPREG_0101:    dexDPReg (cpu); break;
14390     case GROUP_LDST_0110:     dexLdSt (cpu); break;
14391     case GROUP_ADVSIMD_0111:  dexAdvSIMD0 (cpu); break;
14392     case GROUP_DPIMM_1000:    dexDPImm (cpu); break;
14393     case GROUP_DPIMM_1001:    dexDPImm (cpu); break;
14394     case GROUP_BREXSYS_1010:  dexBr (cpu); break;
14395     case GROUP_BREXSYS_1011:  dexBr (cpu); break;
14396     case GROUP_LDST_1100:     dexLdSt (cpu); break;
14397     case GROUP_DPREG_1101:    dexDPReg (cpu); break;
14398     case GROUP_LDST_1110:     dexLdSt (cpu); break;
14399     case GROUP_ADVSIMD_1111:  dexAdvSIMD1 (cpu); break;
14400 
14401     case GROUP_UNALLOC_0001:
14402     case GROUP_UNALLOC_0010:
14403     case GROUP_UNALLOC_0011:
14404       HALT_UNALLOC;
14405 
14406     default:
14407       /* Should never reach here.  */
14408       HALT_NYI;
14409     }
14410 }
14411 
14412 static bfd_boolean
14413 aarch64_step (sim_cpu *cpu)
14414 {
14415   uint64_t pc = aarch64_get_PC (cpu);
14416 
14417   if (pc == TOP_LEVEL_RETURN_PC)
14418     return FALSE;
14419 
14420   aarch64_set_next_PC (cpu, pc + 4);
14421 
14422   /* Code is always little-endian.  */
14423   sim_core_read_buffer (CPU_STATE (cpu), cpu, read_map,
14424 			& aarch64_get_instr (cpu), pc, 4);
14425   aarch64_get_instr (cpu) = endian_le2h_4 (aarch64_get_instr (cpu));
14426 
14427   TRACE_INSN (cpu, " pc = %" PRIx64 " instr = %08x", pc,
14428 	      aarch64_get_instr (cpu));
14429   TRACE_DISASM (cpu, pc);
14430 
14431   aarch64_decode_and_execute (cpu, pc);
14432 
14433   return TRUE;
14434 }
14435 
14436 void
14437 aarch64_run (SIM_DESC sd)
14438 {
14439   sim_cpu *cpu = STATE_CPU (sd, 0);
14440 
14441   while (aarch64_step (cpu))
14442     {
14443       aarch64_update_PC (cpu);
14444 
14445       if (sim_events_tick (sd))
14446 	sim_events_process (sd);
14447     }
14448 
14449   sim_engine_halt (sd, cpu, NULL, aarch64_get_PC (cpu),
14450 		   sim_exited, aarch64_get_reg_s32 (cpu, R0, NO_SP));
14451 }
14452 
14453 void
14454 aarch64_init (sim_cpu *cpu, uint64_t pc)
14455 {
14456   uint64_t sp = aarch64_get_stack_start (cpu);
14457 
14458   /* Install SP, FP and PC and set LR to -20
14459      so we can detect a top-level return.  */
14460   aarch64_set_reg_u64 (cpu, SP, SP_OK, sp);
14461   aarch64_set_reg_u64 (cpu, FP, SP_OK, sp);
14462   aarch64_set_reg_u64 (cpu, LR, SP_OK, TOP_LEVEL_RETURN_PC);
14463   aarch64_set_next_PC (cpu, pc);
14464   aarch64_update_PC (cpu);
14465   aarch64_init_LIT_table ();
14466 }
14467