/* simulator.c -- Interface for the AArch64 simulator.

   Copyright (C) 2015-2016 Free Software Foundation, Inc.

   Contributed by Red Hat.

   This file is part of GDB.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

#include "config.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <math.h>
#include <time.h>
#include <limits.h>

#include "simulator.h"
#include "cpustate.h"
#include "memory.h"

#define NO_SP 0
#define SP_OK 1

#define TST(_flag)   (aarch64_test_CPSR_bit (cpu, _flag))
#define IS_SET(_X)   (TST (( _X )) ? 1 : 0)
#define IS_CLEAR(_X) (TST (( _X )) ? 0 : 1)

/* Space saver macro.  */
#define INSTR(HIGH, LOW) uimm (aarch64_get_instr (cpu), (HIGH), (LOW))

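/* For example, INSTR (9, 5) extracts the bit field that usually holds
   the Rn register number of the current instruction, and INSTR (4, 0)
   the Rt/Rd field.  */
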
#define HALT_UNALLOC							\
  do									\
    {									\
      TRACE_DISASM (cpu, aarch64_get_PC (cpu));				\
      TRACE_INSN (cpu,							\
		  "Unallocated instruction detected at sim line %d,"	\
		  " exe addr %" PRIx64,					\
		  __LINE__, aarch64_get_PC (cpu));			\
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
		       sim_stopped, SIM_SIGILL);			\
    }									\
  while (0)

#define HALT_NYI							\
  do									\
    {									\
      TRACE_DISASM (cpu, aarch64_get_PC (cpu));				\
      TRACE_INSN (cpu,							\
		  "Unimplemented instruction detected at sim line %d,"	\
		  " exe addr %" PRIx64,					\
		  __LINE__, aarch64_get_PC (cpu));			\
      if (! TRACE_ANY_P (cpu))						\
	{								\
	  sim_io_eprintf (CPU_STATE (cpu), "SIM Error: Unimplemented instruction: "); \
	  trace_disasm (CPU_STATE (cpu), cpu, aarch64_get_PC (cpu));	\
	}								\
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
		       sim_stopped, SIM_SIGABRT);			\
    }									\
  while (0)

#define NYI_assert(HI, LO, EXPECTED)					\
  do									\
    {									\
      if (INSTR ((HI), (LO)) != (EXPECTED))				\
	HALT_NYI;							\
    }									\
  while (0)

/* Helper functions used by expandLogicalImmediate.  */

/* For i = 1, ..., N set result<i-1> to 1; all other bits are zero.  */
static inline uint64_t
ones (int N)
{
  return (N == 64 ? (uint64_t) -1 : (((uint64_t) 1 << N) - 1));
}

/* result<0> = val<N>; all other bits are zero.  */
static inline uint64_t
pickbit (uint64_t val, int N)
{
  return pickbits64 (val, N, N);
}

static uint64_t
expand_logical_immediate (uint32_t S, uint32_t R, uint32_t N)
{
  uint64_t mask;
  uint64_t imm;
  unsigned simd_size;

  /* The immediate value is S+1 bits set to 1, left rotated by
     SIMDsize - R (in other words, right rotated by R), then
     replicated.  */
  if (N != 0)
    {
      simd_size = 64;
      mask = 0xffffffffffffffffull;
    }
  else
    {
      switch (S)
	{
	case 0x00 ... 0x1f: /* 0xxxxx */ simd_size = 32;           break;
	case 0x20 ... 0x2f: /* 10xxxx */ simd_size = 16; S &= 0xf; break;
	case 0x30 ... 0x37: /* 110xxx */ simd_size =  8; S &= 0x7; break;
	case 0x38 ... 0x3b: /* 1110xx */ simd_size =  4; S &= 0x3; break;
	case 0x3c ... 0x3d: /* 11110x */ simd_size =  2; S &= 0x1; break;
	default: return 0;
	}
      mask = (1ull << simd_size) - 1;
      /* Top bits are IGNORED.  */
      R &= simd_size - 1;
    }

  /* NOTE: if S = simd_size - 1 we get 0xf..f which is rejected.  */
  if (S == simd_size - 1)
    return 0;

  /* S+1 consecutive bits set to 1.  */
  /* NOTE: S can't be 63 due to detection above.  */
  imm = (1ull << (S + 1)) - 1;

  /* Rotate to the left by simd_size - R.  */
  if (R != 0)
    imm = ((imm << (simd_size - R)) & mask) | (imm >> R);

  /* Replicate the value according to SIMD size.  Each case falls
     through to the next, doubling the pattern as it goes.  */
  switch (simd_size)
    {
    case  2: imm = (imm <<  2) | imm; /* Fall through.  */
    case  4: imm = (imm <<  4) | imm; /* Fall through.  */
    case  8: imm = (imm <<  8) | imm; /* Fall through.  */
    case 16: imm = (imm << 16) | imm; /* Fall through.  */
    case 32: imm = (imm << 32) | imm; /* Fall through.  */
    case 64: break;
    default: return 0;
    }

  return imm;
}
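
/* Worked example (illustrative values, not from the original source):
   with N = 0, imms (S) = 3 and immr (R) = 1 the element size is 32
   bits and the base pattern is S+1 = 4 set bits (0xf); rotating right
   by R = 1 gives 0x80000007, and replication across 64 bits yields
   0x8000000780000007.  */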

/* Instr[22,10] encodes N, immr and imms.  We want a lookup table
   for each possible combination, i.e. 13 bits worth of int entries.  */
#define  LI_TABLE_SIZE  (1 << 13)
static uint64_t LITable[LI_TABLE_SIZE];

void
aarch64_init_LIT_table (void)
{
  unsigned index;

  for (index = 0; index < LI_TABLE_SIZE; index++)
    {
      uint32_t N    = uimm (index, 12, 12);
      uint32_t immr = uimm (index, 11, 6);
      uint32_t imms = uimm (index, 5, 0);

      LITable [index] = expand_logical_immediate (imms, immr, N);
    }
}
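
/* N.B. expand_logical_immediate returns 0 for every rejected encoding,
   and all-zeroes is not a valid logical immediate, so a decoder can
   treat LITable[index] == 0 as "unallocated".  */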

static void
dexNotify (sim_cpu *cpu)
{
  /* instr[14,0] == type : 0 ==> method entry, 1 ==> method reentry
                           2 ==> exit Java, 3 ==> start next bytecode.  */
  uint32_t type = INSTR (14, 0);

  TRACE_EVENTS (cpu, "Notify Insn encountered, type = 0x%x", type);

  switch (type)
    {
    case 0:
      /* aarch64_notifyMethodEntry (aarch64_get_reg_u64 (cpu, R23, 0),
	 aarch64_get_reg_u64 (cpu, R22, 0));  */
      break;
    case 1:
      /* aarch64_notifyMethodReentry (aarch64_get_reg_u64 (cpu, R23, 0),
	 aarch64_get_reg_u64 (cpu, R22, 0));  */
      break;
    case 2:
      /* aarch64_notifyMethodExit ();  */
      break;
    case 3:
      /* aarch64_notifyBCStart (aarch64_get_reg_u64 (cpu, R23, 0),
	 aarch64_get_reg_u64 (cpu, R22, 0));  */
      break;
    }
}

/* secondary decode within top level groups  */

static void
dexPseudo (sim_cpu *cpu)
{
  /* assert instr[28,27] = 00

     We provide 2 pseudo instructions:

     HALT stops execution of the simulator causing an immediate
     return to the x86 code which entered it.

     CALLOUT initiates recursive entry into x86 code.  A register
     argument holds the address of the x86 routine.  Immediate
     values in the instruction identify the number of general
     purpose and floating point register arguments to be passed
     and the type of any value to be returned.  */

  uint32_t PSEUDO_HALT      =  0xE0000000U;
  uint32_t PSEUDO_CALLOUT   =  0x00018000U;
  uint32_t PSEUDO_CALLOUTR  =  0x00018001U;
  uint32_t PSEUDO_NOTIFY    =  0x00014000U;
  uint32_t dispatch;

  if (aarch64_get_instr (cpu) == PSEUDO_HALT)
    {
      TRACE_EVENTS (cpu, " Pseudo Halt Instruction");
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
		       sim_stopped, SIM_SIGTRAP);
    }

  dispatch = INSTR (31, 15);

  /* We do not handle callouts at the moment.  */
  if (dispatch == PSEUDO_CALLOUT || dispatch == PSEUDO_CALLOUTR)
    {
      TRACE_EVENTS (cpu, " Callout");
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
		       sim_stopped, SIM_SIGABRT);
    }

  else if (dispatch == PSEUDO_NOTIFY)
    dexNotify (cpu);

  else
    HALT_UNALLOC;
}

/* Load-store single register (unscaled offset)
   These instructions employ a base register plus an unscaled signed
   9 bit offset.

   N.B. the base register (source) can be Xn or SP.  All other
   registers may not be SP.  */

/* 32 bit load 32 bit unscaled signed 9 bit.  */
static void
ldur32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 64 bit load 64 bit unscaled signed 9 bit.  */
static void
ldur64 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 32 bit load zero-extended byte unscaled signed 9 bit.  */
static void
ldurb32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 32 bit load sign-extended byte unscaled signed 9 bit.  */
static void
ldursb32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, (uint32_t) aarch64_get_mem_s8
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 64 bit load sign-extended byte unscaled signed 9 bit.  */
static void
ldursb64 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s8
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 32 bit load zero-extended short unscaled signed 9 bit  */
static void
ldurh32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_mem_u16
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 32 bit load sign-extended short unscaled signed 9 bit  */
static void
ldursh32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) aarch64_get_mem_s16
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 64 bit load sign-extended short unscaled signed 9 bit  */
static void
ldursh64 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s16
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* 64 bit load sign-extended word unscaled signed 9 bit  */
static void
ldursw (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rd, NO_SP, aarch64_get_mem_s32
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ offset));
}

/* N.B. with stores the value in source is written to the address
   identified by source2 modified by offset.  */

/* 32 bit store 32 bit unscaled signed 9 bit.  */
static void
stur32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u32 (cpu,
		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
		       aarch64_get_reg_u32 (cpu, rd, NO_SP));
}

/* 64 bit store 64 bit unscaled signed 9 bit  */
static void
stur64 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu,
		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
		       aarch64_get_reg_u64 (cpu, rd, NO_SP));
}

/* 32 bit store byte unscaled signed 9 bit  */
static void
sturb (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u8 (cpu,
		      aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
		      aarch64_get_reg_u8 (cpu, rd, NO_SP));
}

/* 32 bit store short unscaled signed 9 bit  */
static void
sturh (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u16 (cpu,
		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
		       aarch64_get_reg_u16 (cpu, rd, NO_SP));
}

/* Load single register pc-relative label
   Offset is a signed 19 bit immediate count in words
   rt may not be SP.  */

/* 32 bit pc-relative load  */
static void
ldr32_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP,
		       aarch64_get_mem_u32
		       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* 64 bit pc-relative load  */
static void
ldr_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP,
		       aarch64_get_mem_u64
		       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* sign extended 32 bit pc-relative load  */
static void
ldrsw_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP,
		       aarch64_get_mem_s32
		       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* float pc-relative load  */
static void
fldrs_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned int rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, rd, 0,
		       aarch64_get_mem_u32
		       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* double pc-relative load  */
static void
fldrd_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned int st = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, st, 0,
		       aarch64_get_mem_u64
		       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* long double pc-relative load.  */
static void
fldrq_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned int st = INSTR (4, 0);
  uint64_t addr = aarch64_get_PC (cpu) + offset * 4;
  FRegister a;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_get_mem_long_double (cpu, addr, & a);
  aarch64_set_FP_long_double (cpu, st, a);
}

/* This can be used to scale an offset by applying
   the requisite shift.  The second argument is either
   16, 32, 64 or 128.  */

#define SCALE(_offset, _elementSize) \
    ((_offset) << ScaleShift ## _elementSize)
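
/* For example, assuming ScaleShift32 is 2 (a 32 bit element is 4
   bytes), SCALE (offset, 32) evaluates to offset << 2, turning a
   word-scaled immediate into a byte displacement.  */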

/* This can be used to optionally scale a register derived offset
   by applying the requisite shift as indicated by the Scaling
   argument.  The second argument is either 16, 32, 64 or 128.
   The third argument is either Scaled or Unscaled.
   N.B. when _Scaling is Scaled the element's scale shift is applied;
   when it is Unscaled the shift count is zero.  */

#define OPT_SCALE(_offset, _elementType, _Scaling) \
  ((_offset) << (_Scaling ? ScaleShift ## _elementType : 0))

/* This can be used to zero or sign extend a 32 bit register derived
   value to a 64 bit value.  The first argument must be the value as
   a uint32_t and the second must be either UXTW or SXTW.  The result
   is returned as an int64_t.  */

static inline int64_t
extend (uint32_t value, Extension extension)
{
  union
  {
    uint32_t u;
    int32_t  n;
  } x;

  /* A branchless variant of this ought to be possible.  */
  if (extension == UXTW || extension == NoExtension)
    return value;

  x.u = value;
  return x.n;
}
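
/* For example, extend (0xffffffff, SXTW) yields -1 (the union type
   puns the bits to a signed value), while extend (0xffffffff, UXTW)
   yields 0x00000000ffffffff.  */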

/* Scalar Floating Point

   FP load/store single register (4 addressing modes)

   N.B. the base register (source) can be the stack pointer.
   The secondary source register (source2) can only be an Xn register.  */

/* Load 32 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
fldrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32 (cpu, address));
  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}
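
/* N.B. the wb argument selects the addressing mode: Pre adds the
   offset before the access and Post after it, and in both cases the
   updated address is written back to rn; NoWriteBack applies the
   offset for the access but leaves rn unmodified.  */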

/* Load 8 bit with unsigned 12 bit offset.  */
static void
fldrb_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u8 (cpu, rd, 0, aarch64_get_mem_u8 (cpu, addr));
}

/* Load 16 bit scaled unsigned 12 bit.  */
static void
fldrh_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u16 (cpu, rd, 0, aarch64_get_mem_u16 (cpu, addr));
}

/* Load 32 bit scaled unsigned 12 bit.  */
static void
fldrs_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, rd, 0, aarch64_get_mem_u32 (cpu, addr));
}

/* Load 64 bit scaled unsigned 12 bit.  */
static void
fldrd_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr));
}

/* Load 128 bit scaled unsigned 12 bit.  */
static void
fldrq_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr));
  aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_mem_u64 (cpu, addr + 8));
}

/* Load 32 bit scaled or unscaled zero- or sign-extended
   32-bit register offset.  */
static void
fldrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 32, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
		       (cpu, address + displacement));
}

/* Load 64 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
fldrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* Load 64 bit scaled or unscaled zero- or sign-extended 32-bit register offset.  */
static void
fldrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 64, scaling);

  fldrd_wb (cpu, displacement, NoWriteBack);
}

/* Load 128 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
fldrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  FRegister a;
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_get_mem_long_double (cpu, address, & a);
  aarch64_set_FP_long_double (cpu, st, a);

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* Load 128 bit scaled or unscaled zero- or sign-extended 32-bit register offset  */
static void
fldrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 128, scaling);

  fldrq_wb (cpu, displacement, NoWriteBack);
}
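
/* N.B. the register-offset forms above reuse the writeback helpers:
   they compute the scaled displacement and then call fldrd_wb or
   fldrq_wb with NoWriteBack, so the base register is never
   modified.  */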

/* Memory Access

   load-store single register
   There are four addressing modes available here which all employ a
   64 bit source (base) register.

   N.B. the base register (source) can be the stack pointer.
   The secondary source register (source2) can only be an Xn register.

   Scaled, 12-bit, unsigned immediate offset, without pre- and
   post-index options.
   Unscaled, 9-bit, signed immediate offset with pre- or post-index
   writeback.
   scaled or unscaled 64-bit register offset.
   scaled or unscaled 32-bit extended register offset.

   All offsets are assumed to be raw from the decode, i.e. the
   simulator is expected to adjust scaled offsets based on the
   accessed data size.  With register or extended register offset
   versions the same applies, except that in the latter case the
   operation may also require a sign extend.

   A separate method is provided for each possible addressing mode.  */

/* 32 bit load 32 bit scaled unsigned 12 bit  */
static void
ldr32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ SCALE (offset, 32)));
}

/* 32 bit load 32 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
ldr32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}
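
/* N.B. a writeback load with rn == rt is an unallocated encoding,
   hence the HALT_UNALLOC guard above; the same check recurs in the
   other *_wb loads and stores below.  */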

/* 32 bit load 32 bit scaled or unscaled
   zero- or sign-extended 32-bit register offset  */
static void
ldr32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 32, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
		       aarch64_get_mem_u32 (cpu, address + displacement));
}

/* 64 bit load 64 bit scaled unsigned 12 bit  */
static void
ldr_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			+ SCALE (offset, 64)));
}

/* 64 bit load 64 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
ldr_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit load 64 bit scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldr_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 64, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
		       aarch64_get_mem_u64 (cpu, address + displacement));
}

/* 32 bit load zero-extended byte scaled unsigned 12 bit.  */
static void
ldrb32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.
     There is no scaling required for a byte load.  */
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
		       aarch64_get_mem_u8
		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
}

/* 32 bit load zero-extended byte unscaled signed 9 bit with pre- or post-writeback.  */
static void
ldrb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit load zero-extended byte scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
				 extension);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* There is no scaling required for a byte load.  */
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
		       aarch64_get_mem_u8 (cpu, address + displacement));
}

/* 64 bit load sign-extended byte unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrsb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;
  int64_t val;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  val = aarch64_get_mem_s8 (cpu, address);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit load sign-extended byte scaled unsigned 12 bit.  */
static void
ldrsb_abs (sim_cpu *cpu, uint32_t offset)
{
  ldrsb_wb (cpu, offset, NoWriteBack);
}

/* 64 bit load sign-extended byte scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrsb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
				 extension);
  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* There is no scaling required for a byte load.  */
  aarch64_set_reg_s64 (cpu, rt, NO_SP,
		       aarch64_get_mem_s8 (cpu, address + displacement));
}

/* 32 bit load zero-extended short scaled unsigned 12 bit.  */
static void
ldrh32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint32_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  val = aarch64_get_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			     + SCALE (offset, 16));
  aarch64_set_reg_u32 (cpu, rt, NO_SP, val);
}

/* 32 bit load zero-extended short unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u32 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit load zero-extended short scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 16, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u32 (cpu, rt, NO_SP,
		       aarch64_get_mem_u16 (cpu, address + displacement));
}

/* 32 bit load sign-extended short scaled unsigned 12 bit.  */
static void
ldrsh32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  int32_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  val = aarch64_get_mem_s16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			     + SCALE (offset, 16));
  aarch64_set_reg_s32 (cpu, rt, NO_SP, val);
}

/* 32 bit load sign-extended short unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrsh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s32 (cpu, rt, NO_SP,
		       (int32_t) aarch64_get_mem_s16 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit load sign-extended short scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrsh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 16, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s32 (cpu, rt, NO_SP,
		       (int32_t) aarch64_get_mem_s16
		       (cpu, address + displacement));
}

/* 64 bit load sign-extended short scaled unsigned 12 bit.  */
static void
ldrsh_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  int64_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  val = aarch64_get_mem_s16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			     + SCALE (offset, 16));
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
}

/* 64 bit load sign-extended short unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrsh64_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;
  int64_t val;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  val = aarch64_get_mem_s16 (cpu, address);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit load sign-extended short scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrsh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 16, scaling);
  int64_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  val = aarch64_get_mem_s16 (cpu, address + displacement);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
}

/* 64 bit load sign-extended 32 bit scaled unsigned 12 bit.  */
static void
ldrsw_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  int64_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  val = aarch64_get_mem_s32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			     + SCALE (offset, 32));
  /* The target register may not be SP but the source may be.  */
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
}

/* 64 bit load sign-extended 32 bit unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrsw_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s32 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit load sign-extended 32 bit scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrsw_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 32, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rt, NO_SP,
		       aarch64_get_mem_s32 (cpu, address + displacement));
}

/* N.B. with stores the value in source is written to the
   address identified by source2 modified by source3/offset.  */

/* 32 bit store scaled unsigned 12 bit.  */
static void
str32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  aarch64_set_mem_u32 (cpu, (aarch64_get_reg_u64 (cpu, rn, SP_OK)
			     + SCALE (offset, 32)),
		       aarch64_get_reg_u32 (cpu, rt, NO_SP));
}

/* 32 bit store unscaled signed 9 bit with pre- or post-writeback.  */
static void
str32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u32 (cpu, address, aarch64_get_reg_u32 (cpu, rt, NO_SP));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit store scaled or unscaled zero- or
   sign-extended 32-bit register offset.  */
static void
str32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t  extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 32, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u32 (cpu, address + displacement,
		       aarch64_get_reg_u64 (cpu, rt, NO_SP));
}

/* 64 bit store scaled unsigned 12 bit.  */
static void
str_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu,
		       aarch64_get_reg_u64 (cpu, rn, SP_OK)
		       + SCALE (offset, 64),
		       aarch64_get_reg_u64 (cpu, rt, NO_SP));
}

/* 64 bit store unscaled signed 9 bit with pre- or post-writeback.  */
static void
str_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu, address, aarch64_get_reg_u64 (cpu, rt, NO_SP));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit store scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
str_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t  extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
			      extension);
  uint64_t displacement = OPT_SCALE (extended, 64, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu, address + displacement,
		       aarch64_get_reg_u64 (cpu, rt, NO_SP));
}

/* 32 bit store byte scaled unsigned 12 bit.  */
static void
strb_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.
     There is no scaling required for a byte store.  */
  aarch64_set_mem_u8 (cpu,
		      aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
		      aarch64_get_reg_u8 (cpu, rt, NO_SP));
}

/* 32 bit store byte unscaled signed 9 bit with pre- or post-writeback.  */
static void
strb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u8 (cpu, address, aarch64_get_reg_u8 (cpu, rt, NO_SP));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit store byte scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
strb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
				 extension);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* There is no scaling required for a byte store.  */
  aarch64_set_mem_u8 (cpu, address + displacement,
		      aarch64_get_reg_u8 (cpu, rt, NO_SP));
}

/* 32 bit store short scaled unsigned 12 bit.  */
static void
strh_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  aarch64_set_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
		       + SCALE (offset, 16),
		       aarch64_get_reg_u16 (cpu, rt, NO_SP));
}

/* 32 bit store short unscaled signed 9 bit with pre- or post-writeback.  */
static void
strh_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u16 (cpu, address, aarch64_get_reg_u16 (cpu, rt, NO_SP));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit store short scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
strh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 16, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u16 (cpu, address + displacement,
		       aarch64_get_reg_u16 (cpu, rt, NO_SP));
}

/* Prefetch unsigned 12 bit.  */
static void
prfm_abs (sim_cpu *cpu, uint32_t offset)
{
  /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
                          00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
                          00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
                          10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
                          10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
                          10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
                          ow ==> UNALLOC
     PrfOp prfop = prfop (instr, 4, 0);
     uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK)
     + SCALE (offset, 64).  */

  /* TODO : implement prefetch of address.  */
}

/* Prefetch scaled or unscaled zero- or sign-extended 32-bit register offset.  */
static void
prfm_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
                          00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
                          00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
                          10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
                          10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
                          10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
                          ow ==> UNALLOC
     rn may reference SP, rm may only reference ZR
     PrfOp prfop = prfop (instr, 4, 0);
     uint64_t base = aarch64_get_reg_u64 (cpu, rn, SP_OK);
     int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
                                extension);
     uint64_t displacement = OPT_SCALE (extended, 64, scaling);
     uint64_t address = base + displacement.  */

  /* TODO : implement prefetch of address  */
}

/* 64 bit pc-relative prefetch.  */
static void
prfm_pcrel (sim_cpu *cpu, int32_t offset)
{
  /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
                          00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
                          00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
                          10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
                          10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
                          10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
                          ow ==> UNALLOC
     PrfOp prfop = prfop (instr, 4, 0);
     uint64_t address = aarch64_get_PC (cpu) + offset.  */

  /* TODO : implement this  */
}

/* Load-store exclusive.  */

static void
ldxr (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int size = INSTR (31, 30);
  /* int ordered = INSTR (15, 15);  */
  /* int exclusive = ! INSTR (23, 23);  */

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (size)
    {
    case 0:
      aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address));
      break;
    case 1:
      aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address));
      break;
    case 2:
      aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address));
      break;
    case 3:
      aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address));
      break;
    }
}

static void
stxr (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  unsigned rs = INSTR (20, 16);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int      size = INSTR (31, 30);
  uint64_t data = aarch64_get_reg_u64 (cpu, rt, NO_SP);

  switch (size)
    {
    case 0: aarch64_set_mem_u8 (cpu, address, data); break;
    case 1: aarch64_set_mem_u16 (cpu, address, data); break;
    case 2: aarch64_set_mem_u32 (cpu, address, data); break;
    case 3: aarch64_set_mem_u64 (cpu, address, data); break;
    }

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rs, NO_SP, 0); /* Always exclusive...  */
}
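
/* N.B. the simulator does not model an exclusive monitor: ldxr is a
   plain load and stxr always performs its store and writes a status
   of 0 (success) to rs, so a store-exclusive never fails here.  */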

static void
dexLoadLiteral (sim_cpu *cpu)
{
  /* instr[29,27] == 011
     instr[25,24] == 00
     instr[31,30:26] = opc: 000 ==> LDRW,  001 ==> FLDRS
                            010 ==> LDRX,  011 ==> FLDRD
                            100 ==> LDRSW, 101 ==> FLDRQ
                            110 ==> PRFM,  111 ==> UNALLOC
     instr[26] ==> V : 0 ==> GReg, 1 ==> FReg
     instr[23, 5] == simm19  */

  /* unsigned rt = INSTR (4, 0);  */
  uint32_t dispatch = (INSTR (31, 30) << 1) | INSTR (26, 26);
  int32_t imm = simm32 (aarch64_get_instr (cpu), 23, 5);

  switch (dispatch)
    {
    case 0: ldr32_pcrel (cpu, imm); break;
    case 1: fldrs_pcrel (cpu, imm); break;
    case 2: ldr_pcrel   (cpu, imm); break;
    case 3: fldrd_pcrel (cpu, imm); break;
    case 4: ldrsw_pcrel (cpu, imm); break;
    case 5: fldrq_pcrel (cpu, imm); break;
    case 6: prfm_pcrel  (cpu, imm); break;
    case 7:
    default:
      HALT_UNALLOC;
    }
}
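
/* For example, an LDRSW literal has instr[31,30] = 10 and V = 0,
   giving dispatch = (2 << 1) | 0 = 4 and selecting ldrsw_pcrel
   above.  */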

/* Immediate arithmetic
   The aimm argument is a 12 bit unsigned value or a 12 bit unsigned
   value left shifted by 12 bits (done at decode).

   N.B. the register args (dest, source) can normally be Xn or SP.
   The exception occurs for flag setting instructions which may
   only use Xn for the output (dest).  */

/* 32 bit add immediate.  */
static void
add32 (sim_cpu *cpu, uint32_t aimm)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, SP_OK,
		       aarch64_get_reg_u32 (cpu, rn, SP_OK) + aimm);
}

/* 64 bit add immediate.  */
static void
add64 (sim_cpu *cpu, uint32_t aimm)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, SP_OK,
		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + aimm);
}

static void
set_flags_for_add32 (sim_cpu *cpu, int32_t value1, int32_t value2)
{
  int32_t   result = value1 + value2;
  int64_t   sresult = (int64_t) value1 + (int64_t) value2;
  uint64_t  uresult = (uint64_t)(uint32_t) value1
    + (uint64_t)(uint32_t) value2;
  uint32_t  flags = 0;

  if (result == 0)
    flags |= Z;

  if (result & (1 << 31))
    flags |= N;

  /* Carry: the unsigned sum does not fit in 32 bits.  Compare against
     the zero-extended result so that negative results do not falsely
     set the flag.  */
  if (uresult != (uint64_t)(uint32_t) result)
    flags |= C;

  /* Overflow: the signed sum does not fit in 32 bits.  */
  if (sresult != (int64_t) result)
    flags |= V;

  aarch64_set_CPSR (cpu, flags);
}
1663 
1664 static void
1665 set_flags_for_add64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
1666 {
1667   uint64_t  result = value1 + value2;
1668   uint64_t  signbit = 1ULL << 63;
1669   uint32_t  flags = 0;
1670 
1671   if (result == 0)
1672     flags |= Z;
1673 
1674   if (result & signbit)
1675     flags |= N;
1676 
1677   /* Carry: the unsigned addition wrapped, i.e. the result is
1678      smaller than one (and hence both) of the operands.  */
1679   if (result < value1)
1680     flags |= C;
1681 
1682   /* Overflow: the operands have the same sign and the result
1683      has the opposite sign.  */
1684   if (((value1 ^ value2) & signbit) == 0
1685       && ((value1 ^ result) & signbit) != 0)
1686     flags |= V;
1687 
1688   aarch64_set_CPSR (cpu, flags);
1689 }
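
/* Example flag outcomes under the rules above: adding
   0x8000000000000000 to itself gives 0, so Z and C are set and,
   because two negative operands produced a non-negative result,
   V is set as well; N is clear.  */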
1709 
1710 #define NEG(a) (((a) & signbit) == signbit)
1711 #define POS(a) (((a) & signbit) == 0)
1712 
1713 static void
1714 set_flags_for_sub32 (sim_cpu *cpu, uint32_t value1, uint32_t value2)
1715 {
1716   uint32_t result = value1 - value2;
1717   uint32_t flags = 0;
1718   uint32_t signbit = 1U << 31;
1719 
1720   if (result == 0)
1721     flags |= Z;
1722 
1723   if (NEG (result))
1724     flags |= N;
1725 
1726   if (   (NEG (value1) && POS (value2))
1727       || (NEG (value1) && POS (result))
1728       || (POS (value2) && POS (result)))
1729     flags |= C;
1730 
1731   if (   (NEG (value1) && POS (value2) && POS (result))
1732       || (POS (value1) && NEG (value2) && NEG (result)))
1733     flags |= V;
1734 
1735   aarch64_set_CPSR (cpu, flags);
1736 }
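
/* The C test above computes the carry out of the equivalent addition
   value1 + ~value2 + 1, i.e. C is set exactly when no borrow occurs.
   For example 5 - 3 sets C (POS (value2) and POS (result)), while
   3 - 5 borrows and leaves C clear.  */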
1737 
1738 static void
1739 set_flags_for_sub64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
1740 {
1741   uint64_t result = value1 - value2;
1742   uint32_t flags = 0;
1743   uint64_t signbit = 1ULL << 63;
1744 
1745   if (result == 0)
1746     flags |= Z;
1747 
1748   if (NEG (result))
1749     flags |= N;
1750 
1751   if (   (NEG (value1) && POS (value2))
1752       || (NEG (value1) && POS (result))
1753       || (POS (value2) && POS (result)))
1754     flags |= C;
1755 
1756   if (   (NEG (value1) && POS (value2) && POS (result))
1757       || (POS (value1) && NEG (value2) && NEG (result)))
1758     flags |= V;
1759 
1760   aarch64_set_CPSR (cpu, flags);
1761 }
1762 
1763 static void
1764 set_flags_for_binop32 (sim_cpu *cpu, uint32_t result)
1765 {
1766   uint32_t flags = 0;
1767 
1768   if (result == 0)
1769     flags |= Z;
1770   else
1771     flags &= ~ Z;
1772 
1773   if (result & (1 << 31))
1774     flags |= N;
1775   else
1776     flags &= ~ N;
1777 
1778   aarch64_set_CPSR (cpu, flags);
1779 }
1780 
1781 static void
1782 set_flags_for_binop64 (sim_cpu *cpu, uint64_t result)
1783 {
1784   uint32_t flags = 0;
1785 
1786   if (result == 0)
1787     flags |= Z;
1788   else
1789     flags &= ~ Z;
1790 
1791   if (result & (1ULL << 63))
1792     flags |= N;
1793   else
1794     flags &= ~ N;
1795 
1796   aarch64_set_CPSR (cpu, flags);
1797 }
1798 
1799 /* 32 bit add immediate set flags.  */
1800 static void
1801 adds32 (sim_cpu *cpu, uint32_t aimm)
1802 {
1803   unsigned rn = INSTR (9, 5);
1804   unsigned rd = INSTR (4, 0);
1805   /* TODO : do we need to worry about signs here?  */
1806   int32_t value1 = aarch64_get_reg_s32 (cpu, rn, SP_OK);
1807 
1808   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1809   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + aimm);
1810   set_flags_for_add32 (cpu, value1, aimm);
1811 }
1812 
1813 /* 64 bit add immediate set flags.  */
1814 static void
1815 adds64 (sim_cpu *cpu, uint32_t aimm)
1816 {
1817   unsigned rn = INSTR (9, 5);
1818   unsigned rd = INSTR (4, 0);
1819   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1820   uint64_t value2 = aimm;
1821 
1822   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1823   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1824   set_flags_for_add64 (cpu, value1, value2);
1825 }
1826 
1827 /* 32 bit sub immediate.  */
1828 static void
1829 sub32 (sim_cpu *cpu, uint32_t aimm)
1830 {
1831   unsigned rn = INSTR (9, 5);
1832   unsigned rd = INSTR (4, 0);
1833 
1834   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1835   aarch64_set_reg_u64 (cpu, rd, SP_OK,
1836 		       aarch64_get_reg_u32 (cpu, rn, SP_OK) - aimm);
1837 }
1838 
1839 /* 64 bit sub immediate.  */
1840 static void
1841 sub64 (sim_cpu *cpu, uint32_t aimm)
1842 {
1843   unsigned rn = INSTR (9, 5);
1844   unsigned rd = INSTR (4, 0);
1845 
1846   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1847   aarch64_set_reg_u64 (cpu, rd, SP_OK,
1848 		       aarch64_get_reg_u64 (cpu, rn, SP_OK) - aimm);
1849 }
1850 
1851 /* 32 bit sub immediate set flags.  */
1852 static void
1853 subs32 (sim_cpu *cpu, uint32_t aimm)
1854 {
1855   unsigned rn = INSTR (9, 5);
1856   unsigned rd = INSTR (4, 0);
1857   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
1858   uint32_t value2 = aimm;
1859 
1860   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1861   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
1862   set_flags_for_sub32 (cpu, value1, value2);
1863 }
1864 
1865 /* 64 bit sub immediate set flags.  */
1866 static void
1867 subs64 (sim_cpu *cpu, uint32_t aimm)
1868 {
1869   unsigned rn = INSTR (9, 5);
1870   unsigned rd = INSTR (4, 0);
1871   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1872   uint64_t value2 = aimm;
1873 
1874   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1875   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
1876   set_flags_for_sub64 (cpu, value1, value2);
1877 }
1878 
1879 /* Data Processing Register.  */
1880 
1881 /* First two helpers to perform the shift operations.  */
1882 
1883 static inline uint32_t
1884 shifted32 (uint32_t value, Shift shift, uint32_t count)
1885 {
1886   switch (shift)
1887     {
1888     default:
1889     case LSL:
1890       return (value << count);
1891     case LSR:
1892       return (value >> count);
1893     case ASR:
1894       {
1895 	int32_t svalue = value;
1896 	return (svalue >> count);
1897       }
1898     case ROR:
1899       {
1900 	uint32_t top = value >> count;
1901 	uint32_t bottom = count ? value << (32 - count) : 0; /* Avoid shift by 32.  */
1902 	return (bottom | top);
1903       }
1904     }
1905 }
1906 
1907 static inline uint64_t
1908 shifted64 (uint64_t value, Shift shift, uint32_t count)
1909 {
1910   switch (shift)
1911     {
1912     default:
1913     case LSL:
1914       return (value << count);
1915     case LSR:
1916       return (value >> count);
1917     case ASR:
1918       {
1919 	int64_t svalue = value;
1920 	return (svalue >> count);
1921       }
1922     case ROR:
1923       {
1924 	uint64_t top = value >> count;
1925 	uint64_t bottom = count ? value << (64 - count) : 0; /* Avoid shift by 64.  */
1926 	return (bottom | top);
1927       }
1928     }
1929 }
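
/* Example: shifted32 (0x80000001, ROR, 1) rotates the low bit into
   the top, giving 0xC0000000; LSL, LSR and ASR behave as the plain C
   shifts on the (suitably signed) value.  */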
1930 
1931 /* Arithmetic shifted register.
1932    These allow an optional LSL, ASR or LSR to the second source
1933    register with a count up to the register bit count.
1934 
1935    N.B register args may not be SP.  */
1936 
1937 /* 32 bit ADD shifted register.  */
1938 static void
1939 add32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1940 {
1941   unsigned rm = INSTR (20, 16);
1942   unsigned rn = INSTR (9, 5);
1943   unsigned rd = INSTR (4, 0);
1944 
1945   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1946   aarch64_set_reg_u64 (cpu, rd, NO_SP,
1947 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
1948 		       + shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1949 				    shift, count));
1950 }
1951 
1952 /* 64 bit ADD shifted register.  */
1953 static void
1954 add64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1955 {
1956   unsigned rm = INSTR (20, 16);
1957   unsigned rn = INSTR (9, 5);
1958   unsigned rd = INSTR (4, 0);
1959 
1960   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1961   aarch64_set_reg_u64 (cpu, rd, NO_SP,
1962 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
1963 		       + shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
1964 				    shift, count));
1965 }
1966 
1967 /* 32 bit ADD shifted register setting flags.  */
1968 static void
1969 adds32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1970 {
1971   unsigned rm = INSTR (20, 16);
1972   unsigned rn = INSTR (9, 5);
1973   unsigned rd = INSTR (4, 0);
1974 
1975   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
1976   uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1977 			       shift, count);
1978 
1979   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1980   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1981   set_flags_for_add32 (cpu, value1, value2);
1982 }
1983 
1984 /* 64 bit ADD shifted register setting flags.  */
1985 static void
1986 adds64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1987 {
1988   unsigned rm = INSTR (20, 16);
1989   unsigned rn = INSTR (9, 5);
1990   unsigned rd = INSTR (4, 0);
1991 
1992   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
1993   uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
1994 			       shift, count);
1995 
1996   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1997   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1998   set_flags_for_add64 (cpu, value1, value2);
1999 }
2000 
2001 /* 32 bit SUB shifted register.  */
2002 static void
2003 sub32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
2004 {
2005   unsigned rm = INSTR (20, 16);
2006   unsigned rn = INSTR (9, 5);
2007   unsigned rd = INSTR (4, 0);
2008 
2009   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2010   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2011 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
2012 		       - shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
2013 				    shift, count));
2014 }
2015 
2016 /* 64 bit SUB shifted register.  */
2017 static void
2018 sub64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
2019 {
2020   unsigned rm = INSTR (20, 16);
2021   unsigned rn = INSTR (9, 5);
2022   unsigned rd = INSTR (4, 0);
2023 
2024   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2025   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2026 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
2027 		       - shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
2028 				    shift, count));
2029 }
2030 
2031 /* 32 bit SUB shifted register setting flags.  */
2032 static void
2033 subs32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
2034 {
2035   unsigned rm = INSTR (20, 16);
2036   unsigned rn = INSTR (9, 5);
2037   unsigned rd = INSTR (4, 0);
2038 
2039   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2040   uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
2041 			      shift, count);
2042 
2043   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2044   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2045   set_flags_for_sub32 (cpu, value1, value2);
2046 }
2047 
2048 /* 64 bit SUB shifted register setting flags.  */
2049 static void
2050 subs64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
2051 {
2052   unsigned rm = INSTR (20, 16);
2053   unsigned rn = INSTR (9, 5);
2054   unsigned rd = INSTR (4, 0);
2055 
2056   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2057   uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
2058 			       shift, count);
2059 
2060   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2061   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2062   set_flags_for_sub64 (cpu, value1, value2);
2063 }
2064 
2065 /* First a couple more helpers to fetch the
2066    relevant source register element either
2067    sign or zero extended as required by the
2068    extension value.  */
2069 
2070 static uint32_t
2071 extreg32 (sim_cpu *cpu, unsigned int lo, Extension extension)
2072 {
2073   switch (extension)
2074     {
2075     case UXTB: return aarch64_get_reg_u8  (cpu, lo, NO_SP);
2076     case UXTH: return aarch64_get_reg_u16 (cpu, lo, NO_SP);
2077     case UXTW: /* Fall through.  */
2078     case UXTX: return aarch64_get_reg_u32 (cpu, lo, NO_SP);
2079     case SXTB: return aarch64_get_reg_s8  (cpu, lo, NO_SP);
2080     case SXTH: return aarch64_get_reg_s16 (cpu, lo, NO_SP);
2081     case SXTW: /* Fall through.  */
2082     case SXTX: /* Fall through.  */
2083     default:   return aarch64_get_reg_s32 (cpu, lo, NO_SP);
2084   }
2085 }
2086 
2087 static uint64_t
2088 extreg64 (sim_cpu *cpu, unsigned int lo, Extension extension)
2089 {
2090   switch (extension)
2091     {
2092     case UXTB: return aarch64_get_reg_u8  (cpu, lo, NO_SP);
2093     case UXTH: return aarch64_get_reg_u16 (cpu, lo, NO_SP);
2094     case UXTW: return aarch64_get_reg_u32 (cpu, lo, NO_SP);
2095     case UXTX: return aarch64_get_reg_u64 (cpu, lo, NO_SP);
2096     case SXTB: return aarch64_get_reg_s8  (cpu, lo, NO_SP);
2097     case SXTH: return aarch64_get_reg_s16 (cpu, lo, NO_SP);
2098     case SXTW: return aarch64_get_reg_s32 (cpu, lo, NO_SP);
2099     case SXTX:
2100     default:   return aarch64_get_reg_s64 (cpu, lo, NO_SP);
2101     }
2102 }
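
/* Example: with the low byte of Xm holding 0x80, extreg64 (..., SXTB)
   returns 0xFFFFFFFFFFFFFF80 while UXTB returns 0x80; the 32-bit
   variant above yields the same results truncated to 32 bits.  */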
2103 
2104 /* Arithmetic extending register
2105    These allow an optional sign or zero extension of some portion
2106    of the second source register followed by an optional left
2107    shift of between 0 and 4 bits.
2108 
2109    N.B output (dest) and first input arg (source) may normally be Xn
2110    or SP. However, for flag setting operations dest can only be
2111    Xn. Second input registers are always Xn.  */
2112 
2113 /* 32 bit ADD extending register.  */
2114 static void
2115 add32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2116 {
2117   unsigned rm = INSTR (20, 16);
2118   unsigned rn = INSTR (9, 5);
2119   unsigned rd = INSTR (4, 0);
2120 
2121   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2122   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2123 		       aarch64_get_reg_u32 (cpu, rn, SP_OK)
2124 		       + (extreg32 (cpu, rm, extension) << shift));
2125 }
2126 
2127 /* 64 bit ADD extending register.
2128    N.B. This subsumes the case with 64 bit source2 and UXTX #n or LSL #0.  */
2129 static void
2130 add64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2131 {
2132   unsigned rm = INSTR (20, 16);
2133   unsigned rn = INSTR (9, 5);
2134   unsigned rd = INSTR (4, 0);
2135 
2136   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2137   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2138 		       aarch64_get_reg_u64 (cpu, rn, SP_OK)
2139 		       + (extreg64 (cpu, rm, extension) << shift));
2140 }
2141 
2142 /* 32 bit ADD extending register setting flags.  */
2143 static void
2144 adds32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2145 {
2146   unsigned rm = INSTR (20, 16);
2147   unsigned rn = INSTR (9, 5);
2148   unsigned rd = INSTR (4, 0);
2149 
2150   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
2151   uint32_t value2 = extreg32 (cpu, rm, extension) << shift;
2152 
2153   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2154   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
2155   set_flags_for_add32 (cpu, value1, value2);
2156 }
2157 
2158 /* 64 bit ADD extending register setting flags  */
2159 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0  */
2160 static void
2161 adds64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2162 {
2163   unsigned rm = INSTR (20, 16);
2164   unsigned rn = INSTR (9, 5);
2165   unsigned rd = INSTR (4, 0);
2166 
2167   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
2168   uint64_t value2 = extreg64 (cpu, rm, extension) << shift;
2169 
2170   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2171   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
2172   set_flags_for_add64 (cpu, value1, value2);
2173 }
2174 
2175 /* 32 bit SUB extending register.  */
2176 static void
2177 sub32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2178 {
2179   unsigned rm = INSTR (20, 16);
2180   unsigned rn = INSTR (9, 5);
2181   unsigned rd = INSTR (4, 0);
2182 
2183   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2184   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2185 		       aarch64_get_reg_u32 (cpu, rn, SP_OK)
2186 		       - (extreg32 (cpu, rm, extension) << shift));
2187 }
2188 
2189 /* 64 bit SUB extending register.  */
2190 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0.  */
2191 static void
2192 sub64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2193 {
2194   unsigned rm = INSTR (20, 16);
2195   unsigned rn = INSTR (9, 5);
2196   unsigned rd = INSTR (4, 0);
2197 
2198   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2199   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2200 		       aarch64_get_reg_u64 (cpu, rn, SP_OK)
2201 		       - (extreg64 (cpu, rm, extension) << shift));
2202 }
2203 
2204 /* 32 bit SUB extending register setting flags.  */
2205 static void
2206 subs32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2207 {
2208   unsigned rm = INSTR (20, 16);
2209   unsigned rn = INSTR (9, 5);
2210   unsigned rd = INSTR (4, 0);
2211 
2212   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
2213   uint32_t value2 = extreg32 (cpu, rm, extension) << shift;
2214 
2215   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2216   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2217   set_flags_for_sub32 (cpu, value1, value2);
2218 }
2219 
2220 /* 64 bit SUB extending register setting flags  */
2221 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0  */
2222 static void
2223 subs64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2224 {
2225   unsigned rm = INSTR (20, 16);
2226   unsigned rn = INSTR (9, 5);
2227   unsigned rd = INSTR (4, 0);
2228 
2229   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
2230   uint64_t value2 = extreg64 (cpu, rm, extension) << shift;
2231 
2232   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2233   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2234   set_flags_for_sub64 (cpu, value1, value2);
2235 }
2236 
2237 static void
2238 dexAddSubtractImmediate (sim_cpu *cpu)
2239 {
2240   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2241      instr[30]    = op : 0 ==> ADD, 1 ==> SUB
2242      instr[29]    = set : 0 ==> no flags, 1 ==> set flags
2243      instr[28,24] = 10001
2244      instr[23,22] = shift : 00 ==> LSL#0, 01 ==> LSL#12, 1x ==> UNALLOC
2245      instr[21,10] = uimm12
2246      instr[9,5]   = Rn
2247      instr[4,0]   = Rd  */
2248 
2249   /* N.B. the shift is applied at decode before calling the add/sub routine.  */
2250   uint32_t shift = INSTR (23, 22);
2251   uint32_t imm = INSTR (21, 10);
2252   uint32_t dispatch = INSTR (31, 29);
2253 
2254   NYI_assert (28, 24, 0x11);
2255 
2256   if (shift > 1)
2257     HALT_UNALLOC;
2258 
2259   if (shift)
2260     imm <<= 12;
2261 
2262   switch (dispatch)
2263     {
2264     case 0: add32 (cpu, imm); break;
2265     case 1: adds32 (cpu, imm); break;
2266     case 2: sub32 (cpu, imm); break;
2267     case 3: subs32 (cpu, imm); break;
2268     case 4: add64 (cpu, imm); break;
2269     case 5: adds64 (cpu, imm); break;
2270     case 6: sub64 (cpu, imm); break;
2271     case 7: subs64 (cpu, imm); break;
2272     }
2273 }
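
/* Worked decode example (editorial sketch): SUBS W3, W4, #7 has
   size = 0, op = 1, set = 1 and shift = 00, so dispatch is 3 and
   subs32 is called with imm == 7, leaving NZCV describing W4 - 7.  */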
2274 
2275 static void
2276 dexAddSubtractShiftedRegister (sim_cpu *cpu)
2277 {
2278   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2279      instr[30,29] = op : 00 ==> ADD, 01 ==> ADDS, 10 ==> SUB, 11 ==> SUBS
2280      instr[28,24] = 01011
2281      instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> UNALLOC
2282      instr[21]    = 0
2283      instr[20,16] = Rm
2284      instr[15,10] = count : must be 0xxxxx for 32 bit
2285      instr[9,5]   = Rn
2286      instr[4,0]   = Rd  */
2287 
2288   uint32_t size = INSTR (31, 31);
2289   uint32_t count = INSTR (15, 10);
2290   Shift shiftType = INSTR (23, 22);
2291 
2292   NYI_assert (28, 24, 0x0B);
2293   NYI_assert (21, 21, 0);
2294 
2295   /* Shift encoded as ROR is unallocated.  */
2296   if (shiftType == ROR)
2297     HALT_UNALLOC;
2298 
2299   /* 32 bit operations must have count[5] = 0
2300      or else we have an UNALLOC.  */
2301   if (size == 0 && uimm (count, 5, 5))
2302     HALT_UNALLOC;
2303 
2304   /* Dispatch on size:op, i.e. instr[31,29].  */
2305   switch (INSTR (31, 29))
2306     {
2307     case 0: add32_shift  (cpu, shiftType, count); break;
2308     case 1: adds32_shift (cpu, shiftType, count); break;
2309     case 2: sub32_shift  (cpu, shiftType, count); break;
2310     case 3: subs32_shift (cpu, shiftType, count); break;
2311     case 4: add64_shift  (cpu, shiftType, count); break;
2312     case 5: adds64_shift (cpu, shiftType, count); break;
2313     case 6: sub64_shift  (cpu, shiftType, count); break;
2314     case 7: subs64_shift (cpu, shiftType, count); break;
2315     }
2316 }
2317 
2318 static void
2319 dexAddSubtractExtendedRegister (sim_cpu *cpu)
2320 {
2321   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2322      instr[30]    = op : 0 ==> ADD, 1 ==> SUB
2323      instr[29]    = set? : 0 ==> no flags, 1 ==> set flags
2324      instr[28,24] = 01011
2325      instr[23,22] = opt : 0 ==> ok, 1,2,3 ==> UNALLOC
2326      instr[21]    = 1
2327      instr[20,16] = Rm
2328      instr[15,13] = option : 000 ==> UXTB, 001 ==> UXTH,
2329                              010 ==> UXTW|LSL, 011 ==> UXTX,
2330                              100 ==> SXTB, 101 ==> SXTH,
2331                              110 ==> SXTW, 111 ==> SXTX
2332      instr[12,10] = shift : 0,1,2,3,4 ==> ok, 5,6,7 ==> UNALLOC
2333      instr[9,5]   = Rn
2334      instr[4,0]   = Rd  */
2335 
2336   Extension extensionType = INSTR (15, 13);
2337   uint32_t shift = INSTR (12, 10);
2338 
2339   NYI_assert (28, 24, 0x0B);
2340   NYI_assert (21, 21, 1);
2341 
2342   /* Shift may not exceed 4.  */
2343   if (shift > 4)
2344     HALT_UNALLOC;
2345 
2346   /* Dispatch on size:op:set?.  */
2347   switch (INSTR (31, 29))
2348     {
2349     case 0: add32_ext  (cpu, extensionType, shift); break;
2350     case 1: adds32_ext (cpu, extensionType, shift); break;
2351     case 2: sub32_ext  (cpu, extensionType, shift); break;
2352     case 3: subs32_ext (cpu, extensionType, shift); break;
2353     case 4: add64_ext  (cpu, extensionType, shift); break;
2354     case 5: adds64_ext (cpu, extensionType, shift); break;
2355     case 6: sub64_ext  (cpu, extensionType, shift); break;
2356     case 7: subs64_ext (cpu, extensionType, shift); break;
2357     }
2358 }
2359 
2360 /* Conditional data processing
2361    Condition register is implicit 3rd source.  */
2362 
2363 /* 32 bit add with carry.  */
2364 /* N.B register args may not be SP.  */
2365 
2366 static void
2367 adc32 (sim_cpu *cpu)
2368 {
2369   unsigned rm = INSTR (20, 16);
2370   unsigned rn = INSTR (9, 5);
2371   unsigned rd = INSTR (4, 0);
2372 
2373   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2374   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2375 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
2376 		       + aarch64_get_reg_u32 (cpu, rm, NO_SP)
2377 		       + IS_SET (C));
2378 }
2379 
2380 /* 64 bit add with carry  */
2381 static void
2382 adc64 (sim_cpu *cpu)
2383 {
2384   unsigned rm = INSTR (20, 16);
2385   unsigned rn = INSTR (9, 5);
2386   unsigned rd = INSTR (4, 0);
2387 
2388   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2389   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2390 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
2391 		       + aarch64_get_reg_u64 (cpu, rm, NO_SP)
2392 		       + IS_SET (C));
2393 }
2394 
2395 /* 32 bit add with carry setting flags.  */
2396 static void
2397 adcs32 (sim_cpu *cpu)
2398 {
2399   unsigned rm = INSTR (20, 16);
2400   unsigned rn = INSTR (9, 5);
2401   unsigned rd = INSTR (4, 0);
2402 
2403   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2404   uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
2405   uint32_t carry = IS_SET (C);
2406 
2407   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2408   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry);
2409   set_flags_for_add32 (cpu, value1, value2 + carry);
2410 }
2411 
2412 /* 64 bit add with carry setting flags.  */
2413 static void
2414 adcs64 (sim_cpu *cpu)
2415 {
2416   unsigned rm = INSTR (20, 16);
2417   unsigned rn = INSTR (9, 5);
2418   unsigned rd = INSTR (4, 0);
2419 
2420   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2421   uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
2422   uint64_t carry = IS_SET (C);
2423 
2424   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2425   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry);
2426   set_flags_for_add64 (cpu, value1, value2 + carry);
2427 }
2428 
2429 /* 32 bit sub with carry.  */
2430 static void
2431 sbc32 (sim_cpu *cpu)
2432 {
2433   unsigned rm = INSTR (20, 16);
2434   unsigned rn = INSTR (9, 5); /* ngc iff rn == 31.  */
2435   unsigned rd = INSTR (4, 0);
2436 
2437   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2438   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2439 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
2440 		       - aarch64_get_reg_u32 (cpu, rm, NO_SP)
2441 		       - 1 + IS_SET (C));
2442 }
2443 
2444 /* 64 bit sub with carry  */
2445 static void
2446 sbc64 (sim_cpu *cpu)
2447 {
2448   unsigned rm = INSTR (20, 16);
2449   unsigned rn = INSTR (9, 5);
2450   unsigned rd = INSTR (4, 0);
2451 
2452   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2453   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2454 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
2455 		       - aarch64_get_reg_u64 (cpu, rm, NO_SP)
2456 		       - 1 + IS_SET (C));
2457 }
2458 
2459 /* 32 bit sub with carry setting flags  */
2460 static void
2461 sbcs32 (sim_cpu *cpu)
2462 {
2463   unsigned rm = INSTR (20, 16);
2464   unsigned rn = INSTR (9, 5);
2465   unsigned rd = INSTR (4, 0);
2466 
2467   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2468   uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
2469   uint32_t carry  = IS_SET (C);
2470   uint32_t result = value1 - value2 - 1 + carry;
2471 
2472   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2473   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
2474   set_flags_for_sub32 (cpu, value1, value2 + 1 - carry);
2475 }
2476 
2477 /* 64 bit sub with carry setting flags  */
2478 static void
2479 sbcs64 (sim_cpu *cpu)
2480 {
2481   unsigned rm = INSTR (20, 16);
2482   unsigned rn = INSTR (9, 5);
2483   unsigned rd = INSTR (4, 0);
2484 
2485   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2486   uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
2487   uint64_t carry  = IS_SET (C);
2488   uint64_t result = value1 - value2 - 1 + carry;
2489 
2490   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2491   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
2492   set_flags_for_sub64 (cpu, value1, value2 + 1 - carry);
2493 }
2494 
2495 static void
2496 dexAddSubtractWithCarry (sim_cpu *cpu)
2497 {
2498   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2499      instr[30]    = op : 0 ==> ADC, 1 ==> SBC
2500      instr[29]    = set? : 0 ==> no flags, 1 ==> set flags
2501      instr[28,21] = 1 1010 000
2502      instr[20,16] = Rm
2503      instr[15,10] = op2 : 000000 ==> ok, otherwise ==> UNALLOC
2504      instr[9,5]   = Rn
2505      instr[4,0]   = Rd  */
2506 
2507   uint32_t op2 = INSTR (15, 10);
2508 
2509   NYI_assert (28, 21, 0xD0);
2510 
2511   if (op2 != 0)
2512     HALT_UNALLOC;
2513 
2514   /* Dispatch on size:op:set?.  */
2515   switch (INSTR (31, 29))
2516     {
2517     case 0: adc32 (cpu); break;
2518     case 1: adcs32 (cpu); break;
2519     case 2: sbc32 (cpu); break;
2520     case 3: sbcs32 (cpu); break;
2521     case 4: adc64 (cpu); break;
2522     case 5: adcs64 (cpu); break;
2523     case 6: sbc64 (cpu); break;
2524     case 7: sbcs64 (cpu); break;
2525     }
2526 }
2527 
2528 static uint32_t
2529 testConditionCode (sim_cpu *cpu, CondCode cc)
2530 {
2531   /* This should be reducible to branchless logic
2532      by some careful testing of bits in CC followed
2533      by the requisite masking and combining of bits
2534      from the flag register.
2535 
2536      For now we do it with a switch.  */
2537   int res;
2538 
2539   switch (cc)
2540     {
2541     case EQ:  res = IS_SET (Z);    break;
2542     case NE:  res = IS_CLEAR (Z);  break;
2543     case CS:  res = IS_SET (C);    break;
2544     case CC:  res = IS_CLEAR (C);  break;
2545     case MI:  res = IS_SET (N);    break;
2546     case PL:  res = IS_CLEAR (N);  break;
2547     case VS:  res = IS_SET (V);    break;
2548     case VC:  res = IS_CLEAR (V);  break;
2549     case HI:  res = IS_SET (C) && IS_CLEAR (Z);  break;
2550     case LS:  res = IS_CLEAR (C) || IS_SET (Z);  break;
2551     case GE:  res = IS_SET (N) == IS_SET (V);    break;
2552     case LT:  res = IS_SET (N) != IS_SET (V);    break;
2553     case GT:  res = IS_CLEAR (Z) && (IS_SET (N) == IS_SET (V));  break;
2554     case LE:  res = IS_SET (Z) || (IS_SET (N) != IS_SET (V));    break;
2555     case AL:
2556     case NV:
2557     default:
2558       res = 1;
2559       break;
2560     }
2561   return res;
2562 }
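
/* Illustrative only (editorial sketch, not used by the simulator):
   the switch above can be folded using the architectural rule that
   each odd condition code, apart from NV, is the inverse of the
   preceding even one.  A minimal version, assuming the same IS_SET
   helpers and CondCode values:  */
#if 0
static uint32_t
test_condition_code_folded (sim_cpu *cpu, CondCode cc)
{
  uint32_t n = IS_SET (N), z = IS_SET (Z), c = IS_SET (C), v = IS_SET (V);
  uint32_t res;

  switch (cc >> 1)
    {
    case 0: res = z; break;                /* EQ/NE.  */
    case 1: res = c; break;                /* CS/CC.  */
    case 2: res = n; break;                /* MI/PL.  */
    case 3: res = v; break;                /* VS/VC.  */
    case 4: res = c && !z; break;          /* HI/LS.  */
    case 5: res = n == v; break;           /* GE/LT.  */
    case 6: res = !z && (n == v); break;   /* GT/LE.  */
    default: res = 1; break;               /* AL/NV.  */
    }

  /* Invert for the odd codes, except NV which always holds.  */
  if ((cc & 1) && cc != NV)
    res = ! res;
  return res;
}
#endif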
2563 
2564 static void
2565 CondCompare (sim_cpu *cpu) /* aka: ccmp and ccmn  */
2566 {
2567   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2568      instr[30]    = compare with positive (1) or negative value (0)
2569      instr[29,21] = 1 1101 0010
2570      instr[20,16] = Rm or const
2571      instr[15,12] = cond
2572      instr[11]    = compare reg (0) or const (1)
2573      instr[10]    = 0
2574      instr[9,5]   = Rn
2575      instr[4]     = 0
2576      instr[3,0]   = value for CPSR bits if the comparison does not take place.  */
2577   signed int negate;
2578   unsigned rm;
2579   unsigned rn;
2580 
2581   NYI_assert (29, 21, 0x1d2);
2582   NYI_assert (10, 10, 0);
2583   NYI_assert (4, 4, 0);
2584 
2585   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2586   if (! testConditionCode (cpu, INSTR (15, 12)))
2587     {
2588       aarch64_set_CPSR (cpu, INSTR (3, 0));
2589       return;
2590     }
2591 
2592   negate = INSTR (30, 30) ? 1 : -1;
2593   rm = INSTR (20, 16);
2594   rn = INSTR ( 9,  5);
2595 
2596   if (INSTR (31, 31))
2597     {
2598       if (INSTR (11, 11))
2599 	set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK),
2600 			     negate * (uint64_t) rm);
2601       else
2602 	set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK),
2603 			     negate * aarch64_get_reg_u64 (cpu, rm, SP_OK));
2604     }
2605   else
2606     {
2607       if (INSTR (11, 11))
2608 	set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK),
2609 			     negate * rm);
2610       else
2611 	set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK),
2612 			     negate * aarch64_get_reg_u32 (cpu, rm, SP_OK));
2613     }
2614 }
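
/* Example: CCMP X1, #4, #0, GE first evaluates GE; if it fails the
   NZCV bits are simply set to the immediate 0, otherwise they are
   set as for SUBS XZR, X1, #4.  CCMN behaves the same with the
   second operand negated.  */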
2615 
2616 static void
2617 do_vec_MOV_whole_vector (sim_cpu *cpu)
2618 {
2619   /* MOV Vd.T, Vs.T  (alias for ORR Vd.T, Vn.T, Vm.T where Vn == Vm)
2620 
2621      instr[31]    = 0
2622      instr[30]    = half(0)/full(1)
2623      instr[29,21] = 001110101
2624      instr[20,16] = Vs
2625      instr[15,10] = 000111
2626      instr[9,5]   = Vs
2627      instr[4,0]   = Vd  */
2628 
2629   unsigned vs = INSTR (9, 5);
2630   unsigned vd = INSTR (4, 0);
2631 
2632   NYI_assert (29, 21, 0x075);
2633   NYI_assert (15, 10, 0x07);
2634 
2635   if (INSTR (20, 16) != vs)
2636     HALT_NYI;
2637 
2638   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2639   if (INSTR (30, 30))
2640     aarch64_set_vec_u64 (cpu, vd, 1, aarch64_get_vec_u64 (cpu, vs, 1));
2641 
2642   aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vs, 0));
2643 }
2644 
2645 static void
2646 do_vec_MOV_into_scalar (sim_cpu *cpu)
2647 {
2648   /* instr[31]    = 0
2649      instr[30]    = word(0)/long(1)
2650      instr[29,21] = 00 1110 000
2651      instr[20,18] = element size and index
2652      instr[17,10] = 00 0011 11
2653      instr[9,5]   = V source
2654      instr[4,0]   = R dest  */
2655 
2656   unsigned vs = INSTR (9, 5);
2657   unsigned rd = INSTR (4, 0);
2658 
2659   NYI_assert (29, 21, 0x070);
2660   NYI_assert (17, 10, 0x0F);
2661 
2662   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2663   switch (INSTR (20, 18))
2664     {
2665     case 0x2:
2666       aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, vs, 0));
2667       break;
2668 
2669     case 0x6:
2670       aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, vs, 1));
2671       break;
2672 
2673     case 0x1:
2674     case 0x3:
2675     case 0x5:
2676     case 0x7:
2677       aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u32
2678 			   (cpu, vs, INSTR (20, 19)));
2679       break;
2680 
2681     default:
2682       HALT_NYI;
2683     }
2684 }
2685 
2686 static void
2687 do_vec_INS (sim_cpu *cpu)
2688 {
2689   /* instr[31,21] = 01001110000
2690      instr[20,16] = element size and index
2691      instr[15,10] = 000111
2692      instr[9,5]   = W source
2693      instr[4,0]   = V dest  */
2694 
2695   int index;
2696   unsigned rs = INSTR (9, 5);
2697   unsigned vd = INSTR (4, 0);
2698 
2699   NYI_assert (31, 21, 0x270);
2700   NYI_assert (15, 10, 0x07);
2701 
2702   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2703   if (INSTR (16, 16))
2704     {
2705       index = INSTR (20, 17);
2706       aarch64_set_vec_u8 (cpu, vd, index,
2707 			  aarch64_get_reg_u8 (cpu, rs, NO_SP));
2708     }
2709   else if (INSTR (17, 17))
2710     {
2711       index = INSTR (20, 18);
2712       aarch64_set_vec_u16 (cpu, vd, index,
2713 			   aarch64_get_reg_u16 (cpu, rs, NO_SP));
2714     }
2715   else if (INSTR (18, 18))
2716     {
2717       index = INSTR (20, 19);
2718       aarch64_set_vec_u32 (cpu, vd, index,
2719 			   aarch64_get_reg_u32 (cpu, rs, NO_SP));
2720     }
2721   else if (INSTR (19, 19))
2722     {
2723       index = INSTR (20, 20);
2724       aarch64_set_vec_u64 (cpu, vd, index,
2725 			   aarch64_get_reg_u64 (cpu, rs, NO_SP));
2726     }
2727   else
2728     HALT_NYI;
2729 }
2730 
2731 static void
2732 do_vec_DUP_vector_into_vector (sim_cpu *cpu)
2733 {
2734   /* instr[31]    = 0
2735      instr[30]    = half(0)/full(1)
2736      instr[29,21] = 00 1110 000
2737      instr[20,16] = element size and index
2738      instr[15,10] = 0000 01
2739      instr[9,5]   = V source
2740      instr[4,0]   = V dest.  */
2741 
2742   unsigned full = INSTR (30, 30);
2743   unsigned vs = INSTR (9, 5);
2744   unsigned vd = INSTR (4, 0);
2745   int i, index;
2746 
2747   NYI_assert (29, 21, 0x070);
2748   NYI_assert (15, 10, 0x01);
2749 
2750   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2751   if (INSTR (16, 16))
2752     {
2753       index = INSTR (20, 17);
2754 
2755       for (i = 0; i < (full ? 16 : 8); i++)
2756 	aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vs, index));
2757     }
2758   else if (INSTR (17, 17))
2759     {
2760       index = INSTR (20, 18);
2761 
2762       for (i = 0; i < (full ? 8 : 4); i++)
2763 	aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vs, index));
2764     }
2765   else if (INSTR (18, 18))
2766     {
2767       index = INSTR (20, 19);
2768 
2769       for (i = 0; i < (full ? 4 : 2); i++)
2770 	aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vs, index));
2771     }
2772   else
2773     {
2774       if (INSTR (19, 19) == 0)
2775 	HALT_UNALLOC;
2776 
2777       if (! full)
2778 	HALT_UNALLOC;
2779 
2780       index = INSTR (20, 20);
2781 
2782       for (i = 0; i < 2; i++)
2783 	aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vs, index));
2784     }
2785 }
2786 
2787 static void
2788 do_vec_TBL (sim_cpu *cpu)
2789 {
2790   /* instr[31]    = 0
2791      instr[30]    = half(0)/full(1)
2792      instr[29,21] = 00 1110 000
2793      instr[20,16] = Vm
2794      instr[15]    = 0
2795      instr[14,13] = vec length
2796      instr[12,10] = 000
2797      instr[9,5]   = V start
2798      instr[4,0]   = V dest  */
2799 
2800   int full    = INSTR (30, 30);
2801   int len     = INSTR (14, 13) + 1;
2802   unsigned vm = INSTR (20, 16);
2803   unsigned vn = INSTR (9, 5);
2804   unsigned vd = INSTR (4, 0);
2805   unsigned i;
2806 
2807   NYI_assert (29, 21, 0x070);
2808   NYI_assert (12, 10, 0);
2809 
2810   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2811   for (i = 0; i < (full ? 16 : 8); i++)
2812     {
2813       unsigned int selector = aarch64_get_vec_u8 (cpu, vm, i);
2814       uint8_t val;
2815 
2816       if (selector < 16)
2817 	val = aarch64_get_vec_u8 (cpu, vn, selector);
2818       else if (selector < 32)
2819 	val = len < 2 ? 0 : aarch64_get_vec_u8 (cpu, vn + 1, selector - 16);
2820       else if (selector < 48)
2821 	val = len < 3 ? 0 : aarch64_get_vec_u8 (cpu, vn + 2, selector - 32);
2822       else if (selector < 64)
2823 	val = len < 4 ? 0 : aarch64_get_vec_u8 (cpu, vn + 3, selector - 48);
2824       else
2825 	val = 0;
2826 
2827       aarch64_set_vec_u8 (cpu, vd, i, val);
2828     }
2829 }
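
/* Example: with len == 2 the table is the 32 bytes of vn and vn + 1;
   a selector byte of 17 fetches byte 1 of vn + 1, while any selector
   of 32 or more is out of range for this table and yields 0.  */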
2830 
2831 static void
2832 do_vec_TRN (sim_cpu *cpu)
2833 {
2834   /* instr[31]    = 0
2835      instr[30]    = half(0)/full(1)
2836      instr[29,24] = 00 1110
2837      instr[23,22] = size
2838      instr[21]    = 0
2839      instr[20,16] = Vm
2840      instr[15]    = 0
2841      instr[14]    = TRN1 (0) / TRN2 (1)
2842      instr[13,10] = 1010
2843      instr[9,5]   = V source
2844      instr[4,0]   = V dest.  */
2845 
2846   int full    = INSTR (30, 30);
2847   int second  = INSTR (14, 14);
2848   unsigned vm = INSTR (20, 16);
2849   unsigned vn = INSTR (9, 5);
2850   unsigned vd = INSTR (4, 0);
2851   unsigned i;
2852 
2853   NYI_assert (29, 24, 0x0E);
2854   NYI_assert (13, 10, 0xA);
2855 
2856   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2857   switch (INSTR (23, 22))
2858     {
2859     case 0:
2860       for (i = 0; i < (full ? 8 : 4); i++)
2861 	{
2862 	  aarch64_set_vec_u8
2863 	    (cpu, vd, i * 2,
2864 	     aarch64_get_vec_u8 (cpu, second ? vm : vn, i * 2));
2865 	  aarch64_set_vec_u8
2866 	    (cpu, vd, i * 2 + 1,
2867 	     aarch64_get_vec_u8 (cpu, second ? vn : vm, i * 2 + 1));
2868 	}
2869       break;
2870 
2871     case 1:
2872       for (i = 0; i < (full ? 4 : 2); i++)
2873 	{
2874 	  aarch64_set_vec_u16
2875 	    (cpu, vd, i * 2,
2876 	     aarch64_get_vec_u16 (cpu, second ? vm : vn, i * 2));
2877 	  aarch64_set_vec_u16
2878 	    (cpu, vd, i * 2 + 1,
2879 	     aarch64_get_vec_u16 (cpu, second ? vn : vm, i * 2 + 1));
2880 	}
2881       break;
2882 
2883     case 2:
2884       aarch64_set_vec_u32
2885 	(cpu, vd, 0, aarch64_get_vec_u32 (cpu, second ? vm : vn, 0));
2886       aarch64_set_vec_u32
2887 	(cpu, vd, 1, aarch64_get_vec_u32 (cpu, second ? vn : vm, 1));
2888       aarch64_set_vec_u32
2889 	(cpu, vd, 2, aarch64_get_vec_u32 (cpu, second ? vm : vn, 2));
2890       aarch64_set_vec_u32
2891 	(cpu, vd, 3, aarch64_get_vec_u32 (cpu, second ? vn : vm, 3));
2892       break;
2893 
2894     case 3:
2895       if (! full)
2896 	HALT_UNALLOC;
2897 
2898       aarch64_set_vec_u64 (cpu, vd, 0,
2899 			   aarch64_get_vec_u64 (cpu, second ? vm : vn, 0));
2900       aarch64_set_vec_u64 (cpu, vd, 1,
2901 			   aarch64_get_vec_u64 (cpu, second ? vn : vm, 1));
2902       break;
2903     }
2904 }
2905 
2906 static void
2907 do_vec_DUP_scalar_into_vector (sim_cpu *cpu)
2908 {
2909   /* instr[31]    = 0
2910      instr[30]    = 0=> zero top 64-bits, 1=> duplicate into top 64-bits
2911                     [must be 1 for 64-bit xfer]
2912      instr[29,20] = 00 1110 0000
2913      instr[19,16] = element size: 0001=> 8-bits, 0010=> 16-bits,
2914                                   0100=> 32-bits, 1000=> 64-bits
2915      instr[15,10] = 0000 11
2916      instr[9,5]   = W source
2917      instr[4,0]   = V dest.  */
2918 
2919   unsigned i;
2920   unsigned Vd = INSTR (4, 0);
2921   unsigned Rs = INSTR (9, 5);
2922   int both    = INSTR (30, 30);
2923 
2924   NYI_assert (29, 20, 0x0E0);
2925   NYI_assert (15, 10, 0x03);
2926 
2927   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2928   switch (INSTR (19, 16))
2929     {
2930     case 1:
2931       for (i = 0; i < (both ? 16 : 8); i++)
2932 	aarch64_set_vec_u8 (cpu, Vd, i, aarch64_get_reg_u8 (cpu, Rs, NO_SP));
2933       break;
2934 
2935     case 2:
2936       for (i = 0; i < (both ? 8 : 4); i++)
2937 	aarch64_set_vec_u16 (cpu, Vd, i, aarch64_get_reg_u16 (cpu, Rs, NO_SP));
2938       break;
2939 
2940     case 4:
2941       for (i = 0; i < (both ? 4 : 2); i++)
2942 	aarch64_set_vec_u32 (cpu, Vd, i, aarch64_get_reg_u32 (cpu, Rs, NO_SP));
2943       break;
2944 
2945     case 8:
2946       if (!both)
2947 	HALT_NYI;
2948       aarch64_set_vec_u64 (cpu, Vd, 0, aarch64_get_reg_u64 (cpu, Rs, NO_SP));
2949       aarch64_set_vec_u64 (cpu, Vd, 1, aarch64_get_reg_u64 (cpu, Rs, NO_SP));
2950       break;
2951 
2952     default:
2953       HALT_NYI;
2954     }
2955 }
2956 
2957 static void
2958 do_vec_UZP (sim_cpu *cpu)
2959 {
2960   /* instr[31]    = 0
2961      instr[30]    = half(0)/full(1)
2962      instr[29,24] = 00 1110
2963      instr[23,22] = size: byte(00), half(01), word (10), long (11)
2964      instr[21]    = 0
2965      instr[20,16] = Vm
2966      instr[15]    = 0
2967      instr[14]    = lower (0) / upper (1)
2968      instr[13,10] = 0110
2969      instr[9,5]   = Vn
2970      instr[4,0]   = Vd.  */
2971 
2972   int full = INSTR (30, 30);
2973   int upper = INSTR (14, 14);
2974 
2975   unsigned vm = INSTR (20, 16);
2976   unsigned vn = INSTR (9, 5);
2977   unsigned vd = INSTR (4, 0);
2978 
2979   uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
2980   uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
2981   uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
2982   uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
2983 
2984   uint64_t val1 = 0;
2985   uint64_t val2 = 0;
2986 
2987   uint64_t input1 = upper ? val_n1 : val_m1;
2988   uint64_t input2 = upper ? val_n2 : val_m2;
2989   unsigned i;
2990 
2991   NYI_assert (29, 24, 0x0E);
2992   NYI_assert (21, 21, 0);
2993   NYI_assert (15, 15, 0);
2994   NYI_assert (13, 10, 6);
2995 
2996   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2997   switch (INSTR (23, 22))
2998     {
2999     case 0:
3000       for (i = 0; i < 8; i++)
3001 	{
3002 	  val1 |= (input1 >> (i * 8)) & (0xFFULL << (i * 8));
3003 	  val2 |= (input2 >> (i * 8)) & (0xFFULL << (i * 8));
3004 	}
3005       break;
3006 
3007     case 1:
3008       for (i = 0; i < 4; i++)
3009 	{
3010 	  val1 |= (input1 >> (i * 16)) & (0xFFFFULL << (i * 16));
3011 	  val2 |= (input2 >> (i * 16)) & (0xFFFFULL << (i * 16));
3012 	}
3013       break;
3014 
3015     case 2:
3016       val1 = ((input1 & 0xFFFFFFFF) | ((input1 >> 32) & 0xFFFFFFFF00000000ULL));
3017       val2 = ((input2 & 0xFFFFFFFF) | ((input2 >> 32) & 0xFFFFFFFF00000000ULL));
3018       break;
3019     case 3:
3020       val1 = input1;
3021       val2 = input2;
3022       break;
3023     }
3024 
3025   aarch64_set_vec_u64 (cpu, vd, 0, val1);
3026   if (full)
3027     aarch64_set_vec_u64 (cpu, vd, 1, val2);
3028 }
3029 
3030 static void
3031 do_vec_ZIP (sim_cpu *cpu)
3032 {
3033   /* instr[31]    = 0
3034      instr[30]    = half(0)/full(1)
3035      instr[29,24] = 00 1110
3036      instr[23,22] = size: byte(00), half(01), word (10), long (11)
3037      instr[21]    = 0
3038      instr[20,16] = Vm
3039      instr[15]    = 0
3040      instr[14]    = lower (0) / upper (1)
3041      instr[13,10] = 1110
3042      instr[9,5]   = Vn
3043      instr[4,0]   = Vd.  */
3044 
3045   int full = INSTR (30, 30);
3046   int upper = INSTR (14, 14);
3047 
3048   unsigned vm = INSTR (20, 16);
3049   unsigned vn = INSTR (9, 5);
3050   unsigned vd = INSTR (4, 0);
3051 
3052   uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
3053   uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
3054   uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
3055   uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
3056 
3057   uint64_t val1 = 0;
3058   uint64_t val2 = 0;
3059 
3060   uint64_t input1 = upper ? val_n2 : val_n1;
3061   uint64_t input2 = upper ? val_m2 : val_m1;
3062 
3063   NYI_assert (29, 24, 0x0E);
3064   NYI_assert (21, 21, 0);
3065   NYI_assert (15, 15, 0);
3066   NYI_assert (13, 10, 0xE);
3067 
3068   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3069   switch (INSTR (23, 22))
3070     {
3071     case 0:
3072       val1 =
3073 	  ((input1 <<  0) & (0xFF    <<  0))
3074 	| ((input2 <<  8) & (0xFF    <<  8))
3075 	| ((input1 <<  8) & (0xFF    << 16))
3076 	| ((input2 << 16) & (0xFF    << 24))
3077 	| ((input1 << 16) & (0xFFULL << 32))
3078 	| ((input2 << 24) & (0xFFULL << 40))
3079 	| ((input1 << 24) & (0xFFULL << 48))
3080 	| ((input2 << 32) & (0xFFULL << 56));
3081 
3082       val2 =
3083 	  ((input1 >> 32) & (0xFF    <<  0))
3084 	| ((input2 >> 24) & (0xFF    <<  8))
3085 	| ((input1 >> 24) & (0xFF    << 16))
3086 	| ((input2 >> 16) & (0xFF    << 24))
3087 	| ((input1 >> 16) & (0xFFULL << 32))
3088 	| ((input2 >>  8) & (0xFFULL << 40))
3089 	| ((input1 >>  8) & (0xFFULL << 48))
3090 	| ((input2 >>  0) & (0xFFULL << 56));
3091       break;
3092 
3093     case 1:
3094       val1 =
3095 	  ((input1 <<  0) & (0xFFFF    <<  0))
3096 	| ((input2 << 16) & (0xFFFF    << 16))
3097 	| ((input1 << 16) & (0xFFFFULL << 32))
3098 	| ((input2 << 32) & (0xFFFFULL << 48));
3099 
3100       val2 =
3101 	  ((input1 >> 32) & (0xFFFF    <<  0))
3102 	| ((input2 >> 16) & (0xFFFF    << 16))
3103 	| ((input1 >> 16) & (0xFFFFULL << 32))
3104 	| ((input2 >>  0) & (0xFFFFULL << 48));
3105       break;
3106 
3107     case 2:
3108       val1 = (input1 & 0xFFFFFFFFULL) | (input2 << 32);
3109       val2 = (input1 >> 32) | (input2 & 0xFFFFFFFF00000000ULL);
3110       break;
3111 
3112     case 3:
3113       val1 = input1;
3114       val2 = input2;
3115       break;
3116     }
3117 
3118   aarch64_set_vec_u64 (cpu, vd, 0, val1);
3119   if (full)
3120     aarch64_set_vec_u64 (cpu, vd, 1, val2);
3121 }
3122 
3123 /* Floating point immediates are encoded in 8 bits.
3124    fpimm[7] = sign bit.
3125    fpimm[6:4] = signed exponent.
3126    fpimm[3:0] = fraction (assuming leading 1).
3127    i.e. F = s * 1.f * 2^(e - b).  */
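
/* Worked example: imm8 == 0x70 has s = 0, e = 7, f = 0, which the
   helpers below expand to 1.0; imm8 == 0x3F (s = 0, e = 3, f = 15)
   expands to 31.0, the largest encodable magnitude.  */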
3128 
3129 static float
3130 fp_immediate_for_encoding_32 (uint32_t imm8)
3131 {
3132   float u;
3133   uint32_t s, e, f, i;
3134 
3135   s = (imm8 >> 7) & 0x1;
3136   e = (imm8 >> 4) & 0x7;
3137   f = imm8 & 0xf;
3138 
3139   /* The fp value is s * (16+f)/16 * 2^e, with e signed.  */
3140   u = (16.0 + f) / 16.0;
3141 
3142   /* N.B. exponent is signed.  */
3143   if (e < 4)
3144     {
3145       int epos = e;
3146 
3147       for (i = 0; i <= epos; i++)
3148 	u *= 2.0;
3149     }
3150   else
3151     {
3152       int eneg = 7 - e;
3153 
3154       for (i = 0; i < eneg; i++)
3155 	u /= 2.0;
3156     }
3157 
3158   if (s)
3159     u = - u;
3160 
3161   return u;
3162 }
3163 
3164 static double
3165 fp_immediate_for_encoding_64 (uint32_t imm8)
3166 {
3167   double u;
3168   uint32_t s, e, f, i;
3169 
3170   s = (imm8 >> 7) & 0x1;
3171   e = (imm8 >> 4) & 0x7;
3172   f = imm8 & 0xf;
3173 
3174   /* The fp value is s * (16+f)/16 * 2^e, with e signed.  */
3175   u = (16.0 + f) / 16.0;
3176 
3177   /* N.B. exponent is signed.  */
3178   if (e < 4)
3179     {
3180       int epos = e;
3181 
3182       for (i = 0; i <= epos; i++)
3183 	u *= 2.0;
3184     }
3185   else
3186     {
3187       int eneg = 7 - e;
3188 
3189       for (i = 0; i < eneg; i++)
3190 	u /= 2.0;
3191     }
3192 
3193   if (s)
3194     u = - u;
3195 
3196   return u;
3197 }
3198 
3199 static void
3200 do_vec_MOV_immediate (sim_cpu *cpu)
3201 {
3202   /* instr[31]    = 0
3203      instr[30]    = full/half selector
3204      instr[29,19] = 00111100000
3205      instr[18,16] = high 3 bits of uimm8
3206      instr[15,12] = size & shift:
3207                                   0000 => 32-bit
3208                                   0010 => 32-bit + LSL#8
3209                                   0100 => 32-bit + LSL#16
3210                                   0110 => 32-bit + LSL#24
3211                                   1010 => 16-bit + LSL#8
3212                                   1000 => 16-bit
3213                                   1101 => 32-bit + MSL#16
3214                                   1100 => 32-bit + MSL#8
3215                                   1110 => 8-bit
3216                                   1111 => double
3217      instr[11,10] = 01
3218      instr[9,5]   = low 5-bits of uimm8
3219      instr[4,0]   = Vd.  */
3220 
3221   int full     = INSTR (30, 30);
3222   unsigned vd  = INSTR (4, 0);
3223   unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
3224   unsigned i;
3225 
3226   NYI_assert (29, 19, 0x1E0);
3227   NYI_assert (11, 10, 1);
3228 
3229   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3230   switch (INSTR (15, 12))
3231     {
3232     case 0x0: /* 32-bit, no shift.  */
3233     case 0x2: /* 32-bit, shift by 8.  */
3234     case 0x4: /* 32-bit, shift by 16.  */
3235     case 0x6: /* 32-bit, shift by 24.  */
3236       val <<= (8 * INSTR (14, 13));
3237       for (i = 0; i < (full ? 4 : 2); i++)
3238 	aarch64_set_vec_u32 (cpu, vd, i, val);
3239       break;
3240 
3241     case 0xa: /* 16-bit, shift by 8.  */
3242       val <<= 8;
3243       /* Fall through.  */
3244     case 0x8: /* 16-bit, no shift.  */
3245       for (i = 0; i < (full ? 8 : 4); i++)
3246 	aarch64_set_vec_u16 (cpu, vd, i, val);
3247       break;
3248     case 0xd: /* 32-bit, mask shift by 16.  */
3249       val <<= 8;
3250       val |= 0xFF;
3251       /* Fall through.  */
3252     case 0xc: /* 32-bit, mask shift by 8. */
3253       val <<= 8;
3254       val |= 0xFF;
3255       for (i = 0; i < (full ? 4 : 2); i++)
3256 	aarch64_set_vec_u32 (cpu, vd, i, val);
3257       break;
3258 
3259     case 0xe: /* 8-bit, no shift.  */
3260       for (i = 0; i < (full ? 16 : 8); i++)
3261 	aarch64_set_vec_u8 (cpu, vd, i, val);
3262       break;
3263 
3264     case 0xf: /* FMOV Vs.{2|4}S, #fpimm.  */
3265       {
3266 	float u = fp_immediate_for_encoding_32 (val);
3267 	for (i = 0; i < (full ? 4 : 2); i++)
3268 	  aarch64_set_vec_float (cpu, vd, i, u);
3269 	break;
3270       }
3271 
3272     default:
3273       HALT_NYI;
3274     }
3275 }
3276 
3277 static void
3278 do_vec_MVNI (sim_cpu *cpu)
3279 {
3280   /* instr[31]    = 0
3281      instr[30]    = full/half selector
3282      instr[29,19] = 10111100000
3283      instr[18,16] = high 3 bits of uimm8
3284      instr[15,12] = selector
3285      instr[11,10] = 01
3286      instr[9,5]   = low 5-bits of uimm8
3287      instr[4,0]   = Vd.  */
3288 
3289   int full     = INSTR (30, 30);
3290   unsigned vd  = INSTR (4, 0);
3291   unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
3292   unsigned i;
3293 
3294   NYI_assert (29, 19, 0x5E0);
3295   NYI_assert (11, 10, 1);
3296 
3297   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3298   switch (INSTR (15, 12))
3299     {
3300     case 0x0: /* 32-bit, no shift.  */
3301     case 0x2: /* 32-bit, shift by 8.  */
3302     case 0x4: /* 32-bit, shift by 16.  */
3303     case 0x6: /* 32-bit, shift by 24.  */
3304       val <<= (8 * INSTR (14, 13));
3305       val = ~ val;
3306       for (i = 0; i < (full ? 4 : 2); i++)
3307 	aarch64_set_vec_u32 (cpu, vd, i, val);
3308       return;
3309 
3310     case 0xa: /* 16-bit, 8 bit shift. */
3311       val <<= 8;
3312     case 0x8: /* 16-bit, no shift. */
3313       val = ~ val;
3314       for (i = 0; i < (full ? 8 : 4); i++)
3315 	aarch64_set_vec_u16 (cpu, vd, i, val);
3316       return;
3317 
3318     case 0xd: /* 32-bit, mask shift by 16.  */
3319       val <<= 8;
3320       val |= 0xFF;
3321     case 0xc: /* 32-bit, mask shift by 8. */
3322       val <<= 8;
3323       val |= 0xFF;
3324       val = ~ val;
3325       for (i = 0; i < (full ? 4 : 2); i++)
3326 	aarch64_set_vec_u32 (cpu, vd, i, val);
3327       return;
3328 
3329     case 0xE: /* MOVI Dn, #mask64 */
3330       {
3331 	uint64_t mask = 0;
3332 
3333 	for (i = 0; i < 8; i++)
3334 	  if (val & (1 << i))
3335 	    mask |= (0xFFULL << (i * 8));
3336 	aarch64_set_vec_u64 (cpu, vd, 0, mask);
3337 	aarch64_set_vec_u64 (cpu, vd, 1, mask);
3338 	return;
3339       }
3340 
3341     case 0xf: /* FMOV Vd.2D, #fpimm.  */
3342       {
3343 	double u = fp_immediate_for_encoding_64 (val);
3344 
3345 	if (! full)
3346 	  HALT_UNALLOC;
3347 
3348 	aarch64_set_vec_double (cpu, vd, 0, u);
3349 	aarch64_set_vec_double (cpu, vd, 1, u);
3350 	return;
3351       }
3352 
3353     default:
3354       HALT_NYI;
3355     }
3356 }
3357 
3358 #define ABS(A) ((A) < 0 ? - (A) : (A))
3359 
3360 static void
3361 do_vec_ABS (sim_cpu *cpu)
3362 {
3363   /* instr[31]    = 0
3364      instr[30]    = half(0)/full(1)
3365      instr[29,24] = 00 1110
3366      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3367      instr[21,10] = 10 0000 1011 10
3368      instr[9,5]   = Vn
3369      instr[4,0]   = Vd.  */
3370 
3371   unsigned vn = INSTR (9, 5);
3372   unsigned vd = INSTR (4, 0);
3373   unsigned full = INSTR (30, 30);
3374   unsigned i;
3375 
3376   NYI_assert (29, 24, 0x0E);
3377   NYI_assert (21, 10, 0x82E);
3378 
3379   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3380   switch (INSTR (23, 22))
3381     {
3382     case 0:
3383       for (i = 0; i < (full ? 16 : 8); i++)
3384 	aarch64_set_vec_s8 (cpu, vd, i,
3385 			    ABS (aarch64_get_vec_s8 (cpu, vn, i)));
3386       break;
3387 
3388     case 1:
3389       for (i = 0; i < (full ? 8 : 4); i++)
3390 	aarch64_set_vec_s16 (cpu, vd, i,
3391 			     ABS (aarch64_get_vec_s16 (cpu, vn, i)));
3392       break;
3393 
3394     case 2:
3395       for (i = 0; i < (full ? 4 : 2); i++)
3396 	aarch64_set_vec_s32 (cpu, vd, i,
3397 			     ABS (aarch64_get_vec_s32 (cpu, vn, i)));
3398       break;
3399 
3400     case 3:
3401       if (! full)
3402 	HALT_NYI;
3403       for (i = 0; i < 2; i++)
3404 	aarch64_set_vec_s64 (cpu, vd, i,
3405 			     ABS (aarch64_get_vec_s64 (cpu, vn, i)));
3406       break;
3407     }
3408 }
3409 
3410 static void
3411 do_vec_ADDV (sim_cpu *cpu)
3412 {
3413   /* instr[31]    = 0
3414      instr[30]    = full/half selector
3415      instr[29,24] = 00 1110
3416      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3417      instr[21,10] = 11 0001 1011 10
3418      instr[9,5]   = Vm
3419      instr[4,0]   = Rd.  */
3420 
3421   unsigned vm = INSTR (9, 5);
3422   unsigned rd = INSTR (4, 0);
3423   unsigned i;
3424   uint64_t val = 0;
3425   int      full = INSTR (30, 30);
3426 
3427   NYI_assert (29, 24, 0x0E);
3428   NYI_assert (21, 10, 0xC6E);
3429 
3430   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3431   switch (INSTR (23, 22))
3432     {
3433     case 0:
3434       for (i = 0; i < (full ? 16 : 8); i++)
3435 	val += aarch64_get_vec_u8 (cpu, vm, i);
3436       aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
3437       return;
3438 
3439     case 1:
3440       for (i = 0; i < (full ? 8 : 4); i++)
3441 	val += aarch64_get_vec_u16 (cpu, vm, i);
3442       aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
3443       return;
3444 
3445     case 2:
3446       for (i = 0; i < (full ? 4 : 2); i++)
3447 	val += aarch64_get_vec_u32 (cpu, vm, i);
3448       aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
3449       return;
3450 
3451     case 3:
3452       if (! full)
3453 	HALT_UNALLOC;
3454       val = aarch64_get_vec_u64 (cpu, vm, 0);
3455       val += aarch64_get_vec_u64 (cpu, vm, 1);
3456       aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
3457       return;
3458     }
3459 }
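
/* Example: an 8B ADDV of lanes holding 1,2,...,8 accumulates 36 into
   the destination; the sum is carried in 64 bits throughout.  */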
3460 
3461 static void
3462 do_vec_ins_2 (sim_cpu *cpu)
3463 {
3464   /* instr[31,21] = 01001110000
3465      instr[20,18] = size & element selector
3466      instr[17,14] = 0000
3467      instr[13]    = direction: to vec(0), from vec (1)
3468      instr[12,10] = 111
3469      instr[9,5]   = Vm
3470      instr[4,0]   = Vd.  */
3471 
3472   unsigned elem;
3473   unsigned vm = INSTR (9, 5);
3474   unsigned vd = INSTR (4, 0);
3475 
3476   NYI_assert (31, 21, 0x270);
3477   NYI_assert (17, 14, 0);
3478   NYI_assert (12, 10, 7);
3479 
3480   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3481   if (INSTR (13, 13) == 1)
3482     {
3483       if (INSTR (18, 18) == 1)
3484 	{
3485 	  /* 32-bit moves.  */
3486 	  elem = INSTR (20, 19);
3487 	  aarch64_set_reg_u64 (cpu, vd, NO_SP,
3488 			       aarch64_get_vec_u32 (cpu, vm, elem));
3489 	}
3490       else
3491 	{
3492 	  /* 64-bit moves.  */
3493 	  if (INSTR (19, 19) != 1)
3494 	    HALT_NYI;
3495 
3496 	  elem = INSTR (20, 20);
3497 	  aarch64_set_reg_u64 (cpu, vd, NO_SP,
3498 			       aarch64_get_vec_u64 (cpu, vm, elem));
3499 	}
3500     }
3501   else
3502     {
3503       if (INSTR (18, 18) == 1)
3504 	{
3505 	  /* 32-bit moves.  */
3506 	  elem = INSTR (20, 19);
3507 	  aarch64_set_vec_u32 (cpu, vd, elem,
3508 			       aarch64_get_reg_u32 (cpu, vm, NO_SP));
3509 	}
3510       else
3511 	{
3512 	  /* 64-bit moves.  */
3513 	  if (INSTR (19, 19) != 1)
3514 	    HALT_NYI;
3515 
3516 	  elem = INSTR (20, 20);
3517 	  aarch64_set_vec_u64 (cpu, vd, elem,
3518 			       aarch64_get_reg_u64 (cpu, vm, NO_SP));
3519 	}
3520     }
3521 }
3522 
3523 #define DO_VEC_WIDENING_MUL(N, DST_TYPE, READ_TYPE, WRITE_TYPE)	  \
3524   do								  \
3525     {								  \
3526       DST_TYPE a[N], b[N];					  \
3527 								  \
3528       for (i = 0; i < (N); i++)					  \
3529 	{							  \
3530 	  a[i] = aarch64_get_vec_##READ_TYPE (cpu, vn, i + bias); \
3531 	  b[i] = aarch64_get_vec_##READ_TYPE (cpu, vm, i + bias); \
3532 	}							  \
3533       for (i = 0; i < (N); i++)					  \
3534 	aarch64_set_vec_##WRITE_TYPE (cpu, vd, i, a[i] * b[i]);	  \
3535     }								  \
3536   while (0)
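
/* NB: DO_VEC_WIDENING_MUL buffers both source vectors into local arrays
   before writing any results, since Vd may be the same register as Vn
   or Vm.  The callers' `bias' variable, where non-zero, selects the
   upper-half source elements used by the second-part (xxx2) forms of
   the widening instructions.  */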
3537 
3538 static void
3539 do_vec_mull (sim_cpu *cpu)
3540 {
3541   /* instr[31]    = 0
3542      instr[30]    = lower(0)/upper(1) selector
3543      instr[29]    = signed(0)/unsigned(1)
3544      instr[28,24] = 0 1110
3545      instr[23,22] = size: 8-bit (00), 16-bit (01), 32-bit (10)
3546      instr[21]    = 1
3547      instr[20,16] = Vm
3548      instr[15,10] = 11 0000
3549      instr[9,5]   = Vn
3550      instr[4,0]   = Vd.  */
3551 
3552   int    unsign = INSTR (29, 29);
3553   int    bias = INSTR (30, 30);
3554   unsigned vm = INSTR (20, 16);
3555   unsigned vn = INSTR ( 9,  5);
3556   unsigned vd = INSTR ( 4,  0);
3557   unsigned i;
3558 
3559   NYI_assert (28, 24, 0x0E);
3560   NYI_assert (15, 10, 0x30);
3561 
3562   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3563   /* NB: Read source values before writing results, in case
3564      the source and destination vectors are the same.  */
3565   switch (INSTR (23, 22))
3566     {
3567     case 0:
3568       if (bias)
3569 	bias = 8;
3570       if (unsign)
3571 	DO_VEC_WIDENING_MUL (8, uint16_t, u8, u16);
3572       else
3573 	DO_VEC_WIDENING_MUL (8, int16_t, s8, s16);
3574       return;
3575 
3576     case 1:
3577       if (bias)
3578 	bias = 4;
3579       if (unsign)
3580 	DO_VEC_WIDENING_MUL (4, uint32_t, u16, u32);
3581       else
3582 	DO_VEC_WIDENING_MUL (4, int32_t, s16, s32);
3583       return;
3584 
3585     case 2:
3586       if (bias)
3587 	bias = 2;
3588       if (unsign)
3589 	DO_VEC_WIDENING_MUL (2, uint64_t, u32, u64);
3590       else
3591 	DO_VEC_WIDENING_MUL (2, int64_t, s32, s64);
3592       return;
3593 
3594     case 3:
3595       HALT_NYI;
3596     }
3597 }
3598 
3599 static void
3600 do_vec_fadd (sim_cpu *cpu)
3601 {
3602   /* instr[31]    = 0
3603      instr[30]    = half(0)/full(1)
3604      instr[29,24] = 001110
3605      instr[23]    = FADD(0)/FSUB(1)
3606      instr[22]    = float (0)/double(1)
3607      instr[21]    = 1
3608      instr[20,16] = Vm
3609      instr[15,10] = 110101
3610      instr[9,5]   = Vn
3611      instr[4,0]   = Vd.  */
3612 
3613   unsigned vm = INSTR (20, 16);
3614   unsigned vn = INSTR (9, 5);
3615   unsigned vd = INSTR (4, 0);
3616   unsigned i;
3617   int      full = INSTR (30, 30);
3618 
3619   NYI_assert (29, 24, 0x0E);
3620   NYI_assert (21, 21, 1);
3621   NYI_assert (15, 10, 0x35);
3622 
3623   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3624   if (INSTR (23, 23))
3625     {
3626       if (INSTR (22, 22))
3627 	{
3628 	  if (! full)
3629 	    HALT_NYI;
3630 
3631 	  for (i = 0; i < 2; i++)
3632 	    aarch64_set_vec_double (cpu, vd, i,
3633 				    aarch64_get_vec_double (cpu, vn, i)
3634 				    - aarch64_get_vec_double (cpu, vm, i));
3635 	}
3636       else
3637 	{
3638 	  for (i = 0; i < (full ? 4 : 2); i++)
3639 	    aarch64_set_vec_float (cpu, vd, i,
3640 				   aarch64_get_vec_float (cpu, vn, i)
3641 				   - aarch64_get_vec_float (cpu, vm, i));
3642 	}
3643     }
3644   else
3645     {
3646       if (INSTR (22, 22))
3647 	{
3648 	  if (! full)
3649 	    HALT_NYI;
3650 
3651 	  for (i = 0; i < 2; i++)
3652 	    aarch64_set_vec_double (cpu, vd, i,
3653 				    aarch64_get_vec_double (cpu, vm, i)
3654 				    + aarch64_get_vec_double (cpu, vn, i));
3655 	}
3656       else
3657 	{
3658 	  for (i = 0; i < (full ? 4 : 2); i++)
3659 	    aarch64_set_vec_float (cpu, vd, i,
3660 				   aarch64_get_vec_float (cpu, vm, i)
3661 				   + aarch64_get_vec_float (cpu, vn, i));
3662 	}
3663     }
3664 }
3665 
3666 static void
3667 do_vec_add (sim_cpu *cpu)
3668 {
3669   /* instr[31]    = 0
3670      instr[30]    = full/half selector
3671      instr[29,24] = 001110
3672      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3673      instr[21]    = 1
3674      instr[20,16] = Vm
3675      instr[15,10] = 100001
3676      instr[9,5]   = Vn
3677      instr[4,0]   = Vd.  */
3678 
3679   unsigned vm = INSTR (20, 16);
3680   unsigned vn = INSTR (9, 5);
3681   unsigned vd = INSTR (4, 0);
3682   unsigned i;
3683   int      full = INSTR (30, 30);
3684 
3685   NYI_assert (29, 24, 0x0E);
3686   NYI_assert (21, 21, 1);
3687   NYI_assert (15, 10, 0x21);
3688 
3689   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3690   switch (INSTR (23, 22))
3691     {
3692     case 0:
3693       for (i = 0; i < (full ? 16 : 8); i++)
3694 	aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
3695 			    + aarch64_get_vec_u8 (cpu, vm, i));
3696       return;
3697 
3698     case 1:
3699       for (i = 0; i < (full ? 8 : 4); i++)
3700 	aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
3701 			     + aarch64_get_vec_u16 (cpu, vm, i));
3702       return;
3703 
3704     case 2:
3705       for (i = 0; i < (full ? 4 : 2); i++)
3706 	aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
3707 			     + aarch64_get_vec_u32 (cpu, vm, i));
3708       return;
3709 
3710     case 3:
3711       if (! full)
3712 	HALT_UNALLOC;
3713       aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vn, 0)
3714 			   + aarch64_get_vec_u64 (cpu, vm, 0));
3715       aarch64_set_vec_u64 (cpu, vd, 1,
3716 			   aarch64_get_vec_u64 (cpu, vn, 1)
3717 			   + aarch64_get_vec_u64 (cpu, vm, 1));
3718       return;
3719     }
3720 }
3721 
3722 static void
3723 do_vec_mul (sim_cpu *cpu)
3724 {
3725   /* instr[31]    = 0
3726      instr[30]    = full/half selector
3727      instr[29,24] = 00 1110
3728      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
3729      instr[21]    = 1
3730      instr[20,16] = Vm
3731      instr[15,10] = 10 0111
3732      instr[9,5]   = Vn
3733      instr[4,0]   = Vd.  */
3734 
3735   unsigned vm = INSTR (20, 16);
3736   unsigned vn = INSTR (9, 5);
3737   unsigned vd = INSTR (4, 0);
3738   unsigned i;
3739   int      full = INSTR (30, 30);
3740   int      bias = 0;
3741 
3742   NYI_assert (29, 24, 0x0E);
3743   NYI_assert (21, 21, 1);
3744   NYI_assert (15, 10, 0x27);
3745 
3746   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* NB: MUL (vector) is a same-size multiply, not a widening one: the
     product is truncated to the element width on write-back, so the
     wider DST_TYPE of the helper macro only holds the intermediate
     product.  */
3747   switch (INSTR (23, 22))
3748     {
3749     case 0:
3750       DO_VEC_WIDENING_MUL (full ? 16 : 8, uint16_t, u8, u8);
3751       return;
3752 
3753     case 1:
3754       DO_VEC_WIDENING_MUL (full ? 8 : 4, uint32_t, u16, u16);
3755       return;
3756 
3757     case 2:
3758       DO_VEC_WIDENING_MUL (full ? 4 : 2, uint64_t, u32, u32);
3759       return;
3760 
3761     case 3:
3762       HALT_UNALLOC;
3763     }
3764 }
3765 
3766 static void
3767 do_vec_MLA (sim_cpu *cpu)
3768 {
3769   /* instr[31]    = 0
3770      instr[30]    = full/half selector
3771      instr[29,24] = 00 1110
3772      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
3773      instr[21]    = 1
3774      instr[20,16] = Vm
3775      instr[15,10] = 1001 01
3776      instr[9,5]   = Vn
3777      instr[4,0]   = Vd.  */
3778 
3779   unsigned vm = INSTR (20, 16);
3780   unsigned vn = INSTR (9, 5);
3781   unsigned vd = INSTR (4, 0);
3782   unsigned i;
3783   int      full = INSTR (30, 30);
3784 
3785   NYI_assert (29, 24, 0x0E);
3786   NYI_assert (21, 21, 1);
3787   NYI_assert (15, 10, 0x25);
3788 
3789   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* NB: MLA (vector) is a same-size multiply-accumulate: the product is
     truncated to the element width before being added into Vd.  */
3790   switch (INSTR (23, 22))
3791     {
3792     case 0:
3793       for (i = 0; i < (full ? 16 : 8); i++)
3794 	aarch64_set_vec_u8 (cpu, vd, i,
3795 			    aarch64_get_vec_u8 (cpu, vd, i)
3796 			    + (aarch64_get_vec_u8 (cpu, vn, i)
3797 			       * aarch64_get_vec_u8 (cpu, vm, i)));
3798       return;
3799 
3800     case 1:
3801       for (i = 0; i < (full ? 8 : 4); i++)
3802 	aarch64_set_vec_u16 (cpu, vd, i,
3803 			     aarch64_get_vec_u16 (cpu, vd, i)
3804 			     + (aarch64_get_vec_u16 (cpu, vn, i)
3805 				* aarch64_get_vec_u16 (cpu, vm, i)));
3806       return;
3807 
3808     case 2:
3809       for (i = 0; i < (full ? 4 : 2); i++)
3810 	aarch64_set_vec_u32 (cpu, vd, i,
3811 			     aarch64_get_vec_u32 (cpu, vd, i)
3812 			     + (aarch64_get_vec_u32 (cpu, vn, i)
3813 				* aarch64_get_vec_u32 (cpu, vm, i)));
3814       return;
3815 
3816     case 3:
3817       HALT_UNALLOC;
3818     }
3852 }
3853 
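/* Helpers for the FMAXNM/FMINNM family: the "NM" forms return the
   numerical operand when exactly one input is a NaN, and only return a
   NaN when both inputs are NaNs.  E.g. fmaxnm (NaN, 2.0f) == 2.0f.  */
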
3854 static float
3855 fmaxnm (float a, float b)
3856 {
3857   if (!isnan (a))
3858     {
3859       if (!isnan (b))
3860 	return a > b ? a : b;
3861       return a;
3862     }
3863   else if (!isnan (b))
3864     return b;
3865   return a;
3866 }
3867 
3868 static float
3869 fminnm (float a, float b)
3870 {
3871   if (!isnan (a))
3872     {
3873       if (!isnan (b))
3874 	return a < b ? a : b;
3875       return a;
3876     }
3877   else if (!isnan (b))
3878     return b;
3879   return a;
3880 }
3881 
3882 static double
3883 dmaxnm (double a, double b)
3884 {
3885   if (!isnan (a))
3886     {
3887       if (!isnan (b))
3888 	return a > b ? a : b;
3889       return a;
3890     }
3891   else if (!isnan (b))
3892     return b;
3893   return a;
3894 }
3895 
3896 static double
3897 dminnm (double a, double b)
3898 {
3899   if (!isnan (a))
3900     {
3901       if (!isnan (b))
3902 	return a < b ? a : b;
3903       return a;
3904     }
3905   else if (!isnan (b))
3906     return b;
3907   return a;
3908 }
3909 
3910 static void
3911 do_vec_FminmaxNMP (sim_cpu *cpu)
3912 {
3913   /* instr [31]    = 0
3914      instr [30]    = half (0)/full (1)
3915      instr [29,24] = 10 1110
3916      instr [23]    = max(0)/min(1)
3917      instr [22]    = float (0)/double (1)
3918      instr [21]    = 1
3919      instr [20,16] = Vn
3920      instr [15,10] = 1100 01
3921      instr [9,5]   = Vm
3922      instr [4,0]   = Vd.  */
3923 
3924   unsigned vm = INSTR (20, 16);
3925   unsigned vn = INSTR (9, 5);
3926   unsigned vd = INSTR (4, 0);
3927   int      full = INSTR (30, 30);
3928 
3929   NYI_assert (29, 24, 0x2E);
3930   NYI_assert (21, 21, 1);
3931   NYI_assert (15, 10, 0x31);
3932 
3933   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3934   if (INSTR (22, 22))
3935     {
3936       double (* fn)(double, double) = INSTR (23, 23)
3937 	? dminnm : dmaxnm;
3938 
3939       if (! full)
3940 	HALT_NYI;
3941       aarch64_set_vec_double (cpu, vd, 0,
3942 			      fn (aarch64_get_vec_double (cpu, vn, 0),
3943 				  aarch64_get_vec_double (cpu, vn, 1)));
3944       aarch64_set_vec_double (cpu, vd, 1,
3945 			      fn (aarch64_get_vec_double (cpu, vm, 0),
3946 				  aarch64_get_vec_double (cpu, vm, 1)));
3947     }
3948   else
3949     {
3950       float (* fn)(float, float) = INSTR (23, 23)
3951 	? fminnm : fmaxnm;
3952 
3953       aarch64_set_vec_float (cpu, vd, 0,
3954 			     fn (aarch64_get_vec_float (cpu, vn, 0),
3955 				 aarch64_get_vec_float (cpu, vn, 1)));
3956       if (full)
3957 	aarch64_set_vec_float (cpu, vd, 1,
3958 			       fn (aarch64_get_vec_float (cpu, vn, 2),
3959 				   aarch64_get_vec_float (cpu, vn, 3)));
3960 
3961       aarch64_set_vec_float (cpu, vd, (full ? 2 : 1),
3962 			     fn (aarch64_get_vec_float (cpu, vm, 0),
3963 				 aarch64_get_vec_float (cpu, vm, 1)));
3964       if (full)
3965 	aarch64_set_vec_float (cpu, vd, 3,
3966 			       fn (aarch64_get_vec_float (cpu, vm, 2),
3967 				   aarch64_get_vec_float (cpu, vm, 3)));
3968     }
3969 }
3970 
3971 static void
3972 do_vec_AND (sim_cpu *cpu)
3973 {
3974   /* instr[31]    = 0
3975      instr[30]    = half (0)/full (1)
3976      instr[29,21] = 001110001
3977      instr[20,16] = Vm
3978      instr[15,10] = 000111
3979      instr[9,5]   = Vn
3980      instr[4,0]   = Vd.  */
3981 
3982   unsigned vm = INSTR (20, 16);
3983   unsigned vn = INSTR (9, 5);
3984   unsigned vd = INSTR (4, 0);
3985   unsigned i;
3986   int      full = INSTR (30, 30);
3987 
3988   NYI_assert (29, 21, 0x071);
3989   NYI_assert (15, 10, 0x07);
3990 
3991   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3992   for (i = 0; i < (full ? 4 : 2); i++)
3993     aarch64_set_vec_u32 (cpu, vd, i,
3994 			 aarch64_get_vec_u32 (cpu, vn, i)
3995 			 & aarch64_get_vec_u32 (cpu, vm, i));
3996 }
3997 
3998 static void
3999 do_vec_BSL (sim_cpu *cpu)
4000 {
4001   /* instr[31]    = 0
4002      instr[30]    = half (0)/full (1)
4003      instr[29,21] = 101110011
4004      instr[20,16] = Vm
4005      instr[15,10] = 000111
4006      instr[9,5]   = Vn
4007      instr[4,0]   = Vd.  */
4008 
4009   unsigned vm = INSTR (20, 16);
4010   unsigned vn = INSTR (9, 5);
4011   unsigned vd = INSTR (4, 0);
4012   unsigned i;
4013   int      full = INSTR (30, 30);
4014 
4015   NYI_assert (29, 21, 0x173);
4016   NYI_assert (15, 10, 0x07);
4017 
4018   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4019   for (i = 0; i < (full ? 16 : 8); i++)
4020     aarch64_set_vec_u8 (cpu, vd, i,
4021 			(    aarch64_get_vec_u8 (cpu, vd, i)
4022 			   & aarch64_get_vec_u8 (cpu, vn, i))
4023 			| ((~ aarch64_get_vec_u8 (cpu, vd, i))
4024 			   & aarch64_get_vec_u8 (cpu, vm, i)));
4025 }
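
/* NB: in BSL the destination register supplies the bit mask: each
   result bit comes from Vn where the corresponding Vd bit is set and
   from Vm where it is clear, i.e. vd = (vd & vn) | (~vd & vm).  */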
4026 
4027 static void
4028 do_vec_EOR (sim_cpu *cpu)
4029 {
4030   /* instr[31]    = 0
4031      instr[30]    = half (0)/full (1)
4032      instr[29,21] = 10 1110 001
4033      instr[20,16] = Vm
4034      instr[15,10] = 000111
4035      instr[9,5]   = Vn
4036      instr[4,0]   = Vd.  */
4037 
4038   unsigned vm = INSTR (20, 16);
4039   unsigned vn = INSTR (9, 5);
4040   unsigned vd = INSTR (4, 0);
4041   unsigned i;
4042   int      full = INSTR (30, 30);
4043 
4044   NYI_assert (29, 21, 0x171);
4045   NYI_assert (15, 10, 0x07);
4046 
4047   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4048   for (i = 0; i < (full ? 4 : 2); i++)
4049     aarch64_set_vec_u32 (cpu, vd, i,
4050 			 aarch64_get_vec_u32 (cpu, vn, i)
4051 			 ^ aarch64_get_vec_u32 (cpu, vm, i));
4052 }
4053 
4054 static void
4055 do_vec_bit (sim_cpu *cpu)
4056 {
4057   /* instr[31]    = 0
4058      instr[30]    = half (0)/full (1)
4059      instr[29,23] = 10 1110 1
4060      instr[22]    = BIT (0) / BIF (1)
4061      instr[21]    = 1
4062      instr[20,16] = Vm
4063      instr[15,10] = 0001 11
4064      instr[9,5]   = Vn
4065      instr[4,0]   = Vd.  */
4066 
4067   unsigned vm = INSTR (20, 16);
4068   unsigned vn = INSTR (9, 5);
4069   unsigned vd = INSTR (4, 0);
4070   unsigned full = INSTR (30, 30);
4071   unsigned test_false = INSTR (22, 22);
4072   unsigned i;
4073 
4074   NYI_assert (29, 23, 0x5D);
4075   NYI_assert (21, 21, 1);
4076   NYI_assert (15, 10, 0x07);
4077 
4078   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4079   for (i = 0; i < (full ? 4 : 2); i++)
4080     {
4081       uint32_t vd_val = aarch64_get_vec_u32 (cpu, vd, i);
4082       uint32_t vn_val = aarch64_get_vec_u32 (cpu, vn, i);
4083       uint32_t vm_val = aarch64_get_vec_u32 (cpu, vm, i);
4084       /* BIT inserts the Vn bits where the corresponding Vm mask bit
4085 	 is set, BIF inserts them where the mask bit is clear.  */
4086       aarch64_set_vec_u32 (cpu, vd, i,
4087 			   test_false
4088 			   ? ((vd_val & vm_val) | (vn_val & ~vm_val))
4089 			   : ((vd_val & ~vm_val) | (vn_val & vm_val)));
4090     }
4091 }
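
/* NB: BIT/BIF take their bit mask from Vm, whereas BSL above takes it
   from the destination register Vd; all three are otherwise the same
   bitwise select operation.  */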
4092 
4093 static void
4094 do_vec_ORN (sim_cpu *cpu)
4095 {
4096   /* instr[31]    = 0
4097      instr[30]    = half (0)/full (1)
4098      instr[29,21] = 00 1110 111
4099      instr[20,16] = Vm
4100      instr[15,10] = 00 0111
4101      instr[9,5]   = Vn
4102      instr[4,0]   = Vd.  */
4103 
4104   unsigned vm = INSTR (20, 16);
4105   unsigned vn = INSTR (9, 5);
4106   unsigned vd = INSTR (4, 0);
4107   unsigned i;
4108   int      full = INSTR (30, 30);
4109 
4110   NYI_assert (29, 21, 0x077);
4111   NYI_assert (15, 10, 0x07);
4112 
4113   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4114   for (i = 0; i < (full ? 16 : 8); i++)
4115     aarch64_set_vec_u8 (cpu, vd, i,
4116 			aarch64_get_vec_u8 (cpu, vn, i)
4117 			| ~ aarch64_get_vec_u8 (cpu, vm, i));
4118 }
4119 
4120 static void
4121 do_vec_ORR (sim_cpu *cpu)
4122 {
4123   /* instr[31]    = 0
4124      instr[30]    = half (0)/full (1)
4125      instr[29,21] = 00 1110 101
4126      instr[20,16] = Vm
4127      instr[15,10] = 0001 11
4128      instr[9,5]   = Vn
4129      instr[4,0]   = Vd.  */
4130 
4131   unsigned vm = INSTR (20, 16);
4132   unsigned vn = INSTR (9, 5);
4133   unsigned vd = INSTR (4, 0);
4134   unsigned i;
4135   int      full = INSTR (30, 30);
4136 
4137   NYI_assert (29, 21, 0x075);
4138   NYI_assert (15, 10, 0x07);
4139 
4140   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4141   for (i = 0; i < (full ? 16 : 8); i++)
4142     aarch64_set_vec_u8 (cpu, vd, i,
4143 			aarch64_get_vec_u8 (cpu, vn, i)
4144 			| aarch64_get_vec_u8 (cpu, vm, i));
4145 }
4146 
4147 static void
4148 do_vec_BIC (sim_cpu *cpu)
4149 {
4150   /* instr[31]    = 0
4151      instr[30]    = half (0)/full (1)
4152      instr[29,21] = 00 1110 011
4153      instr[20,16] = Vm
4154      instr[15,10] = 00 0111
4155      instr[9,5]   = Vn
4156      instr[4,0]   = Vd.  */
4157 
4158   unsigned vm = INSTR (20, 16);
4159   unsigned vn = INSTR (9, 5);
4160   unsigned vd = INSTR (4, 0);
4161   unsigned i;
4162   int      full = INSTR (30, 30);
4163 
4164   NYI_assert (29, 21, 0x073);
4165   NYI_assert (15, 10, 0x07);
4166 
4167   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4168   for (i = 0; i < (full ? 16 : 8); i++)
4169     aarch64_set_vec_u8 (cpu, vd, i,
4170 			aarch64_get_vec_u8 (cpu, vn, i)
4171 			& ~ aarch64_get_vec_u8 (cpu, vm, i));
4172 }
4173 
4174 static void
4175 do_vec_XTN (sim_cpu *cpu)
4176 {
4177   /* instr[31]    = 0
4178      instr[30]    = first part (0)/ second part (1)
4179      instr[29,24] = 00 1110
4180      instr[23,22] = size: byte(00), half(01), word (10)
4181      instr[21,10] = 1000 0100 1010
4182      instr[9,5]   = Vs
4183      instr[4,0]   = Vd.  */
4184 
4185   unsigned vs = INSTR (9, 5);
4186   unsigned vd = INSTR (4, 0);
4187   unsigned bias = INSTR (30, 30);
4188   unsigned i;
4189 
4190   NYI_assert (29, 24, 0x0E);
4191   NYI_assert (21, 10, 0x84A);
4192 
4193   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* NB: XTN writes the truncated (least significant) half of each
     source element to Vd; the XTN2 form (bias set) writes into the
     upper half of Vd instead of the lower half.  */
4194   switch (INSTR (23, 22))
4195     {
4196     case 0:
4197       for (i = 0; i < 8; i++)
4198 	aarch64_set_vec_u8 (cpu, vd, i + (bias * 8),
4199 			    aarch64_get_vec_u16 (cpu, vs, i));
4200       return;
4201 
4202     case 1:
4203       for (i = 0; i < 4; i++)
4204 	aarch64_set_vec_u16 (cpu, vd, i + (bias * 4),
4205 			     aarch64_get_vec_u32 (cpu, vs, i));
4206       return;
4207 
4208     case 2:
4209       for (i = 0; i < 2; i++)
4210 	aarch64_set_vec_u32 (cpu, vd, i + (bias * 2),
4211 			     aarch64_get_vec_u64 (cpu, vs, i));
4212       return;
4213     }
4226 }
4227 
4228 static void
4229 do_vec_maxv (sim_cpu *cpu)
4230 {
4231   /* instr[31]    = 0
4232      instr[30]    = half(0)/full(1)
4233      instr[29]    = signed (0)/unsigned(1)
4234      instr[28,24] = 0 1110
4235      instr[23,22] = size: byte(00), half(01), word (10)
4236      instr[21]    = 1
4237      instr[20,17] = 1 000
4238      instr[16]    = max(0)/min(1)
4239      instr[15,10] = 1010 10
4240      instr[9,5]   = V source
4241      instr[4,0]   = R dest.  */
4242 
4243   unsigned vs = INSTR (9, 5);
4244   unsigned rd = INSTR (4, 0);
4245   unsigned full = INSTR (30, 30);
4246   unsigned i;
4247 
4248   NYI_assert (28, 24, 0x0E);
4249   NYI_assert (21, 21, 1);
4250   NYI_assert (20, 17, 8);
4251   NYI_assert (15, 10, 0x2A);
4252 
4253   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4254   switch ((INSTR (29, 29) << 1) | INSTR (16, 16))
4255     {
4256     case 0: /* SMAXV.  */
4257        {
4258 	int64_t smax;
4259 	switch (INSTR (23, 22))
4260 	  {
4261 	  case 0:
4262 	    smax = aarch64_get_vec_s8 (cpu, vs, 0);
4263 	    for (i = 1; i < (full ? 16 : 8); i++)
4264 	      smax = max (smax, aarch64_get_vec_s8 (cpu, vs, i));
4265 	    break;
4266 	  case 1:
4267 	    smax = aarch64_get_vec_s16 (cpu, vs, 0);
4268 	    for (i = 1; i < (full ? 8 : 4); i++)
4269 	      smax = max (smax, aarch64_get_vec_s16 (cpu, vs, i));
4270 	    break;
4271 	  case 2:
4272 	    smax = aarch64_get_vec_s32 (cpu, vs, 0);
4273 	    for (i = 1; i < (full ? 4 : 2); i++)
4274 	      smax = max (smax, aarch64_get_vec_s32 (cpu, vs, i));
4275 	    break;
4276 	  case 3:
4277 	    HALT_UNALLOC;
4278 	  }
4279 	aarch64_set_reg_s64 (cpu, rd, NO_SP, smax);
4280 	return;
4281       }
4282 
4283     case 1: /* SMINV.  */
4284       {
4285 	int64_t smin;
4286 	switch (INSTR (23, 22))
4287 	  {
4288 	  case 0:
4289 	    smin = aarch64_get_vec_s8 (cpu, vs, 0);
4290 	    for (i = 1; i < (full ? 16 : 8); i++)
4291 	      smin = min (smin, aarch64_get_vec_s8 (cpu, vs, i));
4292 	    break;
4293 	  case 1:
4294 	    smin = aarch64_get_vec_s16 (cpu, vs, 0);
4295 	    for (i = 1; i < (full ? 8 : 4); i++)
4296 	      smin = min (smin, aarch64_get_vec_s16 (cpu, vs, i));
4297 	    break;
4298 	  case 2:
4299 	    smin = aarch64_get_vec_s32 (cpu, vs, 0);
4300 	    for (i = 1; i < (full ? 4 : 2); i++)
4301 	      smin = min (smin, aarch64_get_vec_s32 (cpu, vs, i));
4302 	    break;
4303 
4304 	  case 3:
4305 	    HALT_UNALLOC;
4306 	  }
4307 	aarch64_set_reg_s64 (cpu, rd, NO_SP, smin);
4308 	return;
4309       }
4310 
4311     case 2: /* UMAXV.  */
4312       {
4313 	uint64_t umax;
4314 	switch (INSTR (23, 22))
4315 	  {
4316 	  case 0:
4317 	    umax = aarch64_get_vec_u8 (cpu, vs, 0);
4318 	    for (i = 1; i < (full ? 16 : 8); i++)
4319 	      umax = max (umax, aarch64_get_vec_u8 (cpu, vs, i));
4320 	    break;
4321 	  case 1:
4322 	    umax = aarch64_get_vec_u16 (cpu, vs, 0);
4323 	    for (i = 1; i < (full ? 8 : 4); i++)
4324 	      umax = max (umax, aarch64_get_vec_u16 (cpu, vs, i));
4325 	    break;
4326 	  case 2:
4327 	    umax = aarch64_get_vec_u32 (cpu, vs, 0);
4328 	    for (i = 1; i < (full ? 4 : 2); i++)
4329 	      umax = max (umax, aarch64_get_vec_u32 (cpu, vs, i));
4330 	    break;
4331 
4332 	  case 3:
4333 	    HALT_UNALLOC;
4334 	  }
4335 	aarch64_set_reg_u64 (cpu, rd, NO_SP, umax);
4336 	return;
4337       }
4338 
4339     case 3: /* UMINV.  */
4340       {
4341 	uint64_t umin;
4342 	switch (INSTR (23, 22))
4343 	  {
4344 	  case 0:
4345 	    umin = aarch64_get_vec_u8 (cpu, vs, 0);
4346 	    for (i = 1; i < (full ? 16 : 8); i++)
4347 	      umin = min (umin, aarch64_get_vec_u8 (cpu, vs, i));
4348 	    break;
4349 	  case 1:
4350 	    umin = aarch64_get_vec_u16 (cpu, vs, 0);
4351 	    for (i = 1; i < (full ? 8 : 4); i++)
4352 	      umin = min (umin, aarch64_get_vec_u16 (cpu, vs, i));
4353 	    break;
4354 	  case 2:
4355 	    umin = aarch64_get_vec_u32 (cpu, vs, 0);
4356 	    for (i = 1; i < (full ? 4 : 2); i++)
4357 	      umin = min (umin, aarch64_get_vec_u32 (cpu, vs, i));
4358 	    break;
4359 
4360 	  case 3:
4361 	    HALT_UNALLOC;
4362 	  }
4363 	aarch64_set_reg_u64 (cpu, rd, NO_SP, umin);
4364 	return;
4365       }
4366     }
4367 }
4368 
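
/* NB: the decode value formed above from instr[29] (signed/unsigned)
   and instr[16] (max/min) selects between SMAXV, SMINV, UMAXV and
   UMINV.  */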
4369 static void
4370 do_vec_fminmaxV (sim_cpu *cpu)
4371 {
4372   /* instr[31,24] = 0110 1110
4373      instr[23]    = max(0)/min(1)
4374      instr[22,14] = 011 0000 11
4375      instr[13,12] = nm(00)/normal(11)
4376      instr[11,10] = 10
4377      instr[9,5]   = V source
4378      instr[4,0]   = R dest.  */
4379 
4380   unsigned vs = INSTR (9, 5);
4381   unsigned rd = INSTR (4, 0);
4382   unsigned i;
4383   float res   = aarch64_get_vec_float (cpu, vs, 0);
4384 
4385   NYI_assert (31, 24, 0x6E);
4386   NYI_assert (22, 14, 0x0C3);
4387   NYI_assert (11, 10, 2);
4388 
4389   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4390   if (INSTR (23, 23))
4391     {
4392       switch (INSTR (13, 12))
4393 	{
4394 	case 0: /* FMINNMV.  */
4395 	  for (i = 1; i < 4; i++)
4396 	    res = fminnm (res, aarch64_get_vec_float (cpu, vs, i));
4397 	  break;
4398 
4399 	case 3: /* FMINV.  */
4400 	  for (i = 1; i < 4; i++)
4401 	    res = min (res, aarch64_get_vec_float (cpu, vs, i));
4402 	  break;
4403 
4404 	default:
4405 	  HALT_NYI;
4406 	}
4407     }
4408   else
4409     {
4410       switch (INSTR (13, 12))
4411 	{
4412 	case 0: /* FMAXNMV.  */
4413 	  for (i = 1; i < 4; i++)
4414 	    res = fmaxnm (res, aarch64_get_vec_float (cpu, vs, i));
4415 	  break;
4416 
4417 	case 3: /* FMAXV.  */
4418 	  for (i = 1; i < 4; i++)
4419 	    res = max (res, aarch64_get_vec_float (cpu, vs, i));
4420 	  break;
4421 
4422 	default:
4423 	  HALT_NYI;
4424 	}
4425     }
4426 
4427   aarch64_set_FP_float (cpu, rd, res);
4428 }
4429 
4430 static void
4431 do_vec_Fminmax (sim_cpu *cpu)
4432 {
4433   /* instr[31]    = 0
4434      instr[30]    = half(0)/full(1)
4435      instr[29,24] = 00 1110
4436      instr[23]    = max(0)/min(1)
4437      instr[22]    = float(0)/double(1)
4438      instr[21]    = 1
4439      instr[20,16] = Vm
4440      instr[15,14] = 11
4441      instr[13,12] = nm(00)/normal(11)
4442      instr[11,10] = 01
4443      instr[9,5]   = Vn
4444      instr[4,0]   = Vd.  */
4445 
4446   unsigned vm = INSTR (20, 16);
4447   unsigned vn = INSTR (9, 5);
4448   unsigned vd = INSTR (4, 0);
4449   unsigned full = INSTR (30, 30);
4450   unsigned min = INSTR (23, 23);
4451   unsigned i;
4452 
4453   NYI_assert (29, 24, 0x0E);
4454   NYI_assert (21, 21, 1);
4455   NYI_assert (15, 14, 3);
4456   NYI_assert (11, 10, 1);
4457 
4458   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4459   if (INSTR (22, 22))
4460     {
4461       double (* func)(double, double);
4462 
4463       if (! full)
4464 	HALT_NYI;
4465 
4466       if (INSTR (13, 12) == 0)
4467 	func = min ? dminnm : dmaxnm;
4468       else if (INSTR (13, 12) == 3)
4469 	func = min ? fmin : fmax;
4470       else
4471 	HALT_NYI;
4472 
4473       for (i = 0; i < 2; i++)
4474 	aarch64_set_vec_double (cpu, vd, i,
4475 				func (aarch64_get_vec_double (cpu, vn, i),
4476 				      aarch64_get_vec_double (cpu, vm, i)));
4477     }
4478   else
4479     {
4480       float (* func)(float, float);
4481 
4482       if (INSTR (13, 12) == 0)
4483 	func = min ? fminnm : fmaxnm;
4484       else if (INSTR (13, 12) == 3)
4485 	func = min ? fminf : fmaxf;
4486       else
4487 	HALT_NYI;
4488 
4489       for (i = 0; i < (full ? 4 : 2); i++)
4490 	aarch64_set_vec_float (cpu, vd, i,
4491 			       func (aarch64_get_vec_float (cpu, vn, i),
4492 				     aarch64_get_vec_float (cpu, vm, i)));
4493     }
4494 }
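
/* NB: C99 fmin/fmax (and fminf/fmaxf) return the numerical operand
   when the other argument is a NaN, which matches the FMINNM/FMAXNM
   semantics; the plain FMIN/FMAX instructions instead propagate a NaN
   input, so their handling above is only an approximation.  */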
4495 
4496 static void
4497 do_vec_SCVTF (sim_cpu *cpu)
4498 {
4499   /* instr[31]    = 0
4500      instr[30]    = Q
4501      instr[29,23] = 00 1110 0
4502      instr[22]    = float(0)/double(1)
4503      instr[21,10] = 10 0001 1101 10
4504      instr[9,5]   = Vn
4505      instr[4,0]   = Vd.  */
4506 
4507   unsigned vn = INSTR (9, 5);
4508   unsigned vd = INSTR (4, 0);
4509   unsigned full = INSTR (30, 30);
4510   unsigned size = INSTR (22, 22);
4511   unsigned i;
4512 
4513   NYI_assert (29, 23, 0x1C);
4514   NYI_assert (21, 10, 0x876);
4515 
4516   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4517   if (size)
4518     {
4519       if (! full)
4520 	HALT_UNALLOC;
4521 
4522       for (i = 0; i < 2; i++)
4523 	{
4524 	  double val = (double) aarch64_get_vec_s64 (cpu, vn, i);
4525 	  aarch64_set_vec_double (cpu, vd, i, val);
4526 	}
4527     }
4528   else
4529     {
4530       for (i = 0; i < (full ? 4 : 2); i++)
4531 	{
4532 	  float val = (float) aarch64_get_vec_s32 (cpu, vn, i);
4533 	  aarch64_set_vec_float (cpu, vd, i, val);
4534 	}
4535     }
4536 }
4537 
4538 #define VEC_CMP(SOURCE, CMP)						\
4539   do									\
4540     {									\
4541       switch (size)							\
4542 	{								\
4543 	case 0:								\
4544 	  for (i = 0; i < (full ? 16 : 8); i++)				\
4545 	    aarch64_set_vec_u8 (cpu, vd, i,				\
4546 				aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
4547 				CMP					\
4548 				aarch64_get_vec_##SOURCE##8 (cpu, vm, i) \
4549 				? -1 : 0);				\
4550 	  return;							\
4551 	case 1:								\
4552 	  for (i = 0; i < (full ? 8 : 4); i++)				\
4553 	    aarch64_set_vec_u16 (cpu, vd, i,				\
4554 				 aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
4555 				 CMP					\
4556 				 aarch64_get_vec_##SOURCE##16 (cpu, vm, i) \
4557 				 ? -1 : 0);				\
4558 	  return;							\
4559 	case 2:								\
4560 	  for (i = 0; i < (full ? 4 : 2); i++)				\
4561 	    aarch64_set_vec_u32 (cpu, vd, i, \
4562 				 aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
4563 				 CMP					\
4564 				 aarch64_get_vec_##SOURCE##32 (cpu, vm, i) \
4565 				 ? -1 : 0);				\
4566 	  return;							\
4567 	case 3:								\
4568 	  if (! full)							\
4569 	    HALT_UNALLOC;						\
4570 	  for (i = 0; i < 2; i++)					\
4571 	    aarch64_set_vec_u64 (cpu, vd, i, \
4572 				 aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
4573 				 CMP					\
4574 				 aarch64_get_vec_##SOURCE##64 (cpu, vm, i) \
4575 				 ? -1ULL : 0);				\
4576 	  return;							\
4577 	}								\
4578     }									\
4579   while (0)
4580 
4581 #define VEC_CMP0(SOURCE, CMP)						\
4582   do									\
4583     {									\
4584       switch (size)							\
4585 	{								\
4586 	case 0:								\
4587 	  for (i = 0; i < (full ? 16 : 8); i++)				\
4588 	    aarch64_set_vec_u8 (cpu, vd, i,				\
4589 				aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
4590 				CMP 0 ? -1 : 0);			\
4591 	  return;							\
4592 	case 1:								\
4593 	  for (i = 0; i < (full ? 8 : 4); i++)				\
4594 	    aarch64_set_vec_u16 (cpu, vd, i,				\
4595 				 aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
4596 				 CMP 0 ? -1 : 0);			\
4597 	  return;							\
4598 	case 2:								\
4599 	  for (i = 0; i < (full ? 4 : 2); i++)				\
4600 	    aarch64_set_vec_u32 (cpu, vd, i,				\
4601 				 aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
4602 				 CMP 0 ? -1 : 0);			\
4603 	  return;							\
4604 	case 3:								\
4605 	  if (! full)							\
4606 	    HALT_UNALLOC;						\
4607 	  for (i = 0; i < 2; i++)					\
4608 	    aarch64_set_vec_u64 (cpu, vd, i,				\
4609 				 aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
4610 				 CMP 0 ? -1ULL : 0);			\
4611 	  return;							\
4612 	}								\
4613     }									\
4614   while (0)
4615 
4616 #define VEC_FCMP0(CMP)							\
4617   do									\
4618     {									\
4619       if (vm != 0)							\
4620 	HALT_NYI;							\
4621       if (INSTR (22, 22))						\
4622 	{								\
4623 	  if (! full)							\
4624 	    HALT_NYI;							\
4625 	  for (i = 0; i < 2; i++)					\
4626 	    aarch64_set_vec_u64 (cpu, vd, i,				\
4627 				 aarch64_get_vec_double (cpu, vn, i)	\
4628 				 CMP 0.0 ? -1 : 0);			\
4629 	}								\
4630       else								\
4631 	{								\
4632 	  for (i = 0; i < (full ? 4 : 2); i++)				\
4633 	    aarch64_set_vec_u32 (cpu, vd, i,				\
4634 				 aarch64_get_vec_float (cpu, vn, i)	\
4635 				 CMP 0.0 ? -1 : 0);			\
4636 	}								\
4637       return;								\
4638     }									\
4639   while (0)
4640 
4641 #define VEC_FCMP(CMP)							\
4642   do									\
4643     {									\
4644       if (INSTR (22, 22))						\
4645 	{								\
4646 	  if (! full)							\
4647 	    HALT_NYI;							\
4648 	  for (i = 0; i < 2; i++)					\
4649 	    aarch64_set_vec_u64 (cpu, vd, i,				\
4650 				 aarch64_get_vec_double (cpu, vn, i)	\
4651 				 CMP					\
4652 				 aarch64_get_vec_double (cpu, vm, i)	\
4653 				 ? -1 : 0);				\
4654 	}								\
4655       else								\
4656 	{								\
4657 	  for (i = 0; i < (full ? 4 : 2); i++)				\
4658 	    aarch64_set_vec_u32 (cpu, vd, i,				\
4659 				 aarch64_get_vec_float (cpu, vn, i)	\
4660 				 CMP					\
4661 				 aarch64_get_vec_float (cpu, vm, i)	\
4662 				 ? -1 : 0);				\
4663 	}								\
4664       return;								\
4665     }									\
4666   while (0)
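
/* NB: each of the VEC_CMP* and VEC_FCMP* macros above writes an
   all-ones element (-1) where the comparison holds and an all-zeros
   element where it does not, producing the per-element masks that the
   SIMD compare instructions define.  */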
4667 
4668 static void
4669 do_vec_compare (sim_cpu *cpu)
4670 {
4671   /* instr[31]    = 0
4672      instr[30]    = half(0)/full(1)
4673      instr[29]    = part-of-comparison-type
4674      instr[28,24] = 0 1110
4675      instr[23,22] = size of integer compares: byte(00), half(01), word (10), long (11)
4676                     type of float compares: single (-0) / double (-1)
4677      instr[21]    = 1
4678      instr[20,16] = Vm or 00000 (compare vs 0)
4679      instr[15,10] = part-of-comparison-type
4680      instr[9,5]   = Vn
4681      instr[4,0]   = Vd.  */
4682 
4683   int full = INSTR (30, 30);
4684   int size = INSTR (23, 22);
4685   unsigned vm = INSTR (20, 16);
4686   unsigned vn = INSTR (9, 5);
4687   unsigned vd = INSTR (4, 0);
4688   unsigned i;
4689 
4690   NYI_assert (28, 24, 0x0E);
4691   NYI_assert (21, 21, 1);
4692 
4693   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4694   if ((INSTR (11, 11)
4695        && INSTR (14, 14))
4696       || ((INSTR (11, 11) == 0
4697 	   && INSTR (10, 10) == 0)))
4698     {
4699       /* A compare vs 0.  */
4700       if (vm != 0)
4701 	{
4702 	  if (INSTR (15, 10) == 0x2A)
4703 	    do_vec_maxv (cpu);
4704 	  else if (INSTR (15, 10) == 0x32
4705 		   || INSTR (15, 10) == 0x3E)
4706 	    do_vec_fminmaxV (cpu);
4707 	  else if (INSTR (29, 23) == 0x1C
4708 		   && INSTR (21, 10) == 0x876)
4709 	    do_vec_SCVTF (cpu);
4710 	  else
4711 	    HALT_NYI;
4712 	  return;
4713 	}
4714     }
4715 
4716   if (INSTR (14, 14))
4717     {
4718       /* A floating point compare.  */
4719       unsigned decode = (INSTR (29, 29) << 5) | (INSTR (23, 23) << 4)
4720 	| INSTR (13, 10);
4721 
4722       NYI_assert (15, 15, 1);
4723 
4724       switch (decode)
4725 	{
4726 	case /* 0b010010: GT#0 */ 0x12: VEC_FCMP0 (>);
4727 	case /* 0b110010: GE#0 */ 0x32: VEC_FCMP0 (>=);
4728 	case /* 0b010110: EQ#0 */ 0x16: VEC_FCMP0 (==);
4729 	case /* 0b110110: LE#0 */ 0x36: VEC_FCMP0 (<=);
4730 	case /* 0b011010: LT#0 */ 0x1A: VEC_FCMP0 (<);
4731 	case /* 0b111001: GT */   0x39: VEC_FCMP  (>);
4732 	case /* 0b101001: GE */   0x29: VEC_FCMP  (>=);
4733 	case /* 0b001001: EQ */   0x09: VEC_FCMP  (==);
4734 
4735 	default:
4736 	  HALT_NYI;
4737 	}
4738     }
4739   else
4740     {
4741       unsigned decode = (INSTR (29, 29) << 6) | INSTR (15, 10);
4742 
4743       switch (decode)
4744 	{
4745 	case 0x0D: /* 0001101 GT */     VEC_CMP  (s, > );
4746 	case 0x0F: /* 0001111 GE */     VEC_CMP  (s, >= );
4747 	case 0x22: /* 0100010 GT #0 */  VEC_CMP0 (s, > );
4748 	case 0x26: /* 0100110 EQ #0 */  VEC_CMP0 (s, == );
4749 	case 0x2A: /* 0101010 LT #0 */  VEC_CMP0 (s, < );
4750 	case 0x4D: /* 1001101 HI */     VEC_CMP  (u, > );
4751 	case 0x4F: /* 1001111 HS */     VEC_CMP  (u, >= );
4752 	case 0x62: /* 1100010 GE #0 */  VEC_CMP0 (s, >= );
4753 	case 0x63: /* 1100011 EQ */     VEC_CMP  (u, == );
4754 	case 0x66: /* 1100110 LE #0 */  VEC_CMP0 (s, <= );
4755 	default:
4756 	  if (vm == 0)
4757 	    HALT_NYI;
4758 	  do_vec_maxv (cpu);
4759 	}
4760     }
4761 }
4762 
4763 static void
4764 do_vec_SSHL (sim_cpu *cpu)
4765 {
4766   /* instr[31]    = 0
4767      instr[30]    = first part (0)/ second part (1)
4768      instr[29,24] = 00 1110
4769      instr[23,22] = size: byte(00), half(01), word (10), long (11)
4770      instr[21]    = 1
4771      instr[20,16] = Vm
4772      instr[15,10] = 0100 01
4773      instr[9,5]   = Vn
4774      instr[4,0]   = Vd.  */
4775 
4776   unsigned full = INSTR (30, 30);
4777   unsigned vm = INSTR (20, 16);
4778   unsigned vn = INSTR (9, 5);
4779   unsigned vd = INSTR (4, 0);
4780   unsigned i;
4781   signed int shift;
4782 
4783   NYI_assert (29, 24, 0x0E);
4784   NYI_assert (21, 21, 1);
4785   NYI_assert (15, 10, 0x11);
4786 
4787   /* The signed byte elements of Vm supply the per-element shift
	 amounts: a non-negative amount shifts the Vn element left, while a
	 negative amount yields an arithmetic shift right by minus that
	 amount.  */
4788 
4789   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4790   switch (INSTR (23, 22))
4791     {
4792     case 0:
4793       for (i = 0; i < (full ? 16 : 8); i++)
4794 	{
4795 	  shift = aarch64_get_vec_s8 (cpu, vm, i);
4796 	  if (shift >= 0)
4797 	    aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i)
4798 				<< shift);
4799 	  else
4800 	    aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i)
4801 				>> - shift);
4802 	}
4803       return;
4804 
4805     case 1:
4806       for (i = 0; i < (full ? 8 : 4); i++)
4807 	{
4808 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
4809 	  if (shift >= 0)
4810 	    aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i)
4811 				 << shift);
4812 	  else
4813 	    aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i)
4814 				 >> - shift);
4815 	}
4816       return;
4817 
4818     case 2:
4819       for (i = 0; i < (full ? 4 : 2); i++)
4820 	{
4821 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
4822 	  if (shift >= 0)
4823 	    aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i)
4824 				 << shift);
4825 	  else
4826 	    aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i)
4827 				 >> - shift);
4828 	}
4829       return;
4830 
4831     case 3:
4832       if (! full)
4833 	HALT_UNALLOC;
4834       for (i = 0; i < 2; i++)
4835 	{
4836 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
4837 	  if (shift >= 0)
4838 	    aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i)
4839 				 << shift);
4840 	  else
4841 	    aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i)
4842 				 >> - shift);
4843 	}
4844       return;
4845     }
4846 }
4847 
4848 static void
4849 do_vec_USHL (sim_cpu *cpu)
4850 {
4851   /* instr[31]    = 0
4852      instr[30]    = first part (0)/ second part (1)
4853      instr[29,24] = 10 1110
4854      instr[23,22] = size: byte(00), half(01), word (10), long (11)
4855      instr[21]    = 1
4856      instr[20,16] = Vm
4857      instr[15,10] = 0100 01
4858      instr[9,5]   = Vn
4859      instr[4,0]   = Vd  */
4860 
4861   unsigned full = INSTR (30, 30);
4862   unsigned vm = INSTR (20, 16);
4863   unsigned vn = INSTR (9, 5);
4864   unsigned vd = INSTR (4, 0);
4865   unsigned i;
4866   signed int shift;
4867 
4868   NYI_assert (29, 24, 0x2E);
4869   NYI_assert (15, 10, 0x11);
4870 
4871   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4872   switch (INSTR (23, 22))
4873     {
4874     case 0:
4875 	for (i = 0; i < (full ? 16 : 8); i++)
4876 	  {
4877 	    shift = aarch64_get_vec_s8 (cpu, vm, i);
4878 	    if (shift >= 0)
4879 	      aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
4880 				  << shift);
4881 	    else
4882 	      aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
4883 				  >> - shift);
4884 	  }
4885       return;
4886 
4887     case 1:
4888       for (i = 0; i < (full ? 8 : 4); i++)
4889 	{
4890 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
4891 	  if (shift >= 0)
4892 	    aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
4893 				 << shift);
4894 	  else
4895 	    aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
4896 				 >> - shift);
4897 	}
4898       return;
4899 
4900     case 2:
4901       for (i = 0; i < (full ? 4 : 2); i++)
4902 	{
4903 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
4904 	  if (shift >= 0)
4905 	    aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
4906 				 << shift);
4907 	  else
4908 	    aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
4909 				 >> - shift);
4910 	}
4911       return;
4912 
4913     case 3:
4914       if (! full)
4915 	HALT_UNALLOC;
4916       for (i = 0; i < 2; i++)
4917 	{
4918 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
4919 	  if (shift >= 0)
4920 	    aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i)
4921 				 << shift);
4922 	  else
4923 	    aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i)
4924 				 >> - shift);
4925 	}
4926       return;
4927     }
4928 }
4929 
4930 static void
4931 do_vec_FMLA (sim_cpu *cpu)
4932 {
4933   /* instr[31]    = 0
4934      instr[30]    = full/half selector
4935      instr[29,23] = 0011100
4936      instr[22]    = size: 0=>float, 1=>double
4937      instr[21]    = 1
4938      instr[20,16] = Vm
4939      instr[15,10] = 1100 11
4940      instr[9,5]   = Vn
4941      instr[4,0]   = Vd.  */
4942 
4943   unsigned vm = INSTR (20, 16);
4944   unsigned vn = INSTR (9, 5);
4945   unsigned vd = INSTR (4, 0);
4946   unsigned i;
4947   int      full = INSTR (30, 30);
4948 
4949   NYI_assert (29, 23, 0x1C);
4950   NYI_assert (21, 21, 1);
4951   NYI_assert (15, 10, 0x33);
4952 
4953   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4954   if (INSTR (22, 22))
4955     {
4956       if (! full)
4957 	HALT_UNALLOC;
4958       for (i = 0; i < 2; i++)
4959 	aarch64_set_vec_double (cpu, vd, i,
4960 				aarch64_get_vec_double (cpu, vn, i) *
4961 				aarch64_get_vec_double (cpu, vm, i) +
4962 				aarch64_get_vec_double (cpu, vd, i));
4963     }
4964   else
4965     {
4966       for (i = 0; i < (full ? 4 : 2); i++)
4967 	aarch64_set_vec_float (cpu, vd, i,
4968 			       aarch64_get_vec_float (cpu, vn, i) *
4969 			       aarch64_get_vec_float (cpu, vm, i) +
4970 			       aarch64_get_vec_float (cpu, vd, i));
4971     }
4972 }
4973 
4974 static void
4975 do_vec_max (sim_cpu *cpu)
4976 {
4977   /* instr[31]    = 0
4978      instr[30]    = full/half selector
4979      instr[29]    = SMAX (0) / UMAX (1)
4980      instr[28,24] = 0 1110
4981      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
4982      instr[21]    = 1
4983      instr[20,16] = Vm
4984      instr[15,10] = 0110 01
4985      instr[9,5]   = Vn
4986      instr[4,0]   = Vd.  */
4987 
4988   unsigned vm = INSTR (20, 16);
4989   unsigned vn = INSTR (9, 5);
4990   unsigned vd = INSTR (4, 0);
4991   unsigned i;
4992   int      full = INSTR (30, 30);
4993 
4994   NYI_assert (28, 24, 0x0E);
4995   NYI_assert (21, 21, 1);
4996   NYI_assert (15, 10, 0x19);
4997 
4998   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4999   if (INSTR (29, 29))
5000     {
5001       switch (INSTR (23, 22))
5002 	{
5003 	case 0:
5004 	  for (i = 0; i < (full ? 16 : 8); i++)
5005 	    aarch64_set_vec_u8 (cpu, vd, i,
5006 				aarch64_get_vec_u8 (cpu, vn, i)
5007 				> aarch64_get_vec_u8 (cpu, vm, i)
5008 				? aarch64_get_vec_u8 (cpu, vn, i)
5009 				: aarch64_get_vec_u8 (cpu, vm, i));
5010 	  return;
5011 
5012 	case 1:
5013 	  for (i = 0; i < (full ? 8 : 4); i++)
5014 	    aarch64_set_vec_u16 (cpu, vd, i,
5015 				 aarch64_get_vec_u16 (cpu, vn, i)
5016 				 > aarch64_get_vec_u16 (cpu, vm, i)
5017 				 ? aarch64_get_vec_u16 (cpu, vn, i)
5018 				 : aarch64_get_vec_u16 (cpu, vm, i));
5019 	  return;
5020 
5021 	case 2:
5022 	  for (i = 0; i < (full ? 4 : 2); i++)
5023 	    aarch64_set_vec_u32 (cpu, vd, i,
5024 				 aarch64_get_vec_u32 (cpu, vn, i)
5025 				 > aarch64_get_vec_u32 (cpu, vm, i)
5026 				 ? aarch64_get_vec_u32 (cpu, vn, i)
5027 				 : aarch64_get_vec_u32 (cpu, vm, i));
5028 	  return;
5029 
5030 	case 3:
5031 	  HALT_UNALLOC;
5032 	}
5033     }
5034   else
5035     {
5036       switch (INSTR (23, 22))
5037 	{
5038 	case 0:
5039 	  for (i = 0; i < (full ? 16 : 8); i++)
5040 	    aarch64_set_vec_s8 (cpu, vd, i,
5041 				aarch64_get_vec_s8 (cpu, vn, i)
5042 				> aarch64_get_vec_s8 (cpu, vm, i)
5043 				? aarch64_get_vec_s8 (cpu, vn, i)
5044 				: aarch64_get_vec_s8 (cpu, vm, i));
5045 	  return;
5046 
5047 	case 1:
5048 	  for (i = 0; i < (full ? 8 : 4); i++)
5049 	    aarch64_set_vec_s16 (cpu, vd, i,
5050 				 aarch64_get_vec_s16 (cpu, vn, i)
5051 				 > aarch64_get_vec_s16 (cpu, vm, i)
5052 				 ? aarch64_get_vec_s16 (cpu, vn, i)
5053 				 : aarch64_get_vec_s16 (cpu, vm, i));
5054 	  return;
5055 
5056 	case 2:
5057 	  for (i = 0; i < (full ? 4 : 2); i++)
5058 	    aarch64_set_vec_s32 (cpu, vd, i,
5059 				 aarch64_get_vec_s32 (cpu, vn, i)
5060 				 > aarch64_get_vec_s32 (cpu, vm, i)
5061 				 ? aarch64_get_vec_s32 (cpu, vn, i)
5062 				 : aarch64_get_vec_s32 (cpu, vm, i));
5063 	  return;
5064 
5065 	case 3:
5066 	  HALT_UNALLOC;
5067 	}
5068     }
5069 }
5070 
5071 static void
5072 do_vec_min (sim_cpu *cpu)
5073 {
5074   /* instr[31]    = 0
5075      instr[30]    = full/half selector
5076      instr[29]    = SMIN (0) / UMIN (1)
5077      instr[28,24] = 0 1110
5078      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
5079      instr[21]    = 1
5080      instr[20,16] = Vm
5081      instr[15,10] = 0110 11
5082      instr[9,5]   = Vn
5083      instr[4,0]   = Vd.  */
5084 
5085   unsigned vm = INSTR (20, 16);
5086   unsigned vn = INSTR (9, 5);
5087   unsigned vd = INSTR (4, 0);
5088   unsigned i;
5089   int      full = INSTR (30, 30);
5090 
5091   NYI_assert (28, 24, 0x0E);
5092   NYI_assert (21, 21, 1);
5093   NYI_assert (15, 10, 0x1B);
5094 
5095   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5096   if (INSTR (29, 29))
5097     {
5098       switch (INSTR (23, 22))
5099 	{
5100 	case 0:
5101 	  for (i = 0; i < (full ? 16 : 8); i++)
5102 	    aarch64_set_vec_u8 (cpu, vd, i,
5103 				aarch64_get_vec_u8 (cpu, vn, i)
5104 				< aarch64_get_vec_u8 (cpu, vm, i)
5105 				? aarch64_get_vec_u8 (cpu, vn, i)
5106 				: aarch64_get_vec_u8 (cpu, vm, i));
5107 	  return;
5108 
5109 	case 1:
5110 	  for (i = 0; i < (full ? 8 : 4); i++)
5111 	    aarch64_set_vec_u16 (cpu, vd, i,
5112 				 aarch64_get_vec_u16 (cpu, vn, i)
5113 				 < aarch64_get_vec_u16 (cpu, vm, i)
5114 				 ? aarch64_get_vec_u16 (cpu, vn, i)
5115 				 : aarch64_get_vec_u16 (cpu, vm, i));
5116 	  return;
5117 
5118 	case 2:
5119 	  for (i = 0; i < (full ? 4 : 2); i++)
5120 	    aarch64_set_vec_u32 (cpu, vd, i,
5121 				 aarch64_get_vec_u32 (cpu, vn, i)
5122 				 < aarch64_get_vec_u32 (cpu, vm, i)
5123 				 ? aarch64_get_vec_u32 (cpu, vn, i)
5124 				 : aarch64_get_vec_u32 (cpu, vm, i));
5125 	  return;
5126 
5127 	case 3:
5128 	  HALT_UNALLOC;
5129 	}
5130     }
5131   else
5132     {
5133       switch (INSTR (23, 22))
5134 	{
5135 	case 0:
5136 	  for (i = 0; i < (full ? 16 : 8); i++)
5137 	    aarch64_set_vec_s8 (cpu, vd, i,
5138 				aarch64_get_vec_s8 (cpu, vn, i)
5139 				< aarch64_get_vec_s8 (cpu, vm, i)
5140 				? aarch64_get_vec_s8 (cpu, vn, i)
5141 				: aarch64_get_vec_s8 (cpu, vm, i));
5142 	  return;
5143 
5144 	case 1:
5145 	  for (i = 0; i < (full ? 8 : 4); i++)
5146 	    aarch64_set_vec_s16 (cpu, vd, i,
5147 				 aarch64_get_vec_s16 (cpu, vn, i)
5148 				 < aarch64_get_vec_s16 (cpu, vm, i)
5149 				 ? aarch64_get_vec_s16 (cpu, vn, i)
5150 				 : aarch64_get_vec_s16 (cpu, vm, i));
5151 	  return;
5152 
5153 	case 2:
5154 	  for (i = 0; i < (full ? 4 : 2); i++)
5155 	    aarch64_set_vec_s32 (cpu, vd, i,
5156 				 aarch64_get_vec_s32 (cpu, vn, i)
5157 				 < aarch64_get_vec_s32 (cpu, vm, i)
5158 				 ? aarch64_get_vec_s32 (cpu, vn, i)
5159 				 : aarch64_get_vec_s32 (cpu, vm, i));
5160 	  return;
5161 
5162 	case 3:
5163 	  HALT_UNALLOC;
5164 	}
5165     }
5166 }
5167 
5168 static void
5169 do_vec_sub_long (sim_cpu *cpu)
5170 {
5171   /* instr[31]    = 0
5172      instr[30]    = lower (0) / upper (1)
5173      instr[29]    = signed (0) / unsigned (1)
5174      instr[28,24] = 0 1110
5175      instr[23,22] = size: bytes (00), half (01), word (10)
5176      instr[21]    = 1
5177      instr[20,16] = Vm
5178      instr[15,10] = 0010 00
5179      instr[9,5]   = Vn
5180      instr[4,0]   = V dest.  */
5181 
5182   unsigned size = INSTR (23, 22);
5183   unsigned vm = INSTR (20, 16);
5184   unsigned vn = INSTR (9, 5);
5185   unsigned vd = INSTR (4, 0);
5186   unsigned bias = 0;
5187   unsigned i;
5188 
5189   NYI_assert (28, 24, 0x0E);
5190   NYI_assert (21, 21, 1);
5191   NYI_assert (15, 10, 0x08);
5192 
5193   if (size == 3)
5194     HALT_UNALLOC;
5195 
5196   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5197   switch (INSTR (30, 29))
5198     {
5199     case 2: /* SSUBL2.  */
5200       bias = 2;
5201     case 0: /* SSUBL.  */
5202       switch (size)
5203 	{
5204 	case 0:
5205 	  bias *= 4;
5206 	  for (i = 0; i < 8; i++)
5207 	    aarch64_set_vec_s16 (cpu, vd, i,
5208 				 aarch64_get_vec_s8 (cpu, vn, i + bias)
5209 				 - aarch64_get_vec_s8 (cpu, vm, i + bias));
5210 	  break;
5211 
5212 	case 1:
5213 	  bias *= 2;
5214 	  for (i = 0; i < 4; i++)
5215 	    aarch64_set_vec_s32 (cpu, vd, i,
5216 				 aarch64_get_vec_s16 (cpu, vn, i + bias)
5217 				 - aarch64_get_vec_s16 (cpu, vm, i + bias));
5218 	  break;
5219 
5220 	case 2:
5221 	  for (i = 0; i < 2; i++)
5222 	    aarch64_set_vec_s64 (cpu, vd, i,
5223 				 aarch64_get_vec_s32 (cpu, vn, i + bias)
5224 				 - aarch64_get_vec_s32 (cpu, vm, i + bias));
5225 	  break;
5226 
5227 	default:
5228 	  HALT_UNALLOC;
5229 	}
5230       break;
5231 
5232     case 3: /* USUBL2.  */
5233       bias = 2;
5234     case 1: /* USUBL.  */
5235       switch (size)
5236 	{
5237 	case 0:
5238 	  bias *= 4;
5239 	  for (i = 0; i < 8; i++)
5240 	    aarch64_set_vec_u16 (cpu, vd, i,
5241 				 aarch64_get_vec_u8 (cpu, vn, i + bias)
5242 				 - aarch64_get_vec_u8 (cpu, vm, i + bias));
5243 	  break;
5244 
5245 	case 1:
5246 	  bias *= 2;
5247 	  for (i = 0; i < 4; i++)
5248 	    aarch64_set_vec_u32 (cpu, vd, i,
5249 				 aarch64_get_vec_u16 (cpu, vn, i + bias)
5250 				 - aarch64_get_vec_u16 (cpu, vm, i + bias));
5251 	  break;
5252 
5253 	case 2:
5254 	  for (i = 0; i < 2; i++)
5255 	    aarch64_set_vec_u64 (cpu, vd, i,
5256 				 aarch64_get_vec_u32 (cpu, vn, i + bias)
5257 				 - aarch64_get_vec_u32 (cpu, vm, i + bias));
5258 	  break;
5259 
5260 	default:
5261 	  HALT_UNALLOC;
5262 	}
5263       break;
5264     }
5265 }
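
/* NB: for the second-part (SSUBL2/USUBL2) forms the bias above is
   scaled to the index of the first upper-half source element: 8 for
   bytes, 4 for halfwords and 2 for words.  */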
5266 
5267 static void
5268 do_vec_ADDP (sim_cpu *cpu)
5269 {
5270   /* instr[31]    = 0
5271      instr[30]    = half(0)/full(1)
5272      instr[29,24] = 00 1110
5273      instr[23,22] = size: bytes (00), half (01), word (10), long (11)
5274      instr[21]    = 1
5275      instr[20,16] = Vm
5276      instr[15,10] = 1011 11
5277      instr[9,5]   = Vn
5278      instr[4,0]   = V dest.  */
5279 
5280   FRegister copy_vn;
5281   FRegister copy_vm;
5282   unsigned full = INSTR (30, 30);
5283   unsigned size = INSTR (23, 22);
5284   unsigned vm = INSTR (20, 16);
5285   unsigned vn = INSTR (9, 5);
5286   unsigned vd = INSTR (4, 0);
5287   unsigned i, range;
5288 
5289   NYI_assert (29, 24, 0x0E);
5290   NYI_assert (21, 21, 1);
5291   NYI_assert (15, 10, 0x2F);
5292 
5293   /* Make copies of the source registers in case vd == vn/vm.  */
5294   copy_vn = cpu->fr[vn];
5295   copy_vm = cpu->fr[vm];
5296 
5297   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5298   switch (size)
5299     {
5300     case 0:
5301       range = full ? 8 : 4;
5302       for (i = 0; i < range; i++)
5303 	{
5304 	  aarch64_set_vec_u8 (cpu, vd, i,
5305 			      copy_vn.b[i * 2] + copy_vn.b[i * 2 + 1]);
5306 	  aarch64_set_vec_u8 (cpu, vd, i + range,
5307 			      copy_vm.b[i * 2] + copy_vm.b[i * 2 + 1]);
5308 	}
5309       return;
5310 
5311     case 1:
5312       range = full ? 4 : 2;
5313       for (i = 0; i < range; i++)
5314 	{
5315 	  aarch64_set_vec_u16 (cpu, vd, i,
5316 			       copy_vn.h[i * 2] + copy_vn.h[i * 2 + 1]);
5317 	  aarch64_set_vec_u16 (cpu, vd, i + range,
5318 			       copy_vm.h[i * 2] + copy_vm.h[i * 2 + 1]);
5319 	}
5320       return;
5321 
5322     case 2:
5323       range = full ? 2 : 1;
5324       for (i = 0; i < range; i++)
5325 	{
5326 	  aarch64_set_vec_u32 (cpu, vd, i,
5327 			       copy_vn.w[i * 2] + copy_vn.w[i * 2 + 1]);
5328 	  aarch64_set_vec_u32 (cpu, vd, i + range,
5329 			       copy_vm.w[i * 2] + copy_vm.w[i * 2 + 1]);
5330 	}
5331       return;
5332 
5333     case 3:
5334       if (! full)
5335 	HALT_UNALLOC;
5336       aarch64_set_vec_u64 (cpu, vd, 0, copy_vn.v[0] + copy_vn.v[1]);
5337       aarch64_set_vec_u64 (cpu, vd, 1, copy_vm.v[0] + copy_vm.v[1]);
5338       return;
5339     }
5340 }
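
/* NB: ADDP places the pairwise sums of Vn in the lower half of Vd and
   the pairwise sums of Vm in the upper half, which is why both source
   registers are copied first: Vd may alias either source.  */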
5341 
5342 static void
5343 do_vec_UMOV (sim_cpu *cpu)
5344 {
5345   /* instr[31]    = 0
5346      instr[30]    = 32-bit(0)/64-bit(1)
5347      instr[29,21] = 00 1110 000
5348      instr[20,16] = size & index
5349      instr[15,10] = 0011 11
5350      instr[9,5]   = V source
5351      instr[4,0]   = R dest.  */
5352 
5353   unsigned vs = INSTR (9, 5);
5354   unsigned rd = INSTR (4, 0);
5355   unsigned index;
5356 
5357   NYI_assert (29, 21, 0x070);
5358   NYI_assert (15, 10, 0x0F);
5359 
5360   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5361   if (INSTR (16, 16))
5362     {
5363       /* Byte transfer.  */
5364       index = INSTR (20, 17);
5365       aarch64_set_reg_u64 (cpu, rd, NO_SP,
5366 			   aarch64_get_vec_u8 (cpu, vs, index));
5367     }
5368   else if (INSTR (17, 17))
5369     {
5370       index = INSTR (20, 18);
5371       aarch64_set_reg_u64 (cpu, rd, NO_SP,
5372 			   aarch64_get_vec_u16 (cpu, vs, index));
5373     }
5374   else if (INSTR (18, 18))
5375     {
5376       index = INSTR (20, 19);
5377       aarch64_set_reg_u64 (cpu, rd, NO_SP,
5378 			   aarch64_get_vec_u32 (cpu, vs, index));
5379     }
5380   else
5381     {
5382       if (INSTR (30, 30) != 1)
5383 	HALT_UNALLOC;
5384 
5385       index = INSTR (20, 20);
5386       aarch64_set_reg_u64 (cpu, rd, NO_SP,
5387 			   aarch64_get_vec_u64 (cpu, vs, index));
5388     }
5389 }
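
/* NB: the size & index field decoded above encodes both values in one
   bit pattern: the position of the lowest set bit selects the element
   size (bit 16 => byte, bit 17 => half, bit 18 => word, otherwise
   doubleword) and the remaining upper bits give the element index.  */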
5390 
5391 static void
5392 do_vec_FABS (sim_cpu *cpu)
5393 {
5394   /* instr[31]    = 0
5395      instr[30]    = half(0)/full(1)
5396      instr[29,23] = 00 1110 1
5397      instr[22]    = float(0)/double(1)
5398      instr[21,16] = 10 0000
5399      instr[15,10] = 1111 10
5400      instr[9,5]   = Vn
5401      instr[4,0]   = Vd.  */
5402 
5403   unsigned vn = INSTR (9, 5);
5404   unsigned vd = INSTR (4, 0);
5405   unsigned full = INSTR (30, 30);
5406   unsigned i;
5407 
5408   NYI_assert (29, 23, 0x1D);
5409   NYI_assert (21, 10, 0x83E);
5410 
5411   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5412   if (INSTR (22, 22))
5413     {
5414       if (! full)
5415 	HALT_NYI;
5416 
5417       for (i = 0; i < 2; i++)
5418 	aarch64_set_vec_double (cpu, vd, i,
5419 				fabs (aarch64_get_vec_double (cpu, vn, i)));
5420     }
5421   else
5422     {
5423       for (i = 0; i < (full ? 4 : 2); i++)
5424 	aarch64_set_vec_float (cpu, vd, i,
5425 			       fabsf (aarch64_get_vec_float (cpu, vn, i)));
5426     }
5427 }
5428 
5429 static void
5430 do_vec_FCVTZS (sim_cpu *cpu)
5431 {
5432   /* instr[31]    = 0
5433      instr[30]    = half (0) / all (1)
5434      instr[29,23] = 00 1110 1
5435      instr[22]    = single (0) / double (1)
5436      instr[21,10] = 10 0001 1011 10
5437      instr[9,5]   = Rn
5438      instr[4,0]   = Rd.  */
5439 
5440   unsigned rn = INSTR (9, 5);
5441   unsigned rd = INSTR (4, 0);
5442   unsigned full = INSTR (30, 30);
5443   unsigned i;
5444 
5445   NYI_assert (31, 31, 0);
5446   NYI_assert (29, 23, 0x1D);
5447   NYI_assert (21, 10, 0x86E);
5448 
5449   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5450   if (INSTR (22, 22))
5451     {
5452       if (! full)
5453 	HALT_UNALLOC;
5454 
5455       for (i = 0; i < 2; i++)
5456 	aarch64_set_vec_s64 (cpu, rd, i,
5457 			     (int64_t) aarch64_get_vec_double (cpu, rn, i));
5458     }
5459   else
5460     for (i = 0; i < (full ? 4 : 2); i++)
5461       aarch64_set_vec_s32 (cpu, rd, i,
5462 			   (int32_t) aarch64_get_vec_float (cpu, rn, i));
5463 }
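
/* NB: the C casts used above truncate toward zero, which is the
   rounding that the Z in FCVTZS calls for; unlike the hardware,
   out-of-range values are not saturated here.  */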
5464 
5465 static void
5466 do_vec_REV64 (sim_cpu *cpu)
5467 {
5468   /* instr[31]    = 0
5469      instr[30]    = full/half
5470      instr[29,24] = 00 1110
5471      instr[23,22] = size
5472      instr[21,10] = 10 0000 0000 10
5473      instr[9,5]   = Rn
5474      instr[4,0]   = Rd.  */
5475 
5476   unsigned rn = INSTR (9, 5);
5477   unsigned rd = INSTR (4, 0);
5478   unsigned size = INSTR (23, 22);
5479   unsigned full = INSTR (30, 30);
5480   unsigned i;
5481   FRegister val;
5482 
5483   NYI_assert (29, 24, 0x0E);
5484   NYI_assert (21, 10, 0x802);
5485 
5486   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5487   switch (size)
5488     {
5489     case 0:
5490       for (i = 0; i < (full ? 16 : 8); i++)
5491 	val.b[i ^ 0x7] = aarch64_get_vec_u8 (cpu, rn, i);
5492       break;
5493 
5494     case 1:
5495       for (i = 0; i < (full ? 8 : 4); i++)
5496 	val.h[i ^ 0x3] = aarch64_get_vec_u16 (cpu, rn, i);
5497       break;
5498 
5499     case 2:
5500       for (i = 0; i < (full ? 4 : 2); i++)
5501 	val.w[i ^ 0x1] = aarch64_get_vec_u32 (cpu, rn, i);
5502       break;
5503 
5504     case 3:
5505       HALT_UNALLOC;
5506     }
5507 
5508   aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
5509   if (full)
5510     aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
5511 }
5512 
5513 static void
5514 do_vec_REV16 (sim_cpu *cpu)
5515 {
5516   /* instr[31]    = 0
5517      instr[30]    = full/half
5518      instr[29,24] = 00 1110
5519      instr[23,22] = size
5520      instr[21,10] = 10 0000 0001 10
5521      instr[9,5]   = Rn
5522      instr[4,0]   = Rd.  */
5523 
5524   unsigned rn = INSTR (9, 5);
5525   unsigned rd = INSTR (4, 0);
5526   unsigned size = INSTR (23, 22);
5527   unsigned full = INSTR (30, 30);
5528   unsigned i;
5529   FRegister val;
5530 
5531   NYI_assert (29, 24, 0x0E);
5532   NYI_assert (21, 10, 0x806);
5533 
5534   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
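  /* XORing the index with 1 swaps adjacent bytes, reversing the byte
     order within each 16-bit half-word.  */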
5535   switch (size)
5536     {
5537     case 0:
5538       for (i = 0; i < (full ? 16 : 8); i++)
5539 	val.b[i ^ 0x1] = aarch64_get_vec_u8 (cpu, rn, i);
5540       break;
5541 
5542     default:
5543       HALT_UNALLOC;
5544     }
5545 
5546   aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
5547   if (full)
5548     aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
5549 }
5550 
5551 static void
5552 do_vec_op1 (sim_cpu *cpu)
5553 {
5554   /* instr[31]    = 0
5555      instr[30]    = half/full
5556      instr[29,24] = 00 1110
5557      instr[23,21] = ???
5558      instr[20,16] = Vm
5559      instr[15,10] = sub-opcode
5560      instr[9,5]   = Vn
5561      instr[4,0]   = Vd  */
5562   NYI_assert (29, 24, 0x0E);
5563 
5564   if (INSTR (21, 21) == 0)
5565     {
5566       if (INSTR (23, 22) == 0)
5567 	{
5568 	  if (INSTR (30, 30) == 1
5569 	      && INSTR (17, 14) == 0
5570 	      && INSTR (12, 10) == 7)
5571 	    return do_vec_ins_2 (cpu);
5572 
5573 	  switch (INSTR (15, 10))
5574 	    {
5575 	    case 0x01: do_vec_DUP_vector_into_vector (cpu); return;
5576 	    case 0x03: do_vec_DUP_scalar_into_vector (cpu); return;
5577 	    case 0x07: do_vec_INS (cpu); return;
5578 	    case 0x0A: do_vec_TRN (cpu); return;
5579 
5580 	    case 0x0F:
5581 	      if (INSTR (17, 16) == 0)
5582 		{
5583 		  do_vec_MOV_into_scalar (cpu);
5584 		  return;
5585 		}
5586 	      break;
5587 
5588 	    case 0x00:
5589 	    case 0x08:
5590 	    case 0x10:
5591 	    case 0x18:
5592 	      do_vec_TBL (cpu); return;
5593 
5594 	    case 0x06:
5595 	    case 0x16:
5596 	      do_vec_UZP (cpu); return;
5597 
5598 	    case 0x0E:
5599 	    case 0x1E:
5600 	      do_vec_ZIP (cpu); return;
5601 
5602 	    default:
5603 	      HALT_NYI;
5604 	    }
5605 	}
5606 
5607       switch (INSTR (13, 10))
5608 	{
5609 	case 0x6: do_vec_UZP (cpu); return;
5610 	case 0xE: do_vec_ZIP (cpu); return;
5611 	case 0xA: do_vec_TRN (cpu); return;
5612 	case 0xF: do_vec_UMOV (cpu); return;
5613 	default:  HALT_NYI;
5614 	}
5615     }
5616 
5617   switch (INSTR (15, 10))
5618     {
5619     case 0x02: do_vec_REV64 (cpu); return;
5620     case 0x06: do_vec_REV16 (cpu); return;
5621 
5622     case 0x07:
5623       switch (INSTR (23, 21))
5624 	{
5625 	case 1: do_vec_AND (cpu); return;
5626 	case 3: do_vec_BIC (cpu); return;
5627 	case 5: do_vec_ORR (cpu); return;
5628 	case 7: do_vec_ORN (cpu); return;
5629 	default: HALT_NYI;
5630 	}
5631 
5632     case 0x08: do_vec_sub_long (cpu); return;
5633     case 0x0a: do_vec_XTN (cpu); return;
5634     case 0x11: do_vec_SSHL (cpu); return;
5635     case 0x19: do_vec_max (cpu); return;
5636     case 0x1B: do_vec_min (cpu); return;
5637     case 0x21: do_vec_add (cpu); return;
5638     case 0x25: do_vec_MLA (cpu); return;
5639     case 0x27: do_vec_mul (cpu); return;
5640     case 0x2F: do_vec_ADDP (cpu); return;
5641     case 0x30: do_vec_mull (cpu); return;
5642     case 0x33: do_vec_FMLA (cpu); return;
5643     case 0x35: do_vec_fadd (cpu); return;
5644 
5645     case 0x2E:
5646       switch (INSTR (20, 16))
5647 	{
5648 	case 0x00: do_vec_ABS (cpu); return;
5649 	case 0x01: do_vec_FCVTZS (cpu); return;
5650 	case 0x11: do_vec_ADDV (cpu); return;
5651 	default: HALT_NYI;
5652 	}
5653 
5654     case 0x31:
5655     case 0x3B:
5656       do_vec_Fminmax (cpu); return;
5657 
5658     case 0x0D:
5659     case 0x0F:
5660     case 0x22:
5661     case 0x23:
5662     case 0x26:
5663     case 0x2A:
5664     case 0x32:
5665     case 0x36:
5666     case 0x39:
5667     case 0x3A:
5668       do_vec_compare (cpu); return;
5669 
5670     case 0x3E:
5671       do_vec_FABS (cpu); return;
5672 
5673     default:
5674       HALT_NYI;
5675     }
5676 }
5677 
5678 static void
5679 do_vec_xtl (sim_cpu *cpu)
5680 {
5681   /* instr[31]    = 0
5682      instr[30,29] = SXTL (00), UXTL (01), SXTL2 (10), UXTL2 (11)
5683      instr[28,22] = 0 1111 00
5684      instr[21,16] = size & shift (USHLL, SSHLL, USHLL2, SSHLL2)
5685      instr[15,10] = 1010 01
5686      instr[9,5]   = V source
5687      instr[4,0]   = V dest.  */
5688 
5689   unsigned vs = INSTR (9, 5);
5690   unsigned vd = INSTR (4, 0);
5691   unsigned i, shift, bias = 0;
5692 
5693   NYI_assert (28, 22, 0x3C);
5694   NYI_assert (15, 10, 0x29);
5695 
5696   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5697   switch (INSTR (30, 29))
5698     {
5699     case 2: /* SXTL2, SSHLL2.  */
5700       bias = 2;
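      /* Fall through.  */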
5701     case 0: /* SXTL, SSHLL.  */
5702       if (INSTR (21, 21))
5703 	{
5704 	  int64_t val1, val2;
5705 
5706 	  shift = INSTR (20, 16);
5707 	  /* Get the source values before setting the destination values
5708 	     in case the source and destination are the same.  */
5709 	  val1 = (int64_t) aarch64_get_vec_s32 (cpu, vs, bias) << shift;
5710 	  val2 = (int64_t) aarch64_get_vec_s32 (cpu, vs, bias + 1) << shift;
5711 	  aarch64_set_vec_s64 (cpu, vd, 0, val1);
5712 	  aarch64_set_vec_s64 (cpu, vd, 1, val2);
5713 	}
5714       else if (INSTR (20, 20))
5715 	{
5716 	  int32_t v[4];
5717 	  int32_t v1,v2,v3,v4;
5718 
5719 	  shift = INSTR (19, 16);
5720 	  bias *= 2;
5721 	  for (i = 0; i < 4; i++)
5722 	    v[i] = aarch64_get_vec_s16 (cpu, vs, bias + i) << shift;
5723 	  for (i = 0; i < 4; i++)
5724 	    aarch64_set_vec_s32 (cpu, vd, i, v[i]);
5725 	}
5726       else
5727 	{
5728 	  int16_t v[8];
5729 	  NYI_assert (19, 19, 1);
5730 
5731 	  shift = INSTR (18, 16);
5732 	  bias *= 4;	/* The "2" variants start at byte 8.  */
5733 	  for (i = 0; i < 8; i++)
5734 	    v[i] = aarch64_get_vec_s8 (cpu, vs, i + bias) << shift;
5735 	  for (i = 0; i < 8; i++)
5736 	    aarch64_set_vec_s16 (cpu, vd, i, v[i]);
5737 	}
5738       return;
5739 
5740     case 3: /* UXTL2, USHLL2.  */
5741       bias = 2;
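      /* Fall through.  */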
5742     case 1: /* UXTL, USHLL.  */
5743       if (INSTR (21, 21))
5744 	{
5745 	  uint64_t v1, v2;
5746 	  shift = INSTR (20, 16);
5747 	  v1 = (uint64_t) aarch64_get_vec_u32 (cpu, vs, bias) << shift;
5748 	  v2 = (uint64_t) aarch64_get_vec_u32 (cpu, vs, bias + 1) << shift;
5749 	  aarch64_set_vec_u64 (cpu, vd, 0, v1);
5750 	  aarch64_set_vec_u64 (cpu, vd, 1, v2);
5751 	}
5752       else if (INSTR (20, 20))
5753 	{
5754 	  uint32_t v[4];
5755 	  shift = INSTR (19, 16);
5756 	  bias *= 2;
5757 	  for (i = 0; i < 4; i++)
5758 	    v[i] = aarch64_get_vec_u16 (cpu, vs, i + bias) << shift;
5759 	  for (i = 0; i < 4; i++)
5760 	    aarch64_set_vec_u32 (cpu, vd, i, v[i]);
5761 	}
5762       else
5763 	{
5764 	  uint16_t v[8];
5765 	  NYI_assert (19, 19, 1);
5766 
5767 	  shift = INSTR (18, 16);
5768 	  bias *= 4;	/* The "2" variants start at byte 8.  */
5769 	  for (i = 0; i < 8; i++)
5770 	    v[i] = aarch64_get_vec_u8 (cpu, vs, i + bias) << shift;
5771 	  for (i = 0; i < 8; i++)
5772 	    aarch64_set_vec_u16 (cpu, vd, i, v[i]);
5773 	}
5774       return;
5775     }
5776 }
5777 
5778 static void
5779 do_vec_SHL (sim_cpu *cpu)
5780 {
5781   /* instr [31]    = 0
5782      instr [30]    = half(0)/full(1)
5783      instr [29,23] = 001 1110
5784      instr [22,16] = size and shift amount
5785      instr [15,10] = 01 0101
5786      instr [9, 5]  = Vs
5787      instr [4, 0]  = Vd.  */
5788 
5789   int shift;
5790   int full    = INSTR (30, 30);
5791   unsigned vs = INSTR (9, 5);
5792   unsigned vd = INSTR (4, 0);
5793   unsigned i;
5794 
5795   NYI_assert (29, 23, 0x1E);
5796   NYI_assert (15, 10, 0x15);
5797 
5798   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
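  /* The position of the most significant set bit in instr[22,16]
     selects the element size; the bits below it give the shift
     amount.  */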
5799   if (INSTR (22, 22))
5800     {
5801       shift = INSTR (21, 16);
5802 
5803       if (full == 0)
5804 	HALT_UNALLOC;
5805 
5806       for (i = 0; i < 2; i++)
5807 	{
5808 	  uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
5809 	  aarch64_set_vec_u64 (cpu, vd, i, val << shift);
5810 	}
5811 
5812       return;
5813     }
5814 
5815   if (INSTR (21, 21))
5816     {
5817       shift = INSTR (20, 16);
5818 
5819       for (i = 0; i < (full ? 4 : 2); i++)
5820 	{
5821 	  uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
5822 	  aarch64_set_vec_u32 (cpu, vd, i, val << shift);
5823 	}
5824 
5825       return;
5826     }
5827 
5828   if (INSTR (20, 20))
5829     {
5830       shift = INSTR (19, 16);
5831 
5832       for (i = 0; i < (full ? 8 : 4); i++)
5833 	{
5834 	  uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
5835 	  aarch64_set_vec_u16 (cpu, vd, i, val << shift);
5836 	}
5837 
5838       return;
5839     }
5840 
5841   if (INSTR (19, 19) == 0)
5842     HALT_UNALLOC;
5843 
5844   shift = INSTR (18, 16);
5845 
5846   for (i = 0; i < (full ? 16 : 8); i++)
5847     {
5848       uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
5849       aarch64_set_vec_u8 (cpu, vd, i, val << shift);
5850     }
5851 }
5852 
5853 static void
5854 do_vec_SSHR_USHR (sim_cpu *cpu)
5855 {
5856   /* instr [31]    = 0
5857      instr [30]    = half(0)/full(1)
5858      instr [29]    = signed(0)/unsigned(1)
5859      instr [28,23] = 0 1111 0
5860      instr [22,16] = size and shift amount
5861      instr [15,10] = 0000 01
5862      instr [9, 5]  = Vs
5863      instr [4, 0]  = Vd.  */
5864 
5865   int full       = INSTR (30, 30);
5866   int sign       = ! INSTR (29, 29);
5867   unsigned shift = INSTR (22, 16);
5868   unsigned vs    = INSTR (9, 5);
5869   unsigned vd    = INSTR (4, 0);
5870   unsigned i;
5871 
5872   NYI_assert (28, 23, 0x1E);
5873   NYI_assert (15, 10, 0x01);
5874 
5875   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
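  /* instr[22,16] encodes (2 * esize) - shift, so the shift amount is
     recovered below as 128/64/32/16 minus the field value.  Note that
     the architected shift of exactly esize would map to a C shift by
     the full element width, which is undefined behaviour.  */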
5876   if (INSTR (22, 22))
5877     {
5878       shift = 128 - shift;
5879 
5880       if (full == 0)
5881 	HALT_UNALLOC;
5882 
5883       if (sign)
5884 	for (i = 0; i < 2; i++)
5885 	  {
5886 	    int64_t val = aarch64_get_vec_s64 (cpu, vs, i);
5887 	    aarch64_set_vec_s64 (cpu, vd, i, val >> shift);
5888 	  }
5889       else
5890 	for (i = 0; i < 2; i++)
5891 	  {
5892 	    uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
5893 	    aarch64_set_vec_u64 (cpu, vd, i, val >> shift);
5894 	  }
5895 
5896       return;
5897     }
5898 
5899   if (INSTR (21, 21))
5900     {
5901       shift = 64 - shift;
5902 
5903       if (sign)
5904 	for (i = 0; i < (full ? 4 : 2); i++)
5905 	  {
5906 	    int32_t val = aarch64_get_vec_s32 (cpu, vs, i);
5907 	    aarch64_set_vec_s32 (cpu, vd, i, val >> shift);
5908 	  }
5909       else
5910 	for (i = 0; i < (full ? 4 : 2); i++)
5911 	  {
5912 	    uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
5913 	    aarch64_set_vec_u32 (cpu, vd, i, val >> shift);
5914 	  }
5915 
5916       return;
5917     }
5918 
5919   if (INSTR (20, 20))
5920     {
5921       shift = 32 - shift;
5922 
5923       if (sign)
5924 	for (i = 0; i < (full ? 8 : 4); i++)
5925 	  {
5926 	    int16_t val = aarch64_get_vec_s16 (cpu, vs, i);
5927 	    aarch64_set_vec_s16 (cpu, vd, i, val >> shift);
5928 	  }
5929       else
5930 	for (i = 0; i < (full ? 8 : 4); i++)
5931 	  {
5932 	    uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
5933 	    aarch64_set_vec_u16 (cpu, vd, i, val >> shift);
5934 	  }
5935 
5936       return;
5937     }
5938 
5939   if (INSTR (19, 19) == 0)
5940     HALT_UNALLOC;
5941 
5942   shift = 16 - shift;
5943 
5944   if (sign)
5945     for (i = 0; i < (full ? 16 : 8); i++)
5946       {
5947 	int8_t val = aarch64_get_vec_s8 (cpu, vs, i);
5948 	aarch64_set_vec_s8 (cpu, vd, i, val >> shift);
5949       }
5950   else
5951     for (i = 0; i < (full ? 16 : 8); i++)
5952       {
5953 	uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
5954 	aarch64_set_vec_u8 (cpu, vd, i, val >> shift);
5955       }
5956 }
5957 
5958 static void
5959 do_vec_MUL_by_element (sim_cpu *cpu)
5960 {
5961   /* instr[31]    = 0
5962      instr[30]    = half/full
5963      instr[29,24] = 00 1111
5964      instr[23,22] = size
5965      instr[21]    = L
5966      instr[20]    = M
5967      instr[19,16] = m
5968      instr[15,12] = 1000
5969      instr[11]    = H
5970      instr[10]    = 0
5971      instr[9,5]   = Vn
5972      instr[4,0]   = Vd  */
5973 
5974   unsigned full     = INSTR (30, 30);
5975   unsigned L        = INSTR (21, 21);
5976   unsigned H        = INSTR (11, 11);
5977   unsigned vn       = INSTR (9, 5);
5978   unsigned vd       = INSTR (4, 0);
5979   unsigned size     = INSTR (23, 22);
5980   unsigned index;
5981   unsigned vm;
5982   unsigned e;
5983 
5984   NYI_assert (29, 24, 0x0F);
5985   NYI_assert (15, 12, 0x8);
5986   NYI_assert (10, 10, 0);
5987 
5988   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5989   switch (size)
5990     {
5991     case 1:
5992       {
5993 	/* 16 bit products.  */
5994 	uint16_t product;
5995 	uint16_t element1;
5996 	uint16_t element2;
5997 
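	/* For 16-bit lanes the index is H:L:M and Vm is the 4-bit
	   field instr[19,16].  */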
5998 	index = (H << 2) | (L << 1) | INSTR (20, 20);
5999 	vm = INSTR (19, 16);
6000 	element2 = aarch64_get_vec_u16 (cpu, vm, index);
6001 
6002 	for (e = 0; e < (full ? 8 : 4); e ++)
6003 	  {
6004 	    element1 = aarch64_get_vec_u16 (cpu, vn, e);
6005 	    product  = element1 * element2;
6006 	    aarch64_set_vec_u16 (cpu, vd, e, product);
6007 	  }
6008       }
6009       break;
6010 
6011     case 2:
6012       {
6013 	/* 32 bit products.  */
6014 	uint32_t product;
6015 	uint32_t element1;
6016 	uint32_t element2;
6017 
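	/* For 32-bit lanes the index is H:L; M is part of the 5-bit
	   Vm field.  */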
6018 	index = (H << 1) | L;
6019 	vm = INSTR (20, 16);
6020 	element2 = aarch64_get_vec_u32 (cpu, vm, index);
6021 
6022 	for (e = 0; e < (full ? 4 : 2); e ++)
6023 	  {
6024 	    element1 = aarch64_get_vec_u32 (cpu, vn, e);
6025 	    product  = element1 * element2;
6026 	    aarch64_set_vec_u32 (cpu, vd, e, product);
6027 	  }
6028       }
6029       break;
6030 
6031     default:
6032       HALT_UNALLOC;
6033     }
6034 }
6035 
6036 static void
6037 do_FMLA_by_element (sim_cpu *cpu)
6038 {
6039   /* instr[31]    = 0
6040      instr[30]    = half/full
6041      instr[29,23] = 00 1111 1
6042      instr[22]    = size
6043      instr[21]    = L
6044      instr[20,16] = m
6045      instr[15,12] = 0001
6046      instr[11]    = H
6047      instr[10]    = 0
6048      instr[9,5]   = Vn
6049      instr[4,0]   = Vd  */
6050 
6051   unsigned full     = INSTR (30, 30);
6052   unsigned size     = INSTR (22, 22);
6053   unsigned L        = INSTR (21, 21);
6054   unsigned vm       = INSTR (20, 16);
6055   unsigned H        = INSTR (11, 11);
6056   unsigned vn       = INSTR (9, 5);
6057   unsigned vd       = INSTR (4, 0);
6058   unsigned e;
6059 
6060   NYI_assert (29, 23, 0x1F);
6061   NYI_assert (15, 12, 0x1);
6062   NYI_assert (10, 10, 0);
6063 
6064   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6065   if (size)
6066     {
6067       double element1, element2;
6068 
6069       if (! full || L)
6070 	HALT_UNALLOC;
6071 
6072       element2 = aarch64_get_vec_double (cpu, vm, H);
6073 
6074       for (e = 0; e < 2; e++)
6075 	{
6076 	  element1 = aarch64_get_vec_double (cpu, vn, e);
6077 	  element1 *= element2;
6078 	  element1 += aarch64_get_vec_double (cpu, vd, e);
6079 	  aarch64_set_vec_double (cpu, vd, e, element1);
6080 	}
6081     }
6082   else
6083     {
6084       float element1;
6085       float element2 = aarch64_get_vec_float (cpu, vm, (H << 1) | L);
6086 
6087       for (e = 0; e < (full ? 4 : 2); e++)
6088 	{
6089 	  element1 = aarch64_get_vec_float (cpu, vn, e);
6090 	  element1 *= element2;
6091 	  element1 += aarch64_get_vec_float (cpu, vd, e);
6092 	  aarch64_set_vec_float (cpu, vd, e, element1);
6093 	}
6094     }
6095 }
6096 
6097 static void
6098 do_vec_op2 (sim_cpu *cpu)
6099 {
6100   /* instr[31]    = 0
6101      instr[30]    = half/full
6102      instr[29,24] = 00 1111
6103      instr[23]    = ?
6104      instr[22,16] = element size & index
6105      instr[15,10] = sub-opcode
6106      instr[9,5]   = Vn
6107      instr[4,0]   = Vd  */
6108 
6109   NYI_assert (29, 24, 0x0F);
6110 
6111   if (INSTR (23, 23) != 0)
6112     {
6113       switch (INSTR (15, 10))
6114 	{
6115 	case 0x04:
6116 	case 0x06:
6117 	  do_FMLA_by_element (cpu);
6118 	  return;
6119 
6120 	case 0x20:
6121 	case 0x22:
6122 	  do_vec_MUL_by_element (cpu);
6123 	  return;
6124 
6125 	default:
6126 	  HALT_NYI;
6127 	}
6128     }
6129   else
6130     {
6131       switch (INSTR (15, 10))
6132 	{
6133 	case 0x01: do_vec_SSHR_USHR (cpu); return;
6134 	case 0x15: do_vec_SHL (cpu); return;
6135 	case 0x20:
6136 	case 0x22: do_vec_MUL_by_element (cpu); return;
6137 	case 0x29: do_vec_xtl (cpu); return;
6138 	default:   HALT_NYI;
6139 	}
6140     }
6141 }
6142 
6143 static void
6144 do_vec_neg (sim_cpu *cpu)
6145 {
6146   /* instr[31]    = 0
6147      instr[30]    = full(1)/half(0)
6148      instr[29,24] = 10 1110
6149      instr[23,22] = size: byte(00), half (01), word (10), long (11)
6150      instr[21,10] = 1000 0010 1110
6151      instr[9,5]   = Vs
6152      instr[4,0]   = Vd  */
6153 
6154   int    full = INSTR (30, 30);
6155   unsigned vs = INSTR (9, 5);
6156   unsigned vd = INSTR (4, 0);
6157   unsigned i;
6158 
6159   NYI_assert (29, 24, 0x2E);
6160   NYI_assert (21, 10, 0x82E);
6161 
6162   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6163   switch (INSTR (23, 22))
6164     {
6165     case 0:
6166       for (i = 0; i < (full ? 16 : 8); i++)
6167 	aarch64_set_vec_s8 (cpu, vd, i, - aarch64_get_vec_s8 (cpu, vs, i));
6168       return;
6169 
6170     case 1:
6171       for (i = 0; i < (full ? 8 : 4); i++)
6172 	aarch64_set_vec_s16 (cpu, vd, i, - aarch64_get_vec_s16 (cpu, vs, i));
6173       return;
6174 
6175     case 2:
6176       for (i = 0; i < (full ? 4 : 2); i++)
6177 	aarch64_set_vec_s32 (cpu, vd, i, - aarch64_get_vec_s32 (cpu, vs, i));
6178       return;
6179 
6180     case 3:
6181       if (! full)
6182 	HALT_NYI;
6183       for (i = 0; i < 2; i++)
6184 	aarch64_set_vec_s64 (cpu, vd, i, - aarch64_get_vec_s64 (cpu, vs, i));
6185       return;
6186     }
6187 }
6188 
6189 static void
6190 do_vec_sqrt (sim_cpu *cpu)
6191 {
6192   /* instr[31]    = 0
6193      instr[30]    = full(1)/half(0)
6194      instr[29,23] = 101 1101
6195      instr[22]    = single(0)/double(1)
6196      instr[21,10] = 1000 0111 1110
6197      instr[9,5]   = Vs
6198      instr[4,0]   = Vd.  */
6199 
6200   int    full = INSTR (30, 30);
6201   unsigned vs = INSTR (9, 5);
6202   unsigned vd = INSTR (4, 0);
6203   unsigned i;
6204 
6205   NYI_assert (29, 23, 0x5D);
6206   NYI_assert (21, 10, 0x87E);
6207 
6208   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6209   if (INSTR (22, 22) == 0)
6210     for (i = 0; i < (full ? 4 : 2); i++)
6211       aarch64_set_vec_float (cpu, vd, i,
6212 			     sqrtf (aarch64_get_vec_float (cpu, vs, i)));
6213   else
6214     for (i = 0; i < 2; i++)
6215       aarch64_set_vec_double (cpu, vd, i,
6216 			      sqrt (aarch64_get_vec_double (cpu, vs, i)));
6217 }
6218 
6219 static void
6220 do_vec_mls_indexed (sim_cpu *cpu)
6221 {
6222   /* instr[31]       = 0
6223      instr[30]       = half(0)/full(1)
6224      instr[29,24]    = 10 1111
6225      instr[23,22]    = 16-bit(01)/32-bit(10)
6226      instr[11,21,20] = H:L:M index (if 16-bit)
6227      instr[11,21]    = H:L index (if 32-bit)
6228      instr[20,16]    = Vm
6229      instr[15,12]    = 0100
6230      instr[11]       = part of index
6231      instr[10]       = 0
6232      instr[9,5]      = Vs
6233      instr[4,0]      = Vd.  */
6234 
6235   int    full = INSTR (30, 30);
6236   unsigned vs = INSTR (9, 5);
6237   unsigned vd = INSTR (4, 0);
6238   unsigned vm = INSTR (20, 16);
6239   unsigned i;
6240 
6241   NYI_assert (15, 12, 4);
6242   NYI_assert (10, 10, 0);
6243 
6244   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6245   switch (INSTR (23, 22))
6246     {
6247     case 1:
6248       {
6249 	unsigned elem;
6250 	uint32_t val;
6251 
6252 	if (vm > 15)
6253 	  HALT_NYI;
6254 
6255 	elem = (INSTR (11, 11) << 2) | INSTR (21, 20);	/* H:L:M.  */
6256 	val = aarch64_get_vec_u16 (cpu, vm, elem);
6257 
6258 	for (i = 0; i < (full ? 8 : 4); i++)
6259 	  aarch64_set_vec_u16 (cpu, vd, i,
6260 			       aarch64_get_vec_u16 (cpu, vd, i)
6261 			       - (aarch64_get_vec_u16 (cpu, vs, i) * val));
6262 	return;
6263       }
6264 
6265     case 2:
6266       {
6267 	unsigned elem = (INSTR (11, 11) << 1) | INSTR (21, 21);	/* H:L.  */
6268 	uint32_t val = aarch64_get_vec_u32 (cpu, vm, elem);
6269 
6270 	for (i = 0; i < (full ? 4 : 2); i++)
6271 	  aarch64_set_vec_u32 (cpu, vd, i,
6272 			       aarch64_get_vec_u32 (cpu, vd, i)
6273 			       - (aarch64_get_vec_u32 (cpu, vs, i) * val));
6274 	return;
6275       }
6276 
6277     case 0:
6278     case 3:
6279     default:
6280       HALT_NYI;
6281     }
6282 }
6283 
6284 static void
6285 do_vec_SUB (sim_cpu *cpu)
6286 {
6287   /* instr [31]    = 0
6288      instr [30]    = half(0)/full(1)
6289      instr [29,24] = 10 1110
6290      instr [23,22] = size: byte(00), half(01), word(10), long(11)
6291      instr [21]    = 1
6292      instr [20,16] = Vm
6293      instr [15,10] = 10 0001
6294      instr [9, 5]  = Vn
6295      instr [4, 0]  = Vd.  */
6296 
6297   unsigned full = INSTR (30, 30);
6298   unsigned vm = INSTR (20, 16);
6299   unsigned vn = INSTR (9, 5);
6300   unsigned vd = INSTR (4, 0);
6301   unsigned i;
6302 
6303   NYI_assert (29, 24, 0x2E);
6304   NYI_assert (21, 21, 1);
6305   NYI_assert (15, 10, 0x21);
6306 
6307   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6308   switch (INSTR (23, 22))
6309     {
6310     case 0:
6311       for (i = 0; i < (full ? 16 : 8); i++)
6312 	aarch64_set_vec_s8 (cpu, vd, i,
6313 			    aarch64_get_vec_s8 (cpu, vn, i)
6314 			    - aarch64_get_vec_s8 (cpu, vm, i));
6315       return;
6316 
6317     case 1:
6318       for (i = 0; i < (full ? 8 : 4); i++)
6319 	aarch64_set_vec_s16 (cpu, vd, i,
6320 			     aarch64_get_vec_s16 (cpu, vn, i)
6321 			     - aarch64_get_vec_s16 (cpu, vm, i));
6322       return;
6323 
6324     case 2:
6325       for (i = 0; i < (full ? 4 : 2); i++)
6326 	aarch64_set_vec_s32 (cpu, vd, i,
6327 			     aarch64_get_vec_s32 (cpu, vn, i)
6328 			     - aarch64_get_vec_s32 (cpu, vm, i));
6329       return;
6330 
6331     case 3:
6332       if (full == 0)
6333 	HALT_UNALLOC;
6334 
6335       for (i = 0; i < 2; i++)
6336 	aarch64_set_vec_s64 (cpu, vd, i,
6337 			     aarch64_get_vec_s64 (cpu, vn, i)
6338 			     - aarch64_get_vec_s64 (cpu, vm, i));
6339       return;
6340     }
6341 }
6342 
6343 static void
6344 do_vec_MLS (sim_cpu *cpu)
6345 {
6346   /* instr [31]    = 0
6347      instr [30]    = half(0)/full(1)
6348      instr [29,24] = 10 1110
6349      instr [23,22] = size: byte(00), half(01), word(10)
6350      instr [21]    = 1
6351      instr [20,16] = Vm
6352      instr [15,10] = 10 0101
6353      instr [9, 5]  = Vn
6354      instr [4, 0]  = Vd.  */
6355 
6356   unsigned full = INSTR (30, 30);
6357   unsigned vm = INSTR (20, 16);
6358   unsigned vn = INSTR (9, 5);
6359   unsigned vd = INSTR (4, 0);
6360   unsigned i;
6361 
6362   NYI_assert (29, 24, 0x2E);
6363   NYI_assert (21, 21, 1);
6364   NYI_assert (15, 10, 0x25);
6365 
6366   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
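  /* MLS is an element-wise multiply-subtract: Vd[i] -= Vn[i] * Vm[i].  */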
6367   switch (INSTR (23, 22))
6368     {
6369     case 0:
6370       for (i = 0; i < (full ? 16 : 8); i++)
6371 	aarch64_set_vec_u8 (cpu, vd, i,
6372 			    aarch64_get_vec_u8 (cpu, vd, i)
6373 			    - (aarch64_get_vec_u8 (cpu, vn, i)
6374 			       * aarch64_get_vec_u8 (cpu, vm, i)));
6375       return;
6376 
6377     case 1:
6378       for (i = 0; i < (full ? 8 : 4); i++)
6379 	aarch64_set_vec_u16 (cpu, vd, i,
6380 			     aarch64_get_vec_u16 (cpu, vd, i)
6381 			     - ((uint32_t) aarch64_get_vec_u16 (cpu, vn, i)
6382 				* aarch64_get_vec_u16 (cpu, vm, i)));
6383       return;
6384 
6385     case 2:
6386       for (i = 0; i < (full ? 4 : 2); i++)
6387 	aarch64_set_vec_u32 (cpu, vd, i,
6388 			     aarch64_get_vec_u32 (cpu, vd, i)
6389 			     - (aarch64_get_vec_u32 (cpu, vn, i)
6390 				* aarch64_get_vec_u32 (cpu, vm, i)));
6391       return;
6392 
6393     default:
6394       HALT_UNALLOC;
6395     }
6396 }
6397 
6398 static void
6399 do_vec_FDIV (sim_cpu *cpu)
6400 {
6401   /* instr [31]    = 0
6402      instr [30]    = half(0)/full(1)
6403      instr [29,23] = 10 1110 0
6404      instr [22]    = float(0)/double(1)
6405      instr [21]    = 1
6406      instr [20,16] = Vm
6407      instr [15,10] = 1111 11
6408      instr [9, 5]  = Vn
6409      instr [4, 0]  = Vd.  */
6410 
6411   unsigned full = INSTR (30, 30);
6412   unsigned vm = INSTR (20, 16);
6413   unsigned vn = INSTR (9, 5);
6414   unsigned vd = INSTR (4, 0);
6415   unsigned i;
6416 
6417   NYI_assert (29, 23, 0x5C);
6418   NYI_assert (21, 21, 1);
6419   NYI_assert (15, 10, 0x3F);
6420 
6421   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6422   if (INSTR (22, 22))
6423     {
6424       if (! full)
6425 	HALT_UNALLOC;
6426 
6427       for (i = 0; i < 2; i++)
6428 	aarch64_set_vec_double (cpu, vd, i,
6429 				aarch64_get_vec_double (cpu, vn, i)
6430 				/ aarch64_get_vec_double (cpu, vm, i));
6431     }
6432   else
6433     for (i = 0; i < (full ? 4 : 2); i++)
6434       aarch64_set_vec_float (cpu, vd, i,
6435 			     aarch64_get_vec_float (cpu, vn, i)
6436 			     / aarch64_get_vec_float (cpu, vm, i));
6437 }
6438 
6439 static void
6440 do_vec_FMUL (sim_cpu *cpu)
6441 {
6442   /* instr [31]    = 0
6443      instr [30]    = half(0)/full(1)
6444      instr [29,23] = 10 1110 0
6445      instr [22]    = float(0)/double(1)
6446      instr [21]    = 1
6447      instr [20,16] = Vm
6448      instr [15,10] = 1101 11
6449      instr [9, 5]  = Vn
6450      instr [4, 0]  = Vd.  */
6451 
6452   unsigned full = INSTR (30, 30);
6453   unsigned vm = INSTR (20, 16);
6454   unsigned vn = INSTR (9, 5);
6455   unsigned vd = INSTR (4, 0);
6456   unsigned i;
6457 
6458   NYI_assert (29, 23, 0x5C);
6459   NYI_assert (21, 21, 1);
6460   NYI_assert (15, 10, 0x37);
6461 
6462   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6463   if (INSTR (22, 22))
6464     {
6465       if (! full)
6466 	HALT_UNALLOC;
6467 
6468       for (i = 0; i < 2; i++)
6469 	aarch64_set_vec_double (cpu, vd, i,
6470 				aarch64_get_vec_double (cpu, vn, i)
6471 				* aarch64_get_vec_double (cpu, vm, i));
6472     }
6473   else
6474     for (i = 0; i < (full ? 4 : 2); i++)
6475       aarch64_set_vec_float (cpu, vd, i,
6476 			     aarch64_get_vec_float (cpu, vn, i)
6477 			     * aarch64_get_vec_float (cpu, vm, i));
6478 }
6479 
6480 static void
6481 do_vec_FADDP (sim_cpu *cpu)
6482 {
6483   /* instr [31]    = 0
6484      instr [30]    = half(0)/full(1)
6485      instr [29,23] = 10 1110 0
6486      instr [22]    = float(0)/double(1)
6487      instr [21]    = 1
6488      instr [20,16] = Vm
6489      instr [15,10] = 1101 01
6490      instr [9, 5]  = Vn
6491      instr [4, 0]  = Vd.  */
6492 
6493   unsigned full = INSTR (30, 30);
6494   unsigned vm = INSTR (20, 16);
6495   unsigned vn = INSTR (9, 5);
6496   unsigned vd = INSTR (4, 0);
6497 
6498   NYI_assert (29, 23, 0x5C);
6499   NYI_assert (21, 21, 1);
6500   NYI_assert (15, 10, 0x35);
6501 
6502   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6503   if (INSTR (22, 22))
6504     {
6505       /* Extract values before adding them in case vd == vn/vm.  */
6506       double tmp1 = aarch64_get_vec_double (cpu, vn, 0);
6507       double tmp2 = aarch64_get_vec_double (cpu, vn, 1);
6508       double tmp3 = aarch64_get_vec_double (cpu, vm, 0);
6509       double tmp4 = aarch64_get_vec_double (cpu, vm, 1);
6510 
6511       if (! full)
6512 	HALT_UNALLOC;
6513 
6514       aarch64_set_vec_double (cpu, vd, 0, tmp1 + tmp2);
6515       aarch64_set_vec_double (cpu, vd, 1, tmp3 + tmp4);
6516     }
6517   else
6518     {
6519       /* Extract values before adding them in case vd == vn/vm.  */
6520       float tmp1 = aarch64_get_vec_float (cpu, vn, 0);
6521       float tmp2 = aarch64_get_vec_float (cpu, vn, 1);
6522       float tmp5 = aarch64_get_vec_float (cpu, vm, 0);
6523       float tmp6 = aarch64_get_vec_float (cpu, vm, 1);
6524 
6525       if (full)
6526 	{
6527 	  float tmp3 = aarch64_get_vec_float (cpu, vn, 2);
6528 	  float tmp4 = aarch64_get_vec_float (cpu, vn, 3);
6529 	  float tmp7 = aarch64_get_vec_float (cpu, vm, 2);
6530 	  float tmp8 = aarch64_get_vec_float (cpu, vm, 3);
6531 
6532 	  aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2);
6533 	  aarch64_set_vec_float (cpu, vd, 1, tmp3 + tmp4);
6534 	  aarch64_set_vec_float (cpu, vd, 2, tmp5 + tmp6);
6535 	  aarch64_set_vec_float (cpu, vd, 3, tmp7 + tmp8);
6536 	}
6537       else
6538 	{
6539 	  aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2);
6540 	  aarch64_set_vec_float (cpu, vd, 1, tmp5 + tmp6);
6541 	}
6542     }
6543 }
6544 
6545 static void
6546 do_vec_FSQRT (sim_cpu *cpu)
6547 {
6548   /* instr[31]    = 0
6549      instr[30]    = half(0)/full(1)
6550      instr[29,23] = 10 1110 1
6551      instr[22]    = single(0)/double(1)
6552      instr[21,10] = 10 0001 1111 10
6553      instr[9,5]   = Vsrc
6554      instr[4,0]   = Vdest.  */
6555 
6556   unsigned vn = INSTR (9, 5);
6557   unsigned vd = INSTR (4, 0);
6558   unsigned full = INSTR (30, 30);
6559   int i;
6560 
6561   NYI_assert (29, 23, 0x5D);
6562   NYI_assert (21, 10, 0x87E);
6563 
6564   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6565   if (INSTR (22, 22))
6566     {
6567       if (! full)
6568 	HALT_UNALLOC;
6569 
6570       for (i = 0; i < 2; i++)
6571 	aarch64_set_vec_double (cpu, vd, i,
6572 				sqrt (aarch64_get_vec_double (cpu, vn, i)));
6573     }
6574   else
6575     {
6576       for (i = 0; i < (full ? 4 : 2); i++)
6577 	aarch64_set_vec_float (cpu, vd, i,
6578 			       sqrtf (aarch64_get_vec_float (cpu, vn, i)));
6579     }
6580 }
6581 
6582 static void
6583 do_vec_FNEG (sim_cpu *cpu)
6584 {
6585   /* instr[31]    = 0
6586      instr[30]    = half (0)/full (1)
6587      instr[29,23] = 10 1110 1
6588      instr[22]    = single (0)/double (1)
6589      instr[21,10] = 10 0000 1111 10
6590      instr[9,5]   = Vsrc
6591      instr[4,0]   = Vdest.  */
6592 
6593   unsigned vn = INSTR (9, 5);
6594   unsigned vd = INSTR (4, 0);
6595   unsigned full = INSTR (30, 30);
6596   int i;
6597 
6598   NYI_assert (29, 23, 0x5D);
6599   NYI_assert (21, 10, 0x83E);
6600 
6601   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6602   if (INSTR (22, 22))
6603     {
6604       if (! full)
6605 	HALT_UNALLOC;
6606 
6607       for (i = 0; i < 2; i++)
6608 	aarch64_set_vec_double (cpu, vd, i,
6609 				- aarch64_get_vec_double (cpu, vn, i));
6610     }
6611   else
6612     {
6613       for (i = 0; i < (full ? 4 : 2); i++)
6614 	aarch64_set_vec_float (cpu, vd, i,
6615 			       - aarch64_get_vec_float (cpu, vn, i));
6616     }
6617 }
6618 
6619 static void
6620 do_vec_NOT (sim_cpu *cpu)
6621 {
6622   /* instr[31]    = 0
6623      instr[30]    = half (0)/full (1)
6624      instr[29,10] = 10 1110 0010 0000 0101 10
6625      instr[9,5]   = Vn
6626      instr[4,0]   = Vd.  */
6627 
6628   unsigned vn = INSTR (9, 5);
6629   unsigned vd = INSTR (4, 0);
6630   unsigned i;
6631   int      full = INSTR (30, 30);
6632 
6633   NYI_assert (29, 10, 0xB8816);
6634 
6635   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6636   for (i = 0; i < (full ? 16 : 8); i++)
6637     aarch64_set_vec_u8 (cpu, vd, i, ~ aarch64_get_vec_u8 (cpu, vn, i));
6638 }
6639 
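/* Return the number of leading zero bits in VAL, treated as a SIZE-bit
   wide value; returns SIZE when VAL is zero.  */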
6640 static unsigned int
6641 clz (uint64_t val, unsigned size)
6642 {
6643   uint64_t mask = 1;
6644   int      count;
6645 
6646   mask <<= (size - 1);
6647   count = 0;
6648   do
6649     {
6650       if (val & mask)
6651 	break;
6652       mask >>= 1;
6653       count ++;
6654     }
6655   while (mask);
6656 
6657   return count;
6658 }
6659 
6660 static void
6661 do_vec_CLZ (sim_cpu *cpu)
6662 {
6663   /* instr[31]    = 0
6664      instr[30]    = half (0)/full (1)
6665      instr[29,24] = 10 1110
6666      instr[23,22] = size
6667      instr[21,10] = 10 0000 0100 10
6668      instr[9,5]   = Vn
6669      instr[4,0]   = Vd.  */
6670 
6671   unsigned vn = INSTR (9, 5);
6672   unsigned vd = INSTR (4, 0);
6673   unsigned i;
6674   int      full = INSTR (30,30);
6675 
6676   NYI_assert (29, 24, 0x2E);
6677   NYI_assert (21, 10, 0x812);
6678 
6679   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6680   switch (INSTR (23, 22))
6681     {
6682     case 0:
6683       for (i = 0; i < (full ? 16 : 8); i++)
6684 	aarch64_set_vec_u8 (cpu, vd, i, clz (aarch64_get_vec_u8 (cpu, vn, i), 8));
6685       break;
6686     case 1:
6687       for (i = 0; i < (full ? 8 : 4); i++)
6688 	aarch64_set_vec_u16 (cpu, vd, i, clz (aarch64_get_vec_u16 (cpu, vn, i), 16));
6689       break;
6690     case 2:
6691       for (i = 0; i < (full ? 4 : 2); i++)
6692 	aarch64_set_vec_u32 (cpu, vd, i, clz (aarch64_get_vec_u32 (cpu, vn, i), 32));
6693       break;
6694     case 3:
6695       if (! full)
6696 	HALT_UNALLOC;
6697       aarch64_set_vec_u64 (cpu, vd, 0, clz (aarch64_get_vec_u64 (cpu, vn, 0), 64));
6698       aarch64_set_vec_u64 (cpu, vd, 1, clz (aarch64_get_vec_u64 (cpu, vn, 1), 64));
6699       break;
6700     }
6701 }
6702 
6703 static void
6704 do_vec_MOV_element (sim_cpu *cpu)
6705 {
6706   /* instr[31,21] = 0110 1110 000
6707      instr[20,16] = size & dest index
6708      instr[15]    = 0
6709      instr[14,11] = source index
6710      instr[10]    = 1
6711      instr[9,5]   = Vs
6712      instr[4,0]   = Vd.  */
6713 
6714   unsigned vs = INSTR (9, 5);
6715   unsigned vd = INSTR (4, 0);
6716   unsigned src_index;
6717   unsigned dst_index;
6718 
6719   NYI_assert (31, 21, 0x370);
6720   NYI_assert (15, 15, 0);
6721   NYI_assert (10, 10, 1);
6722 
6723   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6724   if (INSTR (16, 16))
6725     {
6726       /* Move a byte.  */
6727       src_index = INSTR (14, 11);
6728       dst_index = INSTR (20, 17);
6729       aarch64_set_vec_u8 (cpu, vd, dst_index,
6730 			  aarch64_get_vec_u8 (cpu, vs, src_index));
6731     }
6732   else if (INSTR (17, 17))
6733     {
6734       /* Move 16-bits.  */
6735       NYI_assert (11, 11, 0);
6736       src_index = INSTR (14, 12);
6737       dst_index = INSTR (20, 18);
6738       aarch64_set_vec_u16 (cpu, vd, dst_index,
6739 			   aarch64_get_vec_u16 (cpu, vs, src_index));
6740     }
6741   else if (INSTR (18, 18))
6742     {
6743       /* Move 32-bits.  */
6744       NYI_assert (12, 11, 0);
6745       src_index = INSTR (14, 13);
6746       dst_index = INSTR (20, 19);
6747       aarch64_set_vec_u32 (cpu, vd, dst_index,
6748 			   aarch64_get_vec_u32 (cpu, vs, src_index));
6749     }
6750   else
6751     {
6752       NYI_assert (19, 19, 1);
6753       NYI_assert (13, 11, 0);
6754       src_index = INSTR (14, 14);
6755       dst_index = INSTR (20, 20);
6756       aarch64_set_vec_u64 (cpu, vd, dst_index,
6757 			   aarch64_get_vec_u64 (cpu, vs, src_index));
6758     }
6759 }
6760 
6761 static void
6762 do_vec_REV32 (sim_cpu *cpu)
6763 {
6764   /* instr[31]    = 0
6765      instr[30]    = full/half
6766      instr[29,24] = 10 1110
6767      instr[23,22] = size
6768      instr[21,10] = 10 0000 0000 10
6769      instr[9,5]   = Rn
6770      instr[4,0]   = Rd.  */
6771 
6772   unsigned rn = INSTR (9, 5);
6773   unsigned rd = INSTR (4, 0);
6774   unsigned size = INSTR (23, 22);
6775   unsigned full = INSTR (30, 30);
6776   unsigned i;
6777   FRegister val;
6778 
6779   NYI_assert (29, 24, 0x2E);
6780   NYI_assert (21, 10, 0x802);
6781 
6782   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
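  /* XORing the index reverses the elements within each 32-bit
     container: 3 for bytes, 1 for half-words.  */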
6783   switch (size)
6784     {
6785     case 0:
6786       for (i = 0; i < (full ? 16 : 8); i++)
6787 	val.b[i ^ 0x3] = aarch64_get_vec_u8 (cpu, rn, i);
6788       break;
6789 
6790     case 1:
6791       for (i = 0; i < (full ? 8 : 4); i++)
6792 	val.h[i ^ 0x1] = aarch64_get_vec_u16 (cpu, rn, i);
6793       break;
6794 
6795     default:
6796       HALT_UNALLOC;
6797     }
6798 
6799   aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
6800   if (full)
6801     aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
6802 }
6803 
6804 static void
6805 do_vec_EXT (sim_cpu *cpu)
6806 {
6807   /* instr[31]    = 0
6808      instr[30]    = full/half
6809      instr[29,21] = 10 1110 000
6810      instr[20,16] = Vm
6811      instr[15]    = 0
6812      instr[14,11] = source index
6813      instr[10]    = 0
6814      instr[9,5]   = Vn
6815      instr[4,0]   = Vd.  */
6816 
6817   unsigned vm = INSTR (20, 16);
6818   unsigned vn = INSTR (9, 5);
6819   unsigned vd = INSTR (4, 0);
6820   unsigned src_index = INSTR (14, 11);
6821   unsigned full = INSTR (30, 30);
6822   unsigned i;
6823   unsigned j;
6824   FRegister val;
6825 
6826   NYI_assert (31, 21, 0x370);
6827   NYI_assert (15, 15, 0);
6828   NYI_assert (10, 10, 0);
6829 
6830   if (!full && (src_index & 0x8))
6831     HALT_UNALLOC;
6832 
6833   j = 0;
6834 
6835   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
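  /* The result is a byte-wise extract from the pair Vm:Vn (Vm high):
     bytes src_index..top of Vn followed by the low bytes of Vm.  */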
6836   for (i = src_index; i < (full ? 16 : 8); i++)
6837     val.b[j ++] = aarch64_get_vec_u8 (cpu, vn, i);
6838   for (i = 0; i < src_index; i++)
6839     val.b[j ++] = aarch64_get_vec_u8 (cpu, vm, i);
6840 
6841   aarch64_set_vec_u64 (cpu, vd, 0, val.v[0]);
6842   if (full)
6843     aarch64_set_vec_u64 (cpu, vd, 1, val.v[1]);
6844 }
6845 
6846 static void
6847 dexAdvSIMD0 (sim_cpu *cpu)
6848 {
6849   /* instr [28,25] = 0 111.  */
6850   if (    INSTR (15, 10) == 0x07
6851       && (INSTR (9, 5) ==
6852 	  INSTR (20, 16)))
6853     {
6854       if (INSTR (31, 21) == 0x075
6855 	  || INSTR (31, 21) == 0x275)
6856 	{
6857 	  do_vec_MOV_whole_vector (cpu);
6858 	  return;
6859 	}
6860     }
6861 
6862   if (INSTR (29, 19) == 0x1E0)
6863     {
6864       do_vec_MOV_immediate (cpu);
6865       return;
6866     }
6867 
6868   if (INSTR (29, 19) == 0x5E0)
6869     {
6870       do_vec_MVNI (cpu);
6871       return;
6872     }
6873 
6874   if (INSTR (29, 19) == 0x1C0
6875       || INSTR (29, 19) == 0x1C1)
6876     {
6877       if (INSTR (15, 10) == 0x03)
6878 	{
6879 	  do_vec_DUP_scalar_into_vector (cpu);
6880 	  return;
6881 	}
6882     }
6883 
6884   switch (INSTR (29, 24))
6885     {
6886     case 0x0E: do_vec_op1 (cpu); return;
6887     case 0x0F: do_vec_op2 (cpu); return;
6888 
6889     case 0x2E:
6890       if (INSTR (21, 21) == 1)
6891 	{
6892 	  switch (INSTR (15, 10))
6893 	    {
6894 	    case 0x02:
6895 	      do_vec_REV32 (cpu);
6896 	      return;
6897 
6898 	    case 0x07:
6899 	      switch (INSTR (23, 22))
6900 		{
6901 		case 0: do_vec_EOR (cpu); return;
6902 		case 1: do_vec_BSL (cpu); return;
6903 		case 2:
6904 		case 3: do_vec_bit (cpu); return;
6905 		}
6906 	      break;
6907 
6908 	    case 0x08: do_vec_sub_long (cpu); return;
6909 	    case 0x11: do_vec_USHL (cpu); return;
6910 	    case 0x12: do_vec_CLZ (cpu); return;
6911 	    case 0x16: do_vec_NOT (cpu); return;
6912 	    case 0x19: do_vec_max (cpu); return;
6913 	    case 0x1B: do_vec_min (cpu); return;
6914 	    case 0x21: do_vec_SUB (cpu); return;
6915 	    case 0x25: do_vec_MLS (cpu); return;
6916 	    case 0x31: do_vec_FminmaxNMP (cpu); return;
6917 	    case 0x35: do_vec_FADDP (cpu); return;
6918 	    case 0x37: do_vec_FMUL (cpu); return;
6919 	    case 0x3F: do_vec_FDIV (cpu); return;
6920 
6921 	    case 0x3E:
6922 	      switch (INSTR (20, 16))
6923 		{
6924 		case 0x00: do_vec_FNEG (cpu); return;
6925 		case 0x01: do_vec_FSQRT (cpu); return;
6926 		default:   HALT_NYI;
6927 		}
6928 
6929 	    case 0x0D:
6930 	    case 0x0F:
6931 	    case 0x22:
6932 	    case 0x23:
6933 	    case 0x26:
6934 	    case 0x2A:
6935 	    case 0x32:
6936 	    case 0x36:
6937 	    case 0x39:
6938 	    case 0x3A:
6939 	      do_vec_compare (cpu); return;
6940 
6941 	    default:
6942 	      break;
6943 	    }
6944 	}
6945 
6946       if (INSTR (31, 21) == 0x370)
6947 	{
6948 	  if (INSTR (10, 10))
6949 	    do_vec_MOV_element (cpu);
6950 	  else
6951 	    do_vec_EXT (cpu);
6952 	  return;
6953 	}
6954 
6955       switch (INSTR (21, 10))
6956 	{
6957 	case 0x82E: do_vec_neg (cpu); return;
6958 	case 0x87E: do_vec_sqrt (cpu); return;
6959 	default:
6960 	  if (INSTR (15, 10) == 0x30)
6961 	    {
6962 	      do_vec_mull (cpu);
6963 	      return;
6964 	    }
6965 	  break;
6966 	}
6967       break;
6968 
6969     case 0x2F:
6970       switch (INSTR (15, 10))
6971 	{
6972 	case 0x01: do_vec_SSHR_USHR (cpu); return;
6973 	case 0x10:
6974 	case 0x12: do_vec_mls_indexed (cpu); return;
6975 	case 0x29: do_vec_xtl (cpu); return;
6976 	default:
6977 	  HALT_NYI;
6978 	}
6979 
6980     default:
6981       break;
6982     }
6983 
6984   HALT_NYI;
6985 }
6986 
6987 /* 3 sources.  */
6988 
6989 /* Float multiply add.  */
6990 static void
6991 fmadds (sim_cpu *cpu)
6992 {
6993   unsigned sa = INSTR (14, 10);
6994   unsigned sm = INSTR (20, 16);
6995   unsigned sn = INSTR ( 9,  5);
6996   unsigned sd = INSTR ( 4,  0);
6997 
6998   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6999   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa)
7000 			+ aarch64_get_FP_float (cpu, sn)
7001 			* aarch64_get_FP_float (cpu, sm));
7002 }
7003 
7004 /* Double multiply add.  */
7005 static void
7006 fmaddd (sim_cpu *cpu)
7007 {
7008   unsigned sa = INSTR (14, 10);
7009   unsigned sm = INSTR (20, 16);
7010   unsigned sn = INSTR ( 9,  5);
7011   unsigned sd = INSTR ( 4,  0);
7012 
7013   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7014   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa)
7015 			 + aarch64_get_FP_double (cpu, sn)
7016 			 * aarch64_get_FP_double (cpu, sm));
7017 }
7018 
7019 /* Float multiply subtract.  */
7020 static void
7021 fmsubs (sim_cpu *cpu)
7022 {
7023   unsigned sa = INSTR (14, 10);
7024   unsigned sm = INSTR (20, 16);
7025   unsigned sn = INSTR ( 9,  5);
7026   unsigned sd = INSTR ( 4,  0);
7027 
7028   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7029   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa)
7030 			- aarch64_get_FP_float (cpu, sn)
7031 			* aarch64_get_FP_float (cpu, sm));
7032 }
7033 
7034 /* Double multiply subtract.  */
7035 static void
7036 fmsubd (sim_cpu *cpu)
7037 {
7038   unsigned sa = INSTR (14, 10);
7039   unsigned sm = INSTR (20, 16);
7040   unsigned sn = INSTR ( 9,  5);
7041   unsigned sd = INSTR ( 4,  0);
7042 
7043   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7044   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa)
7045 			 - aarch64_get_FP_double (cpu, sn)
7046 			 * aarch64_get_FP_double (cpu, sm));
7047 }
7048 
7049 /* Float negative multiply add.  */
7050 static void
7051 fnmadds (sim_cpu *cpu)
7052 {
7053   unsigned sa = INSTR (14, 10);
7054   unsigned sm = INSTR (20, 16);
7055   unsigned sn = INSTR ( 9,  5);
7056   unsigned sd = INSTR ( 4,  0);
7057 
7058   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7059   aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa)
7060 			+ (- aarch64_get_FP_float (cpu, sn))
7061 			* aarch64_get_FP_float (cpu, sm));
7062 }
7063 
7064 /* Double negative multiply add.  */
7065 static void
7066 fnmaddd (sim_cpu *cpu)
7067 {
7068   unsigned sa = INSTR (14, 10);
7069   unsigned sm = INSTR (20, 16);
7070   unsigned sn = INSTR ( 9,  5);
7071   unsigned sd = INSTR ( 4,  0);
7072 
7073   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7074   aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa)
7075 			 + (- aarch64_get_FP_double (cpu, sn))
7076 			 * aarch64_get_FP_double (cpu, sm));
7077 }
7078 
7079 /* Float negative multiply subtract.  */
7080 static void
7081 fnmsubs (sim_cpu *cpu)
7082 {
7083   unsigned sa = INSTR (14, 10);
7084   unsigned sm = INSTR (20, 16);
7085   unsigned sn = INSTR ( 9,  5);
7086   unsigned sd = INSTR ( 4,  0);
7087 
7088   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7089   aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa)
7090 			+ aarch64_get_FP_float (cpu, sn)
7091 			* aarch64_get_FP_float (cpu, sm));
7092 }
7093 
7094 /* Double negative multiply subtract.  */
7095 static void
7096 fnmsubd (sim_cpu *cpu)
7097 {
7098   unsigned sa = INSTR (14, 10);
7099   unsigned sm = INSTR (20, 16);
7100   unsigned sn = INSTR ( 9,  5);
7101   unsigned sd = INSTR ( 4,  0);
7102 
7103   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7104   aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa)
7105 			 + aarch64_get_FP_double (cpu, sn)
7106 			 * aarch64_get_FP_double (cpu, sm));
7107 }
7108 
7109 static void
7110 dexSimpleFPDataProc3Source (sim_cpu *cpu)
7111 {
7112   /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
7113      instr[30]    = 0
7114      instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
7115      instr[28,25] = 1111
7116      instr[24]    = 1
7117      instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
7118      instr[21]    ==> o1 : 0 ==> unnegated, 1 ==> negated
7119      instr[15]    ==> o2 : 0 ==> ADD, 1 ==> SUB  */
7120 
7121   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7122   /* dispatch on combined type:o1:o2.  */
7123   uint32_t dispatch = (INSTR (23, 21) << 1) | INSTR (15, 15);
7124 
7125   if (M_S != 0)
7126     HALT_UNALLOC;
7127 
7128   switch (dispatch)
7129     {
7130     case 0: fmadds (cpu); return;
7131     case 1: fmsubs (cpu); return;
7132     case 2: fnmadds (cpu); return;
7133     case 3: fnmsubs (cpu); return;
7134     case 4: fmaddd (cpu); return;
7135     case 5: fmsubd (cpu); return;
7136     case 6: fnmaddd (cpu); return;
7137     case 7: fnmsubd (cpu); return;
7138     default:
7139       /* type > 1 is currently unallocated.  */
7140       HALT_UNALLOC;
7141     }
7142 }
7143 
7144 static void
7145 dexSimpleFPFixedConvert (sim_cpu *cpu)
7146 {
7147   HALT_NYI;
7148 }
7149 
7150 static void
7151 dexSimpleFPCondCompare (sim_cpu *cpu)
7152 {
7153   /* instr [31,23] = 0001 1110 0
7154      instr [22]    = type
7155      instr [21]    = 1
7156      instr [20,16] = Rm
7157      instr [15,12] = condition
7158      instr [11,10] = 01
7159      instr [9,5]   = Rn
7160      instr [4]     = 0
7161      instr [3,0]   = nzcv  */
7162 
7163   unsigned rm = INSTR (20, 16);
7164   unsigned rn = INSTR (9, 5);
7165 
7166   NYI_assert (31, 23, 0x3C);
7167   NYI_assert (11, 10, 0x1);
7168   NYI_assert (4,  4,  0);
7169 
7170   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7171   if (! testConditionCode (cpu, INSTR (15, 12)))
7172     {
7173       aarch64_set_CPSR (cpu, INSTR (3, 0));
7174       return;
7175     }
7176 
7177   if (INSTR (22, 22))
7178     {
7179       /* Double precision.  */
7180       double val1 = aarch64_get_vec_double (cpu, rn, 0);
7181       double val2 = aarch64_get_vec_double (cpu, rm, 0);
7182 
7183       /* FIXME: Check for NaNs.  */
7184       if (val1 == val2)
7185 	aarch64_set_CPSR (cpu, (Z | C));
7186       else if (val1 < val2)
7187 	aarch64_set_CPSR (cpu, N);
7188       else /* val1 > val2 */
7189 	aarch64_set_CPSR (cpu, C);
7190     }
7191   else
7192     {
7193       /* Single precision.  */
7194       float val1 = aarch64_get_vec_float (cpu, rn, 0);
7195       float val2 = aarch64_get_vec_float (cpu, rm, 0);
7196 
7197       /* FIXME: Check for NaNs.  */
7198       if (val1 == val2)
7199 	aarch64_set_CPSR (cpu, (Z | C));
7200       else if (val1 < val2)
7201 	aarch64_set_CPSR (cpu, N);
7202       else /* val1 > val2 */
7203 	aarch64_set_CPSR (cpu, C);
7204     }
7205 }
7206 
7207 /* 2 sources.  */
7208 
7209 /* Float add.  */
7210 static void
7211 fadds (sim_cpu *cpu)
7212 {
7213   unsigned sm = INSTR (20, 16);
7214   unsigned sn = INSTR ( 9,  5);
7215   unsigned sd = INSTR ( 4,  0);
7216 
7217   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7218   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7219 			+ aarch64_get_FP_float (cpu, sm));
7220 }
7221 
7222 /* Double add.  */
7223 static void
7224 faddd (sim_cpu *cpu)
7225 {
7226   unsigned sm = INSTR (20, 16);
7227   unsigned sn = INSTR ( 9,  5);
7228   unsigned sd = INSTR ( 4,  0);
7229 
7230   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7231   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7232 			 + aarch64_get_FP_double (cpu, sm));
7233 }
7234 
7235 /* Float divide.  */
7236 static void
7237 fdivs (sim_cpu *cpu)
7238 {
7239   unsigned sm = INSTR (20, 16);
7240   unsigned sn = INSTR ( 9,  5);
7241   unsigned sd = INSTR ( 4,  0);
7242 
7243   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7244   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7245 			/ aarch64_get_FP_float (cpu, sm));
7246 }
7247 
7248 /* Double divide.  */
7249 static void
7250 fdivd (sim_cpu *cpu)
7251 {
7252   unsigned sm = INSTR (20, 16);
7253   unsigned sn = INSTR ( 9,  5);
7254   unsigned sd = INSTR ( 4,  0);
7255 
7256   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7257   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7258 			 / aarch64_get_FP_double (cpu, sm));
7259 }
7260 
7261 /* Float multiply.  */
7262 static void
7263 fmuls (sim_cpu *cpu)
7264 {
7265   unsigned sm = INSTR (20, 16);
7266   unsigned sn = INSTR ( 9,  5);
7267   unsigned sd = INSTR ( 4,  0);
7268 
7269   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7270   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7271 			* aarch64_get_FP_float (cpu, sm));
7272 }
7273 
7274 /* Double multiply.  */
7275 static void
7276 fmuld (sim_cpu *cpu)
7277 {
7278   unsigned sm = INSTR (20, 16);
7279   unsigned sn = INSTR ( 9,  5);
7280   unsigned sd = INSTR ( 4,  0);
7281 
7282   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7283   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7284 			 * aarch64_get_FP_double (cpu, sm));
7285 }
7286 
7287 /* Float negate and multiply.  */
7288 static void
7289 fnmuls (sim_cpu *cpu)
7290 {
7291   unsigned sm = INSTR (20, 16);
7292   unsigned sn = INSTR ( 9,  5);
7293   unsigned sd = INSTR ( 4,  0);
7294 
7295   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7296   aarch64_set_FP_float (cpu, sd, - (aarch64_get_FP_float (cpu, sn)
7297 				    * aarch64_get_FP_float (cpu, sm)));
7298 }
7299 
7300 /* Double negate and multiply.  */
7301 static void
7302 fnmuld (sim_cpu *cpu)
7303 {
7304   unsigned sm = INSTR (20, 16);
7305   unsigned sn = INSTR ( 9,  5);
7306   unsigned sd = INSTR ( 4,  0);
7307 
7308   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7309   aarch64_set_FP_double (cpu, sd, - (aarch64_get_FP_double (cpu, sn)
7310 				     * aarch64_get_FP_double (cpu, sm)));
7311 }
7312 
7313 /* Float subtract.  */
7314 static void
7315 fsubs (sim_cpu *cpu)
7316 {
7317   unsigned sm = INSTR (20, 16);
7318   unsigned sn = INSTR ( 9,  5);
7319   unsigned sd = INSTR ( 4,  0);
7320 
7321   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7322   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7323 			- aarch64_get_FP_float (cpu, sm));
7324 }
7325 
7326 /* Double subtract.  */
7327 static void
7328 fsubd (sim_cpu *cpu)
7329 {
7330   unsigned sm = INSTR (20, 16);
7331   unsigned sn = INSTR ( 9,  5);
7332   unsigned sd = INSTR ( 4,  0);
7333 
7334   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7335   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7336 			 - aarch64_get_FP_double (cpu, sm));
7337 }
7338 
7339 static void
7340 do_FMINNM (sim_cpu *cpu)
7341 {
7342   /* instr[31,23] = 0 0011 1100
7343      instr[22]    = float(0)/double(1)
7344      instr[21]    = 1
7345      instr[20,16] = Sm
7346      instr[15,10] = 01 1110
7347      instr[9,5]   = Sn
7348      instr[4,0]   = Sd  */
7349 
7350   unsigned sm = INSTR (20, 16);
7351   unsigned sn = INSTR ( 9,  5);
7352   unsigned sd = INSTR ( 4,  0);
7353 
7354   NYI_assert (31, 23, 0x03C);
7355   NYI_assert (15, 10, 0x1E);
7356 
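  /* fminnm/dminnm are assumed to implement the IEEE 754 minNum
     operation: when exactly one operand is a quiet NaN the other
     operand is returned.  */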
7357   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7358   if (INSTR (22, 22))
7359     aarch64_set_FP_double (cpu, sd,
7360 			   dminnm (aarch64_get_FP_double (cpu, sn),
7361 				   aarch64_get_FP_double (cpu, sm)));
7362   else
7363     aarch64_set_FP_float (cpu, sd,
7364 			  fminnm (aarch64_get_FP_float (cpu, sn),
7365 				  aarch64_get_FP_float (cpu, sm)));
7366 }
7367 
7368 static void
7369 do_FMAXNM (sim_cpu *cpu)
7370 {
7371   /* instr[31,23] = 0 0011 1100
7372      instr[22]    = float(0)/double(1)
7373      instr[21]    = 1
7374      instr[20,16] = Sm
7375      instr[15,10] = 01 1010
7376      instr[9,5]   = Sn
7377      instr[4,0]   = Sd  */
7378 
7379   unsigned sm = INSTR (20, 16);
7380   unsigned sn = INSTR ( 9,  5);
7381   unsigned sd = INSTR ( 4,  0);
7382 
7383   NYI_assert (31, 23, 0x03C);
7384   NYI_assert (15, 10, 0x1A);
7385 
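  /* fmaxnm/dmaxnm are assumed to implement the IEEE 754 maxNum
     operation, preferring a number over a quiet NaN.  */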
7386   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7387   if (INSTR (22, 22))
7388     aarch64_set_FP_double (cpu, sd,
7389 			   dmaxnm (aarch64_get_FP_double (cpu, sn),
7390 				   aarch64_get_FP_double (cpu, sm)));
7391   else
7392     aarch64_set_FP_float (cpu, sd,
7393 			  fmaxnm (aarch64_get_FP_float (cpu, sn),
7394 				  aarch64_get_FP_float (cpu, sm)));
7395 }
7396 
7397 static void
7398 dexSimpleFPDataProc2Source (sim_cpu *cpu)
7399 {
7400   /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
7401      instr[30]    = 0
7402      instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
7403      instr[28,25] = 1111
7404      instr[24]    = 0
7405      instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
7406      instr[21]    = 1
7407      instr[20,16] = Vm
7408      instr[15,12] ==> opcode : 0000 ==> FMUL, 0001 ==> FDIV
7409                                0010 ==> FADD, 0011 ==> FSUB,
7410                                0100 ==> FMAX, 0101 ==> FMIN
7411                                0110 ==> FMAXNM, 0111 ==> FMINNM
7412                                1000 ==> FNMUL, ow ==> UNALLOC
7413      instr[11,10] = 10
7414      instr[9,5]   = Vn
7415      instr[4,0]   = Vd  */
7416 
7417   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7418   uint32_t type = INSTR (23, 22);
7419   /* Dispatch on opcode.  */
7420   uint32_t dispatch = INSTR (15, 12);
7421 
7422   if (type > 1)
7423     HALT_UNALLOC;
7424 
7425   if (M_S != 0)
7426     HALT_UNALLOC;
7427 
7428   if (type)
7429     switch (dispatch)
7430       {
7431       case 0: fmuld (cpu); return;
7432       case 1: fdivd (cpu); return;
7433       case 2: faddd (cpu); return;
7434       case 3: fsubd (cpu); return;
7435       case 6: do_FMAXNM (cpu); return;
7436       case 7: do_FMINNM (cpu); return;
7437       case 8: fnmuld (cpu); return;
7438 
7439 	/* Have not yet implemented fmax and fmin.  */
7440       case 4:
7441       case 5:
7442 	HALT_NYI;
7443 
7444       default:
7445 	HALT_UNALLOC;
7446       }
7447   else /* type == 0 => floats.  */
7448     switch (dispatch)
7449       {
7450       case 0: fmuls (cpu); return;
7451       case 1: fdivs (cpu); return;
7452       case 2: fadds (cpu); return;
7453       case 3: fsubs (cpu); return;
7454       case 6: do_FMAXNM (cpu); return;
7455       case 7: do_FMINNM (cpu); return;
7456       case 8: fnmuls (cpu); return;
7457 
7458       case 4:
7459       case 5:
7460 	HALT_NYI;
7461 
7462       default:
7463 	HALT_UNALLOC;
7464       }
7465 }
7466 
7467 static void
7468 dexSimpleFPCondSelect (sim_cpu *cpu)
7469 {
7470   /* FCSEL
7471      instr[31,23] = 0 0011 1100
7472      instr[22]    = 0=>single 1=>double
7473      instr[21]    = 1
7474      instr[20,16] = Sm
7475      instr[15,12] = cond
7476      instr[11,10] = 11
7477      instr[9,5]   = Sn
7478      instr[4,0]   = Sd  */
7479   unsigned sm = INSTR (20, 16);
7480   unsigned sn = INSTR ( 9, 5);
7481   unsigned sd = INSTR ( 4, 0);
7482   uint32_t set = testConditionCode (cpu, INSTR (15, 12));
7483 
7484   NYI_assert (31, 23, 0x03C);
7485   NYI_assert (11, 10, 0x3);
7486 
7487   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7488   if (INSTR (22, 22))
7489     aarch64_set_FP_double (cpu, sd, set ? aarch64_get_FP_double (cpu, sn)
				   : aarch64_get_FP_double (cpu, sm));
7490   else
7491     aarch64_set_FP_float (cpu, sd, set ? aarch64_get_FP_float (cpu, sn)
				  : aarch64_get_FP_float (cpu, sm));
7492 }
7493 
7494 /* Store 32 bit unscaled signed 9 bit.  */
7495 static void
7496 fsturs (sim_cpu *cpu, int32_t offset)
7497 {
7498   unsigned int rn = INSTR (9, 5);
7499   unsigned int st = INSTR (4, 0);
7500 
7501   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7502   aarch64_set_mem_u32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
7503 		       aarch64_get_vec_u32 (cpu, st, 0));
7504 }
7505 
7506 /* Store 64 bit unscaled signed 9 bit.  */
7507 static void
7508 fsturd (sim_cpu *cpu, int32_t offset)
7509 {
7510   unsigned int rn = INSTR (9, 5);
7511   unsigned int st = INSTR (4, 0);
7512 
7513   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7514   aarch64_set_mem_u64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
7515 		       aarch64_get_vec_u64 (cpu, st, 0));
7516 }
7517 
7518 /* Store 128 bit unscaled signed 9 bit.  */
7519 static void
7520 fsturq (sim_cpu *cpu, int32_t offset)
7521 {
7522   unsigned int rn = INSTR (9, 5);
7523   unsigned int st = INSTR (4, 0);
7524   FRegister a;
7525 
7526   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7527   aarch64_get_FP_long_double (cpu, st, & a);
7528   aarch64_set_mem_long_double (cpu,
7529 			       aarch64_get_reg_u64 (cpu, rn, SP_OK)
7530 			       + offset, a);
7531 }
7532 
7533 /* TODO FP move register.  */
7534 
7535 /* 32 bit fp to fp move register.  */
7536 static void
7537 ffmovs (sim_cpu *cpu)
7538 {
7539   unsigned int rn = INSTR (9, 5);
7540   unsigned int st = INSTR (4, 0);
7541 
7542   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7543   aarch64_set_FP_float (cpu, st, aarch64_get_FP_float (cpu, rn));
7544 }
7545 
7546 /* 64 bit fp to fp move register.  */
7547 static void
7548 ffmovd (sim_cpu *cpu)
7549 {
7550   unsigned int rn = INSTR (9, 5);
7551   unsigned int st = INSTR (4, 0);
7552 
7553   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7554   aarch64_set_FP_double (cpu, st, aarch64_get_FP_double (cpu, rn));
7555 }
7556 
7557 /* 32 bit GReg to Vec move register.  */
7558 static void
7559 fgmovs (sim_cpu *cpu)
7560 {
7561   unsigned int rn = INSTR (9, 5);
7562   unsigned int st = INSTR (4, 0);
7563 
7564   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7565   aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_reg_u32 (cpu, rn, NO_SP));
7566 }
7567 
7568 /* 64 bit g to fp move register.  */
7569 static void
7570 fgmovd (sim_cpu *cpu)
7571 {
7572   unsigned int rn = INSTR (9, 5);
7573   unsigned int st = INSTR (4, 0);
7574 
7575   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7576   aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_reg_u64 (cpu, rn, NO_SP));
7577 }
7578 
7579 /* 32 bit fp to g move register.  */
7580 static void
7581 gfmovs (sim_cpu *cpu)
7582 {
7583   unsigned int rn = INSTR (9, 5);
7584   unsigned int st = INSTR (4, 0);
7585 
7586   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7587   aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u32 (cpu, rn, 0));
7588 }
7589 
7590 /* 64 bit fp to g move register.  */
7591 static void
7592 gfmovd (sim_cpu *cpu)
7593 {
7594   unsigned int rn = INSTR (9, 5);
7595   unsigned int st = INSTR (4, 0);
7596 
7597   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7598   aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u64 (cpu, rn, 0));
7599 }
7600 
7601 /* FP move immediate
7602 
7603    These install an immediate 8 bit value in the target register
7604    where the 8 bits comprise 1 sign bit, a 3 bit exponent and 4
7605    bits of fraction.  */
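
/* For example (a property of the ARM imm8 encoding, not of this
   simulator): imm8 == 0x70 expands to 1.0, 0xF0 to -1.0 and 0x60 to
   0.5; the expansion itself is performed by
   fp_immediate_for_encoding_32/64.  */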
7606 
7607 static void
7608 fmovs (sim_cpu *cpu)
7609 {
7610   unsigned int sd = INSTR (4, 0);
7611   uint32_t imm = INSTR (20, 13);
7612   float f = fp_immediate_for_encoding_32 (imm);
7613 
7614   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7615   aarch64_set_FP_float (cpu, sd, f);
7616 }
7617 
7618 static void
7619 fmovd (sim_cpu *cpu)
7620 {
7621   unsigned int sd = INSTR (4, 0);
7622   uint32_t imm = INSTR (20, 13);
7623   double d = fp_immediate_for_encoding_64 (imm);
7624 
7625   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7626   aarch64_set_FP_double (cpu, sd, d);
7627 }
7628 
7629 static void
7630 dexSimpleFPImmediate (sim_cpu *cpu)
7631 {
7632   /* instr[31,23] == 00111100
7633      instr[22]    == type : single(0)/double(1)
7634      instr[21]    == 1
7635      instr[20,13] == imm8
7636      instr[12,10] == 100
7637      instr[9,5]   == imm5 : 00000 ==> OK, ow ==> UNALLOC
7638      instr[4,0]   == Rd  */
7639   uint32_t imm5 = INSTR (9, 5);
7640 
7641   NYI_assert (31, 23, 0x3C);
7642 
7643   if (imm5 != 0)
7644     HALT_UNALLOC;
7645 
7646   if (INSTR (22, 22))
7647     fmovd (cpu);
7648   else
7649     fmovs (cpu);
7650 }
7651 
7652 /* TODO specific decode and execute for group Load Store.  */
7653 
7654 /* TODO FP load/store single register (unscaled offset).  */
7655 
7656 /* TODO load 8 bit unscaled signed 9 bit.  */
7657 /* TODO load 16 bit unscaled signed 9 bit.  */
7658 
7659 /* Load 32 bit unscaled signed 9 bit.  */
7660 static void
7661 fldurs (sim_cpu *cpu, int32_t offset)
7662 {
7663   unsigned int rn = INSTR (9, 5);
7664   unsigned int st = INSTR (4, 0);
7665 
7666   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7667   aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
7668 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
7669 }
7670 
7671 /* Load 64 bit unscaled signed 9 bit.  */
7672 static void
7673 fldurd (sim_cpu *cpu, int32_t offset)
7674 {
7675   unsigned int rn = INSTR (9, 5);
7676   unsigned int st = INSTR (4, 0);
7677 
7678   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7679   aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64
7680 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
7681 }
7682 
7683 /* Load 128 bit unscaled signed 9 bit.  */
7684 static void
7685 fldurq (sim_cpu *cpu, int32_t offset)
7686 {
7687   unsigned int rn = INSTR (9, 5);
7688   unsigned int st = INSTR (4, 0);
7689   FRegister a;
7690   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
7691 
7692   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7693   aarch64_get_mem_long_double (cpu, addr, & a);
7694   aarch64_set_FP_long_double (cpu, st, a);
7695 }
7696 
7697 /* TODO store 8 bit unscaled signed 9 bit.  */
7698 /* TODO store 16 bit unscaled signed 9 bit.  */
7699 
7700 
7701 /* 1 source.  */
7702 
7703 /* Float absolute value.  */
7704 static void
7705 fabss (sim_cpu *cpu)
7706 {
7707   unsigned sn = INSTR (9, 5);
7708   unsigned sd = INSTR (4, 0);
7709   float value = aarch64_get_FP_float (cpu, sn);
7710 
7711   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7712   aarch64_set_FP_float (cpu, sd, fabsf (value));
7713 }
7714 
7715 /* Double absolute value.  */
7716 static void
7717 fabsd (sim_cpu *cpu)
7718 {
7719   unsigned sn = INSTR (9, 5);
7720   unsigned sd = INSTR (4, 0);
7721   double value = aarch64_get_FP_double (cpu, sn);
7722 
7723   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7724   aarch64_set_FP_double (cpu, sd, fabs (value));
7725 }
7726 
7727 /* Float negative value.  */
7728 static void
7729 fnegs (sim_cpu *cpu)
7730 {
7731   unsigned sn = INSTR (9, 5);
7732   unsigned sd = INSTR (4, 0);
7733 
7734   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7735   aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sn));
7736 }
7737 
7738 /* Double negative value.  */
7739 static void
7740 fnegd (sim_cpu *cpu)
7741 {
7742   unsigned sn = INSTR (9, 5);
7743   unsigned sd = INSTR (4, 0);
7744 
7745   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7746   aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sn));
7747 }
7748 
7749 /* Float square root.  */
7750 static void
7751 fsqrts (sim_cpu *cpu)
7752 {
7753   unsigned sn = INSTR (9, 5);
7754   unsigned sd = INSTR (4, 0);
7755 
7756   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7757   aarch64_set_FP_float (cpu, sd, sqrtf (aarch64_get_FP_float (cpu, sn)));
7758 }
7759 
7760 /* Double square root.  */
7761 static void
7762 fsqrtd (sim_cpu *cpu)
7763 {
7764   unsigned sn = INSTR (9, 5);
7765   unsigned sd = INSTR (4, 0);
7766 
7767   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7768   aarch64_set_FP_double (cpu, sd,
7769 			 sqrt (aarch64_get_FP_double (cpu, sn)));
7770 }
7771 
7772 /* Convert double to float.  */
7773 static void
7774 fcvtds (sim_cpu *cpu)
7775 {
7776   unsigned sn = INSTR (9, 5);
7777   unsigned sd = INSTR (4, 0);
7778 
7779   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7780   aarch64_set_FP_float (cpu, sd, (float) aarch64_get_FP_double (cpu, sn));
7781 }
7782 
7783 /* Convert float to double.  */
7784 static void
7785 fcvtsd (sim_cpu *cpu)
7786 {
7787   unsigned sn = INSTR (9, 5);
7788   unsigned sd = INSTR (4, 0);
7789 
7790   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7791   aarch64_set_FP_double (cpu, sd, (double) aarch64_get_FP_float (cpu, sn));
7792 }
7793 
7794 static void
7795 do_FRINT (sim_cpu *cpu)
7796 {
7797   /* instr[31,23] = 0001 1110 0
7798      instr[22]    = single(0)/double(1)
7799      instr[21,18] = 1001
7800      instr[17,15] = rounding mode
7801      instr[14,10] = 10000
7802      instr[9,5]   = source
7803      instr[4,0]   = dest  */
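
  /* Rounding mode values as dispatched below: 0 ==> N (to nearest,
     ties to even), 1 ==> P (towards +inf), 2 ==> M (towards -inf),
     3 ==> Z (towards zero), 4 ==> A (ties away from zero),
     6/7 ==> X/I (dynamic mode, remapped below).  */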
7804 
7805   float val;
7806   unsigned rs = INSTR (9, 5);
7807   unsigned rd = INSTR (4, 0);
7808   unsigned int rmode = INSTR (17, 15);
7809 
7810   NYI_assert (31, 23, 0x03C);
7811   NYI_assert (21, 18, 0x9);
7812   NYI_assert (14, 10, 0x10);
7813 
7814   if (rmode == 6 || rmode == 7)
7815     /* FIXME: Add support for rmode == 6 exactness check.
	Architecturally the dynamic rounding mode lives in FPCR.RMode
	(bits [23,22]); this simulator reads the field from its FPSR
	value.  */
7816     rmode = uimm (aarch64_get_FPSR (cpu), 23, 22);
7817 
7818   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7819   if (INSTR (22, 22))
7820     {
7821       double val = aarch64_get_FP_double (cpu, rs);
7822 
7823       switch (rmode)
7824 	{
7825 	case 0: /* mode N: nearest, ties to even.  */
7826 	  {
7827 	    double rval = round (val);
7828 
7829 	    /* round () resolves ties away from zero; pull exact
7830 	       halfway values back to even.  N.B. (rval / 2.0) * 2.0
7831 	       is always exact in binary floating point, so fmod is
7832 	       used to test for an odd result.  */
7833 	    if (fabs (val - trunc (val)) == 0.5 && fmod (rval, 2.0) != 0.0)
7834 	      rval = (rval > val) ? rval - 1.0 : rval + 1.0;
7835 	    aarch64_set_FP_double (cpu, rd, rval);
7836 	    return;
7837 	  }
7838 
7839 	case 1: /* mode P: towards +inf.  */
7840 	  aarch64_set_FP_double (cpu, rd, ceil (val));
7841 	  return;
7842 
7843 	case 2: /* mode M: towards -inf.  */
7844 	  aarch64_set_FP_double (cpu, rd, floor (val));
7845 	  return;
7852 
7853 	case 3: /* mode Z: towards 0.  */
7854 	  aarch64_set_FP_double (cpu, rd, trunc (val));
7855 	  return;
7856 
7857 	case 4: /* mode A: away from 0.  */
7858 	  aarch64_set_FP_double (cpu, rd, round (val));
7859 	  return;
7860 
7861 	case 6: /* mode X: use FPCR with exactness check.  */
7862 	case 7: /* mode I: use FPCR mode.  */
7863 	  HALT_NYI;
7864 
7865 	default:
7866 	  HALT_UNALLOC;
7867 	}
7868     }
7869 
7870   val = aarch64_get_FP_float (cpu, rs);
7871 
7872   switch (rmode)
7873     {
7874     case 0: /* mode N: nearest, ties to even.  */
7875       {
7876 	float rval = roundf (val);
7877 
7878 	/* roundf () resolves ties away from zero; pull exact halfway
7879 	   values back to even, as in the double case above.  */
7880 	if (fabsf (val - truncf (val)) == 0.5f
7881 	    && fmodf (rval, 2.0f) != 0.0f)
7882 	  rval = (rval > val) ? rval - 1.0f : rval + 1.0f;
7883 	aarch64_set_FP_float (cpu, rd, rval);
7884 	return;
7885       }
7887 
7888     case 1: /* mode P: towards +inf.  */
7889       aarch64_set_FP_float (cpu, rd, ceilf (val));
7890       return;
7891 
7892     case 2: /* mode M: towards -inf.  */
7893       aarch64_set_FP_float (cpu, rd, floorf (val));
7894       return;
7901 
7902     case 3: /* mode Z: towards 0.  */
7903       aarch64_set_FP_float (cpu, rd, truncf (val));
7904       return;
7905 
7906     case 4: /* mode A: away from 0.  */
7907       aarch64_set_FP_float (cpu, rd, roundf (val));
7908       return;
7909 
7910     case 6: /* mode X: use FPCR with exactness check.  */
7911     case 7: /* mode I: use FPCR mode.  */
7912       HALT_NYI;
7913 
7914     default:
7915       HALT_UNALLOC;
7916     }
7917 }
7918 
7919 /* Convert half to float.  */
7920 static void
7921 do_FCVT_half_to_single (sim_cpu *cpu)
7922 {
7923   unsigned rn = INSTR (9, 5);
7924   unsigned rd = INSTR (4, 0);
7925 
7926   NYI_assert (31, 10, 0x7B890);
7927 
7928   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7929   aarch64_set_FP_float (cpu, rd, (float) aarch64_get_FP_half  (cpu, rn));
7930 }
7931 
7932 /* Convert half to double.  */
7933 static void
7934 do_FCVT_half_to_double (sim_cpu *cpu)
7935 {
7936   unsigned rn = INSTR (9, 5);
7937   unsigned rd = INSTR (4, 0);
7938 
7939   NYI_assert (31, 10, 0x7B8B0);
7940 
7941   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7942   aarch64_set_FP_double (cpu, rd, (double) aarch64_get_FP_half  (cpu, rn));
7943 }
7944 
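/* Convert single to half.  */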
7945 static void
7946 do_FCVT_single_to_half (sim_cpu *cpu)
7947 {
7948   unsigned rn = INSTR (9, 5);
7949   unsigned rd = INSTR (4, 0);
7950 
7951   NYI_assert (31, 10, 0x788F0);
7952 
7953   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7954   aarch64_set_FP_half (cpu, rd, aarch64_get_FP_float  (cpu, rn));
7955 }
7956 
7957 /* Convert double to half.  */
7958 static void
7959 do_FCVT_double_to_half (sim_cpu *cpu)
7960 {
7961   unsigned rn = INSTR (9, 5);
7962   unsigned rd = INSTR (4, 0);
7963 
7964   NYI_assert (31, 10, 0x798F0);
7965 
7966   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7967   aarch64_set_FP_half (cpu, rd, (float) aarch64_get_FP_double  (cpu, rn));
7968 }
7969 
7970 static void
7971 dexSimpleFPDataProc1Source (sim_cpu *cpu)
7972 {
7973   /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
7974      instr[30]    = 0
7975      instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
7976      instr[28,25] = 1111
7977      instr[24]    = 0
7978      instr[23,22] ==> type : 00 ==> source is single,
7979                              01 ==> source is double
7980                              10 ==> UNALLOC
7981                              11 ==> UNALLOC or source is half
7982      instr[21]    = 1
7983      instr[20,15] ==> opcode : with type 00 or 01
7984                                000000 ==> FMOV, 000001 ==> FABS,
7985                                000010 ==> FNEG, 000011 ==> FSQRT,
7986                                000100 ==> UNALLOC, 000101 ==> FCVT (to single/double)
7987                                000110 ==> UNALLOC, 000111 ==> FCVT (to half)
7988                                001000 ==> FRINTN, 001001 ==> FRINTP,
7989                                001010 ==> FRINTM, 001011 ==> FRINTZ,
7990                                001100 ==> FRINTA, 001101 ==> UNALLOC
7991                                001110 ==> FRINTX, 001111 ==> FRINTI
7992                                with type 11
7993                                000100 ==> FCVT (half-to-single)
7994                                000101 ==> FCVT (half-to-double)
7995 			       instr[14,10] = 10000.  */
7996 
7997   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7998   uint32_t type   = INSTR (23, 22);
7999   uint32_t opcode = INSTR (20, 15);
8000 
8001   if (M_S != 0)
8002     HALT_UNALLOC;
8003 
8004   if (type == 3)
8005     {
8006       if (opcode == 4)
8007 	do_FCVT_half_to_single (cpu);
8008       else if (opcode == 5)
8009 	do_FCVT_half_to_double (cpu);
8010       else
8011 	HALT_UNALLOC;
8012       return;
8013     }
8014 
8015   if (type == 2)
8016     HALT_UNALLOC;
8017 
8018   switch (opcode)
8019     {
8020     case 0:
8021       if (type)
8022 	ffmovd (cpu);
8023       else
8024 	ffmovs (cpu);
8025       return;
8026 
8027     case 1:
8028       if (type)
8029 	fabsd (cpu);
8030       else
8031 	fabss (cpu);
8032       return;
8033 
8034     case 2:
8035       if (type)
8036 	fnegd (cpu);
8037       else
8038 	fnegs (cpu);
8039       return;
8040 
8041     case 3:
8042       if (type)
8043 	fsqrtd (cpu);
8044       else
8045 	fsqrts (cpu);
8046       return;
8047 
8048     case 4:
8049       if (type)
8050 	fcvtds (cpu);
8051       else
8052 	HALT_UNALLOC;
8053       return;
8054 
8055     case 5:
8056       if (type)
8057 	HALT_UNALLOC;
8058       fcvtsd (cpu);
8059       return;
8060 
8061     case 8:		/* FRINTN etc.  */
8062     case 9:
8063     case 10:
8064     case 11:
8065     case 12:
8066     case 14:
8067     case 15:
8068        do_FRINT (cpu);
8069        return;
8070 
8071     case 7:
8072       if (INSTR (22, 22))
8073 	do_FCVT_double_to_half (cpu);
8074       else
8075 	do_FCVT_single_to_half (cpu);
8076       return;
8077 
8078     case 13:
8079       HALT_NYI;
8080 
8081     default:
8082       HALT_UNALLOC;
8083     }
8084 }
8085 
8086 /* 32 bit signed int to float.  */
8087 static void
8088 scvtf32 (sim_cpu *cpu)
8089 {
8090   unsigned rn = INSTR (9, 5);
8091   unsigned sd = INSTR (4, 0);
8092 
8093   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8094   aarch64_set_FP_float
8095     (cpu, sd, (float) aarch64_get_reg_s32 (cpu, rn, NO_SP));
8096 }
8097 
8098 /* 64 bit signed int to float.  */
8099 static void
8100 scvtf (sim_cpu *cpu)
8101 {
8102   unsigned rn = INSTR (9, 5);
8103   unsigned sd = INSTR (4, 0);
8104 
8105   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8106   aarch64_set_FP_float
8107     (cpu, sd, (float) aarch64_get_reg_s64 (cpu, rn, NO_SP));
8108 }
8109 
8110 /* 32 bit signed int to double.  */
8111 static void
8112 scvtd32 (sim_cpu *cpu)
8113 {
8114   unsigned rn = INSTR (9, 5);
8115   unsigned sd = INSTR (4, 0);
8116 
8117   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8118   aarch64_set_FP_double
8119     (cpu, sd, (double) aarch64_get_reg_s32 (cpu, rn, NO_SP));
8120 }
8121 
8122 /* 64 bit signed int to double.  */
8123 static void
8124 scvtd (sim_cpu *cpu)
8125 {
8126   unsigned rn = INSTR (9, 5);
8127   unsigned sd = INSTR (4, 0);
8128 
8129   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8130   aarch64_set_FP_double
8131     (cpu, sd, (double) aarch64_get_reg_s64 (cpu, rn, NO_SP));
8132 }
8133 
8134 static const float  FLOAT_INT_MAX   = (float)  INT_MAX;
8135 static const float  FLOAT_INT_MIN   = (float)  INT_MIN;
8136 static const double DOUBLE_INT_MAX  = (double) INT_MAX;
8137 static const double DOUBLE_INT_MIN  = (double) INT_MIN;
8138 static const float  FLOAT_LONG_MAX  = (float)  LONG_MAX;
8139 static const float  FLOAT_LONG_MIN  = (float)  LONG_MIN;
8140 static const double DOUBLE_LONG_MAX = (double) LONG_MAX;
8141 static const double DOUBLE_LONG_MIN = (double) LONG_MIN;
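
/* N.B. the float-typed limits are inexact: (float) INT_MAX rounds up
   to 2147483648.0f, so the >= / <= range checks in RAISE_EXCEPTIONS
   below also catch values just beyond the representable integer
   range.  */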
8142 
8143 /* Check for FP exception conditions:
8144      NaN raises IO
8145      Infinity raises IO
8146      Out of Range raises IO and IX and saturates value
8147      Denormal raises ID and IX and sets to zero.  */
8148 #define RAISE_EXCEPTIONS(F, VALUE, FTYPE, ITYPE)	\
8149   do							\
8150     {							\
8151       switch (fpclassify (F))				\
8152 	{						\
8153 	case FP_INFINITE:				\
8154 	case FP_NAN:					\
8155 	  aarch64_set_FPSR (cpu, IO);			\
8156 	  if (signbit (F))				\
8157 	    VALUE = ITYPE##_MIN;			\
8158 	  else						\
8159 	    VALUE = ITYPE##_MAX;			\
8160 	  break;					\
8161 							\
8162 	case FP_NORMAL:					\
8163 	  if (F >= FTYPE##_##ITYPE##_MAX)		\
8164 	    {						\
8165 	      aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX);	\
8166 	      VALUE = ITYPE##_MAX;			\
8167 	    }						\
8168 	  else if (F <= FTYPE##_##ITYPE##_MIN)		\
8169 	    {						\
8170 	      aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX);	\
8171 	      VALUE = ITYPE##_MIN;			\
8172 	    }						\
8173 	  break;					\
8174 							\
8175 	case FP_SUBNORMAL:				\
8176 	  aarch64_set_FPSR_bits (cpu, IO | IX | ID, IX | ID);	\
8177 	  VALUE = 0;					\
8178 	  break;					\
8179 							\
8180 	default:					\
8181 	case FP_ZERO:					\
8182 	  VALUE = 0;					\
8183 	  break;					\
8184 	}						\
8185     }							\
8186   while (0)
8187 
8188 /* 32 bit convert float to signed int truncate towards zero.  */
8189 static void
8190 fcvtszs32 (sim_cpu *cpu)
8191 {
8192   unsigned sn = INSTR (9, 5);
8193   unsigned rd = INSTR (4, 0);
8194   /* C integer conversion truncates towards zero, as required here.  */
8195   float   f = aarch64_get_FP_float (cpu, sn);
8196   int32_t value = (int32_t) f;
8197 
8198   RAISE_EXCEPTIONS (f, value, FLOAT, INT);
8199 
8200   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8201   /* Avoid sign extension to 64 bit.  */
8202   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
8203 }
8204 
8205 /* 64 bit convert float to signed int truncate towards zero.  */
8206 static void
8207 fcvtszs (sim_cpu *cpu)
8208 {
8209   unsigned sn = INSTR (9, 5);
8210   unsigned rd = INSTR (4, 0);
8211   float f = aarch64_get_FP_float (cpu, sn);
8212   int64_t value = (int64_t) f;
8213 
8214   RAISE_EXCEPTIONS (f, value, FLOAT, LONG);
8215 
8216   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8217   aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
8218 }
8219 
8220 /* 32 bit convert double to signed int truncate towards zero.  */
8221 static void
8222 fcvtszd32 (sim_cpu *cpu)
8223 {
8224   unsigned sn = INSTR (9, 5);
8225   unsigned rd = INSTR (4, 0);
8226   /* C integer conversion truncates towards zero, as required here.  */
8227   double   d = aarch64_get_FP_double (cpu, sn);
8228   int32_t  value = (int32_t) d;
8229 
8230   RAISE_EXCEPTIONS (d, value, DOUBLE, INT);
8231 
8232   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8233   /* Avoid sign extension to 64 bit.  */
8234   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
8235 }
8236 
8237 /* 64 bit convert double to signed int truncate towards zero.  */
8238 static void
8239 fcvtszd (sim_cpu *cpu)
8240 {
8241   unsigned sn = INSTR (9, 5);
8242   unsigned rd = INSTR (4, 0);
8243   /* C integer conversion truncates towards zero, as required here.  */
8244   double  d = aarch64_get_FP_double (cpu, sn);
8245   int64_t value;
8246 
8247   value = (int64_t) d;
8248 
8249   RAISE_EXCEPTIONS (d, value, DOUBLE, LONG);
8250 
8251   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8252   aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
8253 }
8254 
8255 static void
8256 do_fcvtzu (sim_cpu *cpu)
8257 {
8258   /* instr[31]    = size: 32-bit (0), 64-bit (1)
8259      instr[30,23] = 00111100
8260      instr[22]    = type: single (0)/ double (1)
8261      instr[21]    = 0 ==> fixed-point (uses precision field), 1 ==> integer
8262      instr[20,16] = 11001
8263      instr[15,10] = precision
8264      instr[9,5]   = Rs
8265      instr[4,0]   = Rd.  */
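
  /* N.B. RAISE_EXCEPTIONS only provides signed (INT/LONG) saturation
     limits; the unsigned conversions below reuse them, and the
     explicit checks against 1 << 63 / 1 << 31 exempt results at the
     signed boundary from being clamped.  */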
8266 
8267   unsigned rs = INSTR (9, 5);
8268   unsigned rd = INSTR (4, 0);
8269 
8270   NYI_assert (30, 23, 0x3C);
8271   NYI_assert (20, 16, 0x19);
8272 
8273   if (INSTR (21, 21) != 1)
8274     /* Convert to fixed point.  */
8275     HALT_NYI;
8276 
8277   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8278   if (INSTR (31, 31))
8279     {
8280       /* Convert to unsigned 64-bit integer.  */
8281       if (INSTR (22, 22))
8282 	{
8283 	  double  d = aarch64_get_FP_double (cpu, rs);
8284 	  uint64_t value = (uint64_t) d;
8285 
8286 	  /* Do not raise an exception if we have reached ULONG_MAX.  */
8287 	  if (value != ((uint64_t) 1 << 63))
8288 	    RAISE_EXCEPTIONS (d, value, DOUBLE, LONG);
8289 
8290 	  aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8291 	}
8292       else
8293 	{
8294 	  float  f = aarch64_get_FP_float (cpu, rs);
8295 	  uint64_t value = (uint64_t) f;
8296 
8297 	  /* Do not raise an exception if we have reached ULONG_MAX.  */
8298 	  if (value != ((uint64_t) 1 << 63))
8299 	    RAISE_EXCEPTIONS (f, value, FLOAT, LONG);
8300 
8301 	  aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8302 	}
8303     }
8304   else
8305     {
8306       uint32_t value;
8307 
8308       /* Convert to unsigned 32-bit integer.  */
8309       if (INSTR (22, 22))
8310 	{
8311 	  double  d = aarch64_get_FP_double (cpu, rs);
8312 
8313 	  value = (uint32_t) d;
8314 	  /* Do not raise an exception if we have reached UINT_MAX.  */
8315 	  if (value != (1UL << 31))
8316 	    RAISE_EXCEPTIONS (d, value, DOUBLE, INT);
8317 	}
8318       else
8319 	{
8320 	  float  f = aarch64_get_FP_float (cpu, rs);
8321 
8322 	  value = (uint32_t) f;
8323 	  /* Do not raise an exception if we have reached UINT_MAX.  */
8324 	  if (value != (1UL << 31))
8325 	    RAISE_EXCEPTIONS (f, value, FLOAT, INT);
8326 	}
8327 
8328       aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8329     }
8330 }
8331 
8332 static void
8333 do_UCVTF (sim_cpu *cpu)
8334 {
8335   /* instr[31]    = size: 32-bit (0), 64-bit (1)
8336      instr[30,23] = 001 1110 0
8337      instr[22]    = type: single (0)/ double (1)
8338      instr[21]    = 0 ==> fixed-point (uses precision field), 1 ==> integer
8339      instr[20,16] = 0 0011
8340      instr[15,10] = precision
8341      instr[9,5]   = Rs
8342      instr[4,0]   = Rd.  */
8343 
8344   unsigned rs = INSTR (9, 5);
8345   unsigned rd = INSTR (4, 0);
8346 
8347   NYI_assert (30, 23, 0x3C);
8348   NYI_assert (20, 16, 0x03);
8349 
8350   if (INSTR (21, 21) != 1)
8351     HALT_NYI;
8352 
8353   /* FIXME: Add exception raising.  */
8354   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8355   if (INSTR (31, 31))
8356     {
8357       uint64_t value = aarch64_get_reg_u64 (cpu, rs, NO_SP);
8358 
8359       if (INSTR (22, 22))
8360 	aarch64_set_FP_double (cpu, rd, (double) value);
8361       else
8362 	aarch64_set_FP_float (cpu, rd, (float) value);
8363     }
8364   else
8365     {
8366       uint32_t value =  aarch64_get_reg_u32 (cpu, rs, NO_SP);
8367 
8368       if (INSTR (22, 22))
8369 	aarch64_set_FP_double (cpu, rd, (double) value);
8370       else
8371 	aarch64_set_FP_float (cpu, rd, (float) value);
8372     }
8373 }
8374 
8375 static void
8376 float_vector_move (sim_cpu *cpu)
8377 {
8378   /* instr[31,17] == 100 1111 0101 0111
8379      instr[16]    ==> direction 0=> to GR, 1=> from GR
8380      instr[15,10] = 0 ==> OK, ow ==> UNALLOC
8381      instr[9,5]   ==> source
8382      instr[4,0]   ==> dest.  */
8383 
8384   unsigned rn = INSTR (9, 5);
8385   unsigned rd = INSTR (4, 0);
8386 
8387   NYI_assert (31, 17, 0x4F57);
8388 
8389   if (INSTR (15, 10) != 0)
8390     HALT_UNALLOC;
8391 
8392   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8393   if (INSTR (16, 16))
8394     aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_reg_u64 (cpu, rn, NO_SP));
8395   else
8396     aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, rn, 1));
8397 }
8398 
8399 static void
8400 dexSimpleFPIntegerConvert (sim_cpu *cpu)
8401 {
8402   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
8403      instr[30]    = 0
8404      instr[29]    = S :  0 ==> OK, 1 ==> UNALLOC
8405      instr[28,25] = 1111
8406      instr[24]    = 0
8407      instr[23,22] = type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
8408      instr[21]    = 1
8409      instr[20,19] = rmode
8410      instr[18,16] = opcode
8411      instr[15,10] = 10 0000  */
8412 
8413   uint32_t rmode_opcode;
8414   uint32_t size_type;
8415   uint32_t type;
8416   uint32_t size;
8417   uint32_t S;
8418 
8419   if (INSTR (31, 17) == 0x4F57)
8420     {
8421       float_vector_move (cpu);
8422       return;
8423     }
8424 
8425   size = INSTR (31, 31);
8426   S = INSTR (29, 29);
8427   if (S != 0)
8428     HALT_UNALLOC;
8429 
8430   type = INSTR (23, 22);
8431   if (type > 1)
8432     HALT_UNALLOC;
8433 
8434   rmode_opcode = INSTR (20, 16);
8435   size_type = (size << 1) | type; /* 0==32f, 1==32d, 2==64f, 3==64d.  */
8436 
8437   switch (rmode_opcode)
8438     {
8439     case 2:			/* SCVTF.  */
8440       switch (size_type)
8441 	{
8442 	case 0: scvtf32 (cpu); return;
8443 	case 1: scvtd32 (cpu); return;
8444 	case 2: scvtf (cpu); return;
8445 	case 3: scvtd (cpu); return;
8446 	}
8447 
8448     case 6:			/* FMOV GR, Vec.  */
8449       switch (size_type)
8450 	{
8451 	case 0:  gfmovs (cpu); return;
8452 	case 3:  gfmovd (cpu); return;
8453 	default: HALT_UNALLOC;
8454 	}
8455 
8456     case 7:			/* FMOV vec, GR.  */
8457       switch (size_type)
8458 	{
8459 	case 0:  fgmovs (cpu); return;
8460 	case 3:  fgmovd (cpu); return;
8461 	default: HALT_UNALLOC;
8462 	}
8463 
8464     case 24:			/* FCVTZS.  */
8465       switch (size_type)
8466 	{
8467 	case 0: fcvtszs32 (cpu); return;
8468 	case 1: fcvtszd32 (cpu); return;
8469 	case 2: fcvtszs (cpu); return;
8470 	case 3: fcvtszd (cpu); return;
8471 	}
8472 
8473     case 25: do_fcvtzu (cpu); return;
8474     case 3:  do_UCVTF (cpu); return;
8475 
8476     case 0:	/* FCVTNS.  */
8477     case 1:	/* FCVTNU.  */
8478     case 4:	/* FCVTAS.  */
8479     case 5:	/* FCVTAU.  */
8480     case 8:	/* FCVTPS.  */
8481     case 9:	/* FCVTPU.  */
8482     case 16:	/* FCVTMS.  */
8483     case 17:	/* FCVTMU.  */
8484     default:
8485       HALT_NYI;
8486     }
8487 }
8488 
8489 static void
8490 set_flags_for_float_compare (sim_cpu *cpu, float fvalue1, float fvalue2)
8491 {
8492   uint32_t flags;
8493 
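  /* NZCV after an FP compare: equal ==> Z|C, less than ==> N,
     greater than ==> C, unordered ==> C|V.  */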
8494   if (isnan (fvalue1) || isnan (fvalue2))
8495     flags = C|V;
8496   else
8497     {
8498       /* Compare directly rather than via subtraction: the
8499 	 difference of two equal infinities is a NaN.  */
8500       if (fvalue1 == fvalue2)
8501 	flags = Z|C;
8502       else if (fvalue1 < fvalue2)
8503 	flags = N;
8504       else /* fvalue1 > fvalue2.  */
8505 	flags = C;
8506     }
8507 
8508   aarch64_set_CPSR (cpu, flags);
8509 }
8510 
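/* Float compare -- Invalid Operation exception only on signaling NaNs.  */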
8511 static void
8512 fcmps (sim_cpu *cpu)
8513 {
8514   unsigned sm = INSTR (20, 16);
8515   unsigned sn = INSTR ( 9,  5);
8516 
8517   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8518   float fvalue2 = aarch64_get_FP_float (cpu, sm);
8519 
8520   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8521   set_flags_for_float_compare (cpu, fvalue1, fvalue2);
8522 }
8523 
8524 /* Float compare to zero -- Invalid Operation exception
8525    only on signaling NaNs.  */
8526 static void
8527 fcmpzs (sim_cpu *cpu)
8528 {
8529   unsigned sn = INSTR ( 9,  5);
8530   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8531 
8532   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8533   set_flags_for_float_compare (cpu, fvalue1, 0.0f);
8534 }
8535 
8536 /* Float compare -- Invalid Operation exception on all NaNs.  */
8537 static void
8538 fcmpes (sim_cpu *cpu)
8539 {
8540   unsigned sm = INSTR (20, 16);
8541   unsigned sn = INSTR ( 9,  5);
8542 
8543   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8544   float fvalue2 = aarch64_get_FP_float (cpu, sm);
8545 
8546   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8547   set_flags_for_float_compare (cpu, fvalue1, fvalue2);
8548 }
8549 
8550 /* Float compare to zero -- Invalid Operation exception on all NaNs.  */
8551 static void
8552 fcmpzes (sim_cpu *cpu)
8553 {
8554   unsigned sn = INSTR ( 9,  5);
8555   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8556 
8557   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8558   set_flags_for_float_compare (cpu, fvalue1, 0.0f);
8559 }
8560 
8561 static void
8562 set_flags_for_double_compare (sim_cpu *cpu, double dval1, double dval2)
8563 {
8564   uint32_t flags;
8565 
8566   if (isnan (dval1) || isnan (dval2))
8567     flags = C|V;
8568   else
8569     {
8570       /* Compare directly rather than via subtraction: the
8571 	 difference of two equal infinities is a NaN.  */
8572       if (dval1 == dval2)
8573 	flags = Z|C;
8574       else if (dval1 < dval2)
8575 	flags = N;
8576       else /* dval1 > dval2.  */
8577 	flags = C;
8578     }
8579 
8580   aarch64_set_CPSR (cpu, flags);
8581 }
8582 
8583 /* Double compare -- Invalid Operation exception only on signaling NaNs.  */
8584 static void
8585 fcmpd (sim_cpu *cpu)
8586 {
8587   unsigned sm = INSTR (20, 16);
8588   unsigned sn = INSTR ( 9,  5);
8589 
8590   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8591   double dvalue2 = aarch64_get_FP_double (cpu, sm);
8592 
8593   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8594   set_flags_for_double_compare (cpu, dvalue1, dvalue2);
8595 }
8596 
8597 /* Double compare to zero -- Invalid Operation exception
8598    only on signaling NaNs.  */
8599 static void
8600 fcmpzd (sim_cpu *cpu)
8601 {
8602   unsigned sn = INSTR ( 9,  5);
8603   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8604 
8605   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8606   set_flags_for_double_compare (cpu, dvalue1, 0.0);
8607 }
8608 
8609 /* Double compare -- Invalid Operation exception on all NaNs.  */
8610 static void
8611 fcmped (sim_cpu *cpu)
8612 {
8613   unsigned sm = INSTR (20, 16);
8614   unsigned sn = INSTR ( 9,  5);
8615 
8616   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8617   double dvalue2 = aarch64_get_FP_double (cpu, sm);
8618 
8619   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8620   set_flags_for_double_compare (cpu, dvalue1, dvalue2);
8621 }
8622 
8623 /* Double compare to zero -- Invalid Operation exception on all NaNs.  */
8624 static void
8625 fcmpzed (sim_cpu *cpu)
8626 {
8627   unsigned sn = INSTR ( 9,  5);
8628   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8629 
8630   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8631   set_flags_for_double_compare (cpu, dvalue1, 0.0);
8632 }
8633 
8634 static void
8635 dexSimpleFPCompare (sim_cpu *cpu)
8636 {
8637   /* assert instr[28,25] == 1111
8638      instr[30] = 0, instr[24] = 0, instr[21] = 1, instr[13,10] = 1000
8639      instr[31] = M : 0 ==> OK, 1 ==> UNALLOC
8640      instr[29] ==> S :  0 ==> OK, 1 ==> UNALLOC
8641      instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
8642      instr[15,14] ==> op : 00 ==> OK, ow ==> UNALLOC
8643      instr[4,0] ==> opcode2 : 00000 ==> FCMP, 10000 ==> FCMPE,
8644                               01000 ==> FCMPZ, 11000 ==> FCMPEZ,
8645                               ow ==> UNALLOC  */
8646   uint32_t dispatch;
8647   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
8648   uint32_t type = INSTR (23, 22);
8649   uint32_t op = INSTR (15, 14);
8650   uint32_t op2_2_0 = INSTR (2, 0);
8651 
8652   if (op2_2_0 != 0)
8653     HALT_UNALLOC;
8654 
8655   if (M_S != 0)
8656     HALT_UNALLOC;
8657 
8658   if (type > 1)
8659     HALT_UNALLOC;
8660 
8661   if (op != 0)
8662     HALT_UNALLOC;
8663 
8664   /* dispatch on type and top 2 bits of opcode.  */
8665   dispatch = (type << 2) | INSTR (4, 3);
8666 
8667   switch (dispatch)
8668     {
8669     case 0: fcmps (cpu); return;
8670     case 1: fcmpzs (cpu); return;
8671     case 2: fcmpes (cpu); return;
8672     case 3: fcmpzes (cpu); return;
8673     case 4: fcmpd (cpu); return;
8674     case 5: fcmpzd (cpu); return;
8675     case 6: fcmped (cpu); return;
8676     case 7: fcmpzed (cpu); return;
8677     }
8678 }
8679 
8680 static void
8681 do_scalar_FADDP (sim_cpu *cpu)
8682 {
8683   /* instr [31,23] = 0111 1110 0
8684      instr [22]    = single(0)/double(1)
8685      instr [21,10] = 11 0000 1101 10
8686      instr [9,5]   = Fn
8687      instr [4,0]   = Fd.  */
8688 
8689   unsigned Fn = INSTR (9, 5);
8690   unsigned Fd = INSTR (4, 0);
8691 
8692   NYI_assert (31, 23, 0x0FC);
8693   NYI_assert (21, 10, 0xC36);
8694 
8695   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8696   if (INSTR (22, 22))
8697     {
8698       double val1 = aarch64_get_vec_double (cpu, Fn, 0);
8699       double val2 = aarch64_get_vec_double (cpu, Fn, 1);
8700 
8701       aarch64_set_FP_double (cpu, Fd, val1 + val2);
8702     }
8703   else
8704     {
8705       float val1 = aarch64_get_vec_float (cpu, Fn, 0);
8706       float val2 = aarch64_get_vec_float (cpu, Fn, 1);
8707 
8708       aarch64_set_FP_float (cpu, Fd, val1 + val2);
8709     }
8710 }
8711 
8712 /* Floating point absolute difference.  */
8713 
8714 static void
8715 do_scalar_FABD (sim_cpu *cpu)
8716 {
8717   /* instr [31,23] = 0111 1110 1
8718      instr [22]    = float(0)/double(1)
8719      instr [21]    = 1
8720      instr [20,16] = Rm
8721      instr [15,10] = 1101 01
8722      instr [9, 5]  = Rn
8723      instr [4, 0]  = Rd.  */
8724 
8725   unsigned rm = INSTR (20, 16);
8726   unsigned rn = INSTR (9, 5);
8727   unsigned rd = INSTR (4, 0);
8728 
8729   NYI_assert (31, 23, 0x0FD);
8730   NYI_assert (21, 21, 1);
8731   NYI_assert (15, 10, 0x35);
8732 
8733   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8734   if (INSTR (22, 22))
8735     aarch64_set_FP_double (cpu, rd,
8736 			   fabs (aarch64_get_FP_double (cpu, rn)
8737 				 - aarch64_get_FP_double (cpu, rm)));
8738   else
8739     aarch64_set_FP_float (cpu, rd,
8740 			  fabsf (aarch64_get_FP_float (cpu, rn)
8741 				 - aarch64_get_FP_float (cpu, rm)));
8742 }
8743 
8744 static void
8745 do_scalar_CMGT (sim_cpu *cpu)
8746 {
8747   /* instr [31,21] = 0101 1110 111
8748      instr [20,16] = Rm
8749      instr [15,10] = 00 1101
8750      instr [9, 5]  = Rn
8751      instr [4, 0]  = Rd.  */
8752 
8753   unsigned rm = INSTR (20, 16);
8754   unsigned rn = INSTR (9, 5);
8755   unsigned rd = INSTR (4, 0);
8756 
8757   NYI_assert (31, 21, 0x2F7);
8758   NYI_assert (15, 10, 0x0D);
8759 
8760   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8761   aarch64_set_vec_u64 (cpu, rd, 0,
8762 		       aarch64_get_vec_s64 (cpu, rn, 0) >
8763 		       aarch64_get_vec_s64 (cpu, rm, 0) ? -1L : 0L);
8764 }
8765 
8766 static void
8767 do_scalar_USHR (sim_cpu *cpu)
8768 {
8769   /* instr [31,23] = 0111 1111 0
8770      instr [22,16] = shift amount
8771      instr [15,10] = 0000 01
8772      instr [9, 5]  = Rn
8773      instr [4, 0]  = Rd.  */
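
  /* For the 64-bit scalar form, immh:immb (bits [22,16]) encodes the
     shift amount as 128 - immh:immb; e.g. immh:immb == 0x7F gives a
     shift of 1.  */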
8774 
8775   unsigned amount = 128 - INSTR (22, 16);
8776   unsigned rn = INSTR (9, 5);
8777   unsigned rd = INSTR (4, 0);
8778 
8779   NYI_assert (31, 23, 0x0FE);
8780   NYI_assert (15, 10, 0x01);
8781 
8782   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8783   aarch64_set_vec_u64 (cpu, rd, 0,
8784 		       aarch64_get_vec_u64 (cpu, rn, 0) >> amount);
8785 }
8786 
8787 static void
8788 do_scalar_SSHL (sim_cpu *cpu)
8789 {
8790   /* instr [31,21] = 0101 1110 111
8791      instr [20,16] = Rm
8792      instr [15,10] = 0100 01
8793      instr [9, 5]  = Rn
8794      instr [4, 0]  = Rd.  */
8795 
8796   unsigned rm = INSTR (20, 16);
8797   unsigned rn = INSTR (9, 5);
8798   unsigned rd = INSTR (4, 0);
8799   signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);
8800 
8801   NYI_assert (31, 21, 0x2F7);
8802   NYI_assert (15, 10, 0x11);
8803 
8804   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8805   if (shift >= 0)
8806     aarch64_set_vec_s64 (cpu, rd, 0,
8807 			 aarch64_get_vec_s64 (cpu, rn, 0) << shift);
8808   else
8809     aarch64_set_vec_s64 (cpu, rd, 0,
8810 			 aarch64_get_vec_s64 (cpu, rn, 0) >> - shift);
8811 }
8812 
8813 static void
8814 do_scalar_shift (sim_cpu *cpu)
8815 {
8816   /* instr [31,23] = 0101 1111 0
8817      instr [22,16] = shift amount
8818      instr [15,10] = 0101 01   [SHL]
8819      instr [15,10] = 0000 01   [SSHR]
8820      instr [9, 5]  = Rn
8821      instr [4, 0]  = Rd.  */
8822 
8823   unsigned rn = INSTR (9, 5);
8824   unsigned rd = INSTR (4, 0);
8825   unsigned amount;
8826 
8827   NYI_assert (31, 23, 0x0BE);
8828 
8829   if (INSTR (22, 22) == 0)
8830     HALT_UNALLOC;
8831 
8832   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8833   switch (INSTR (15, 10))
8834     {
8835     case 0x01: /* SSHR */
8836       amount = 128 - INSTR (22, 16);
8837       aarch64_set_vec_s64 (cpu, rd, 0,
8838 			   aarch64_get_vec_s64 (cpu, rn, 0) >> amount);
8839       return;
8840     case 0x15: /* SHL */
8841       amount = INSTR (22, 16) - 64;
8842       aarch64_set_vec_u64 (cpu, rd, 0,
8843 			   aarch64_get_vec_u64 (cpu, rn, 0) << amount);
8844       return;
8845     default:
8846       HALT_NYI;
8847     }
8848 }
8849 
8850 /* FCMEQ FCMGT FCMGE.  */
8851 static void
8852 do_scalar_FCM (sim_cpu *cpu)
8853 {
8854   /* instr [31,30] = 01
8855      instr [29]    = U
8856      instr [28,24] = 1 1110
8857      instr [23]    = E
8858      instr [22]    = size
8859      instr [21]    = 1
8860      instr [20,16] = Rm
8861      instr [15,12] = 1110
8862      instr [11]    = AC
8863      instr [10]    = 1
8864      instr [9, 5]  = Rn
8865      instr [4, 0]  = Rd.  */
8866 
8867   unsigned rm = INSTR (20, 16);
8868   unsigned rn = INSTR (9, 5);
8869   unsigned rd = INSTR (4, 0);
8870   unsigned EUac = (INSTR (23, 23) << 2) | (INSTR (29, 29) << 1) | INSTR (11, 11);
8871   unsigned result;
8872   float val1;
8873   float val2;
8874 
8875   NYI_assert (31, 30, 1);
8876   NYI_assert (28, 24, 0x1E);
8877   NYI_assert (21, 21, 1);
8878   NYI_assert (15, 12, 0xE);
8879   NYI_assert (10, 10, 1);
8880 
8881   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8882   if (INSTR (22, 22))
8883     {
8884       double val1 = aarch64_get_FP_double (cpu, rn);
8885       double val2 = aarch64_get_FP_double (cpu, rm);
8886 
8887       switch (EUac)
8888 	{
8889 	case 0: /* 000 */
8890 	  result = val1 == val2;
8891 	  break;
8892 
8893 	case 3: /* 011 */
8894 	  val1 = fabs (val1);
8895 	  val2 = fabs (val2);
8896 	  /* Fall through. */
8897 	case 2: /* 010 */
8898 	  result = val1 >= val2;
8899 	  break;
8900 
8901 	case 7: /* 111 */
8902 	  val1 = fabs (val1);
8903 	  val2 = fabs (val2);
8904 	  /* Fall through. */
8905 	case 6: /* 110 */
8906 	  result = val1 > val2;
8907 	  break;
8908 
8909 	default:
8910 	  HALT_UNALLOC;
8911 	}
8912 
8913       aarch64_set_vec_u64 (cpu, rd, 0, result ? -1 : 0);
8914       return;
8915     }
8916 
8917   val1 = aarch64_get_FP_float (cpu, rn);
8918   val2 = aarch64_get_FP_float (cpu, rm);
8919 
8920   switch (EUac)
8921     {
8922     case 0: /* 000 */
8923       result = val1 == val2;
8924       break;
8925 
8926     case 3: /* 011 */
8927       val1 = fabsf (val1);
8928       val2 = fabsf (val2);
8929       /* Fall through. */
8930     case 2: /* 010 */
8931       result = val1 >= val2;
8932       break;
8933 
8934     case 7: /* 111 */
8935       val1 = fabsf (val1);
8936       val2 = fabsf (val2);
8937       /* Fall through. */
8938     case 6: /* 110 */
8939       result = val1 > val2;
8940       break;
8941 
8942     default:
8943       HALT_UNALLOC;
8944     }
8945 
8946   aarch64_set_vec_u32 (cpu, rd, 0, result ? -1 : 0);
8947 }
8948 
8949 /* An alias of DUP.  */
8950 static void
8951 do_scalar_MOV (sim_cpu *cpu)
8952 {
8953   /* instr [31,21] = 0101 1110 000
8954      instr [20,16] = imm5
8955      instr [15,10] = 0000 01
8956      instr [9, 5]  = Rn
8957      instr [4, 0]  = Rd.  */
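
  /* The lowest set bit of imm5 selects the element size and the bits
     above it the element index: imm5 = xxxx1 ==> byte with index
     imm5<4:1>, imm5 = xxx10 ==> halfword with index imm5<4:2>, and so
     on up to imm5 = x1000 ==> doubleword with index imm5<4>.  */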
8958 
8959   unsigned rn = INSTR (9, 5);
8960   unsigned rd = INSTR (4, 0);
8961   unsigned index;
8962 
8963   NYI_assert (31, 21, 0x2F0);
8964   NYI_assert (15, 10, 0x01);
8965 
8966   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8967   if (INSTR (16, 16))
8968     {
8969       /* 8-bit.  */
8970       index = INSTR (20, 17);
8971       aarch64_set_vec_u8
8972 	(cpu, rd, 0, aarch64_get_vec_u8 (cpu, rn, index));
8973     }
8974   else if (INSTR (17, 17))
8975     {
8976       /* 16-bit.  */
8977       index = INSTR (20, 18);
8978       aarch64_set_vec_u16
8979 	(cpu, rd, 0, aarch64_get_vec_u16 (cpu, rn, index));
8980     }
8981   else if (INSTR (18, 18))
8982     {
8983       /* 32-bit.  */
8984       index = INSTR (20, 19);
8985       aarch64_set_vec_u32
8986 	(cpu, rd, 0, aarch64_get_vec_u32 (cpu, rn, index));
8987     }
8988   else if (INSTR (19, 19))
8989     {
8990       /* 64-bit.  */
8991       index = INSTR (20, 20);
8992       aarch64_set_vec_u64
8993 	(cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, index));
8994     }
8995   else
8996     HALT_UNALLOC;
8997 }
8998 
8999 static void
9000 do_scalar_NEG (sim_cpu *cpu)
9001 {
9002   /* instr [31,10] = 0111 1110 1110 0000 1011 10
9003      instr [9, 5]  = Rn
9004      instr [4, 0]  = Rd.  */
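
  /* Unsigned negation is defined modulo 2^64 in C, which matches the
     two's complement semantics of NEG.  */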
9005 
9006   unsigned rn = INSTR (9, 5);
9007   unsigned rd = INSTR (4, 0);
9008 
9009   NYI_assert (31, 10, 0x1FB82E);
9010 
9011   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9012   aarch64_set_vec_u64 (cpu, rd, 0, - aarch64_get_vec_u64 (cpu, rn, 0));
9013 }
9014 
9015 static void
9016 do_scalar_USHL (sim_cpu *cpu)
9017 {
9018   /* instr [31,21] = 0111 1110 111
9019      instr [20,16] = Rm
9020      instr [15,10] = 0100 01
9021      instr [9, 5]  = Rn
9022      instr [4, 0]  = Rd.  */
9023 
9024   unsigned rm = INSTR (20, 16);
9025   unsigned rn = INSTR (9, 5);
9026   unsigned rd = INSTR (4, 0);
9027   signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);
9028 
9029   NYI_assert (31, 21, 0x3F7);
9030   NYI_assert (15, 10, 0x11);
9031 
9032   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9033   if (shift >= 0)
9034     aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) << shift);
9035   else
9036     aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) >> - shift);
9037 }
9038 
9039 static void
9040 do_double_add (sim_cpu *cpu)
9041 {
9042   /* instr [31,21] = 0101 1110 111
9043      instr [20,16] = Fn
9044      instr [15,10] = 1000 01
9045      instr [9,5]   = Fm
9046      instr [4,0]   = Fd.  */
9047   unsigned Fd;
9048   unsigned Fm;
9049   unsigned Fn;
9050   double val1;
9051   double val2;
9052 
9053   NYI_assert (31, 21, 0x2F7);
9054   NYI_assert (15, 10, 0x21);
9055 
9056   Fd = INSTR (4, 0);
9057   Fm = INSTR (9, 5);
9058   Fn = INSTR (20, 16);
9059 
9060   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9061   val1 = aarch64_get_FP_double (cpu, Fm);
9062   val2 = aarch64_get_FP_double (cpu, Fn);
9063 
9064   aarch64_set_FP_double (cpu, Fd, val1 + val2);
9065 }
9066 
9067 static void
9068 do_scalar_UCVTF (sim_cpu *cpu)
9069 {
9070   /* instr [31,23] = 0111 1110 0
9071      instr [22]    = single(0)/double(1)
9072      instr [21,10] = 10 0001 1101 10
9073      instr [9,5]   = rn
9074      instr [4,0]   = rd.  */
9075 
9076   unsigned rn = INSTR (9, 5);
9077   unsigned rd = INSTR (4, 0);
9078 
9079   NYI_assert (31, 23, 0x0FC);
9080   NYI_assert (21, 10, 0x876);
9081 
9082   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9083   if (INSTR (22, 22))
9084     {
9085       uint64_t val = aarch64_get_vec_u64 (cpu, rn, 0);
9086 
9087       aarch64_set_vec_double (cpu, rd, 0, (double) val);
9088     }
9089   else
9090     {
9091       uint32_t val = aarch64_get_vec_u32 (cpu, rn, 0);
9092 
9093       aarch64_set_vec_float (cpu, rd, 0, (float) val);
9094     }
9095 }
9096 
9097 static void
9098 do_scalar_vec (sim_cpu *cpu)
9099 {
9100   /* instr [30] = 1.  */
9101   /* instr [28,25] = 1111.  */
9102   switch (INSTR (31, 23))
9103     {
9104     case 0xBC:
9105       switch (INSTR (15, 10))
9106 	{
9107 	case 0x01: do_scalar_MOV (cpu); return;
9108 	case 0x39: do_scalar_FCM (cpu); return;
9109 	case 0x3B: do_scalar_FCM (cpu); return;
9110 	}
9111       break;
9112 
9113     case 0xBE: do_scalar_shift (cpu); return;
9114 
9115     case 0xFC:
9116       switch (INSTR (15, 10))
9117 	{
9118 	case 0x36:
9119 	  switch (INSTR (21, 16))
9120 	    {
9121 	    case 0x30: do_scalar_FADDP (cpu); return;
9122 	    case 0x21: do_scalar_UCVTF (cpu); return;
9123 	    }
9124 	  HALT_NYI;
9125 	case 0x39: do_scalar_FCM (cpu); return;
9126 	case 0x3B: do_scalar_FCM (cpu); return;
9127 	}
9128       break;
9129 
9130     case 0xFD:
9131       switch (INSTR (15, 10))
9132 	{
9133 	case 0x0D: do_scalar_CMGT (cpu); return;
9134 	case 0x11: do_scalar_USHL (cpu); return;
9135 	case 0x2E: do_scalar_NEG (cpu); return;
9136 	case 0x35: do_scalar_FABD (cpu); return;
9137 	case 0x39: do_scalar_FCM (cpu); return;
9138 	case 0x3B: do_scalar_FCM (cpu); return;
9139 	default:
9140 	  HALT_NYI;
9141 	}
9142 
9143     case 0xFE: do_scalar_USHR (cpu); return;
9144 
9145     case 0xBD:
9146       switch (INSTR (15, 10))
9147 	{
9148 	case 0x21: do_double_add (cpu); return;
9149 	case 0x11: do_scalar_SSHL (cpu); return;
9150 	default:
9151 	  HALT_NYI;
9152 	}
9153 
9154     default:
9155       HALT_NYI;
9156     }

  /* The breaks above are only reached for unhandled encodings.  */
  HALT_NYI;
9157 }
9158 
9159 static void
9160 dexAdvSIMD1 (sim_cpu *cpu)
9161 {
9162   /* instr [28,25] = 1 111.  */
9163 
9164   /* Basic scalar FP instructions all have bit 30 == 0; anything
9165      with bit 30 set is dispatched to the scalar/vector decoder.  */
9166   if (INSTR (30, 30))
9167     do_scalar_vec (cpu);
9168 
9169   /* instr[24] is set for FP data processing 3-source and clear for
9170      all other basic scalar fp instruction groups.  */
9171   else if (INSTR (24, 24))
9172     dexSimpleFPDataProc3Source (cpu);
9173 
9174   /* instr[21] is clear for floating <-> fixed conversions and set for
9175      all other basic scalar fp instruction groups.  */
9176   else if (!INSTR (21, 21))
9177     dexSimpleFPFixedConvert (cpu);
9178 
9179   /* instr[11,10] : 01 ==> cond compare, 10 ==> Data Proc 2 Source
9180      11 ==> cond select,  00 ==> other.  */
9181   else
9182     switch (INSTR (11, 10))
9183       {
9184       case 1: dexSimpleFPCondCompare (cpu); return;
9185       case 2: dexSimpleFPDataProc2Source (cpu); return;
9186       case 3: dexSimpleFPCondSelect (cpu); return;
9187 
9188       default:
9189 	/* Now an ordered cascade of tests.
9190 	   FP immediate has instr [12] == 1.
9191 	   FP compare has   instr [13] == 1.
9192 	   FP Data Proc 1 Source has instr [14] == 1.
9193 	   FP floating <--> integer conversions has instr [15] == 0.  */
9194 	if (INSTR (12, 12))
9195 	  dexSimpleFPImmediate (cpu);
9196 
9197 	else if (INSTR (13, 13))
9198 	  dexSimpleFPCompare (cpu);
9199 
9200 	else if (INSTR (14, 14))
9201 	  dexSimpleFPDataProc1Source (cpu);
9202 
9203 	else if (!INSTR (15, 15))
9204 	  dexSimpleFPIntegerConvert (cpu);
9205 
9206 	else
9207 	  /* If we get here then instr[15] == 1 which means UNALLOC.  */
9208 	  HALT_UNALLOC;
9209       }
9210 }
9211 
9212 /* PC relative addressing.  */
9213 
9214 static void
9215 pcadr (sim_cpu *cpu)
9216 {
9217   /* instr[31] = op : 0 ==> ADR, 1 ==> ADRP
9218      instr[30,29] = immlo
9219      instr[23,5] = immhi, instr[4,0] = Rd.  */
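
  /* The 21 bit immediate is immhi:immlo, sign extended: ADR computes
     PC + simm21, while ADRP computes (PC & ~0xfff) + (simm21 << 12).  */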
9220   uint64_t address;
9221   unsigned rd = INSTR (4, 0);
9222   uint32_t isPage = INSTR (31, 31);
9223   union { uint64_t u64; int64_t s64; } imm;
9224   uint64_t offset;
9225 
9226   imm.s64 = simm64 (aarch64_get_instr (cpu), 23, 5);
9227   offset = imm.u64;
9228   offset = (offset << 2) | INSTR (30, 29);
9229 
9230   address = aarch64_get_PC (cpu);
9231 
9232   if (isPage)
9233     {
9234       offset <<= 12;
9235       address &= ~0xfff;
9236     }
9237 
9238   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9239   aarch64_set_reg_u64 (cpu, rd, NO_SP, address + offset);
9240 }
9241 
9242 /* Specific decode and execute for group Data Processing Immediate.  */
9243 
9244 static void
9245 dexPCRelAddressing (sim_cpu *cpu)
9246 {
9247   /* assert instr[28,24] = 10000.  */
9248   pcadr (cpu);
9249 }
9250 
9251 /* Immediate logical.
9252    The bimm32/64 argument is constructed by replicating a 2, 4, 8,
9253    16, 32 or 64 bit sequence pulled out at decode and possibly
9254    inverting it.
9255 
9256    N.B. the output register (dest) can normally be Xn or SP; the
9257    exception is the flag setting instructions, which may only use
9258    Xn for the output (dest).  The input register can never be SP.  */
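
/* For example (an illustrative decode, not code from this file):
   N == 0, imms == 0b111100, immr == 0 selects 2 bit elements with a
   single low bit set, i.e. the pattern 01 replicated, which expands
   to the 32 bit immediate 0x55555555.  */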
9260 
9261 /* 32 bit and immediate.  */
9262 static void
9263 and32 (sim_cpu *cpu, uint32_t bimm)
9264 {
9265   unsigned rn = INSTR (9, 5);
9266   unsigned rd = INSTR (4, 0);
9267 
9268   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9269   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9270 		       aarch64_get_reg_u32 (cpu, rn, NO_SP) & bimm);
9271 }
9272 
9273 /* 64 bit and immediate.  */
9274 static void
9275 and64 (sim_cpu *cpu, uint64_t bimm)
9276 {
9277   unsigned rn = INSTR (9, 5);
9278   unsigned rd = INSTR (4, 0);
9279 
9280   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9281   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9282 		       aarch64_get_reg_u64 (cpu, rn, NO_SP) & bimm);
9283 }
9284 
9285 /* 32 bit and immediate set flags.  */
9286 static void
9287 ands32 (sim_cpu *cpu, uint32_t bimm)
9288 {
9289   unsigned rn = INSTR (9, 5);
9290   unsigned rd = INSTR (4, 0);
9291 
9292   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9293   uint32_t value2 = bimm;
9294 
9295   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9296   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9297   set_flags_for_binop32 (cpu, value1 & value2);
9298 }
9299 
9300 /* 64 bit and immediate set flags.  */
9301 static void
9302 ands64 (sim_cpu *cpu, uint64_t bimm)
9303 {
9304   unsigned rn = INSTR (9, 5);
9305   unsigned rd = INSTR (4, 0);
9306 
9307   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9308   uint64_t value2 = bimm;
9309 
9310   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9311   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9312   set_flags_for_binop64 (cpu, value1 & value2);
9313 }
9314 
9315 /* 32 bit exclusive or immediate.  */
9316 static void
9317 eor32 (sim_cpu *cpu, uint32_t bimm)
9318 {
9319   unsigned rn = INSTR (9, 5);
9320   unsigned rd = INSTR (4, 0);
9321 
9322   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9323   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9324 		       aarch64_get_reg_u32 (cpu, rn, NO_SP) ^ bimm);
9325 }
9326 
9327 /* 64 bit exclusive or immediate.  */
9328 static void
9329 eor64 (sim_cpu *cpu, uint64_t bimm)
9330 {
9331   unsigned rn = INSTR (9, 5);
9332   unsigned rd = INSTR (4, 0);
9333 
9334   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9335   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9336 		       aarch64_get_reg_u64 (cpu, rn, NO_SP) ^ bimm);
9337 }
9338 
9339 /* 32 bit or immediate.  */
9340 static void
9341 orr32 (sim_cpu *cpu, uint32_t bimm)
9342 {
9343   unsigned rn = INSTR (9, 5);
9344   unsigned rd = INSTR (4, 0);
9345 
9346   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9347   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9348 		       aarch64_get_reg_u32 (cpu, rn, NO_SP) | bimm);
9349 }
9350 
9351 /* 64 bit or immediate.  */
9352 static void
9353 orr64 (sim_cpu *cpu, uint64_t bimm)
9354 {
9355   unsigned rn = INSTR (9, 5);
9356   unsigned rd = INSTR (4, 0);
9357 
9358   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9359   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9360 		       aarch64_get_reg_u64 (cpu, rn, NO_SP) | bimm);
9361 }
9362 
9363 /* Logical shifted register.
9364    These allow an optional LSL, ASR, LSR or ROR to the second source
9365    register with a count up to the register bit count.
9366    N.B. register args may not be SP.  */
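
/* For example, AND w0, w1, w2, LSR #4 computes w0 = w1 & (w2 >> 4);
   the shifted32/shifted64 helpers apply the Shift argument to the
   second source operand.  */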
9367 
9368 /* 32 bit AND shifted register.  */
9369 static void
9370 and32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9371 {
9372   unsigned rm = INSTR (20, 16);
9373   unsigned rn = INSTR (9, 5);
9374   unsigned rd = INSTR (4, 0);
9375 
9376   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9377   aarch64_set_reg_u64
9378     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9379      & shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9380 }
9381 
9382 /* 64 bit AND shifted register.  */
9383 static void
9384 and64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9385 {
9386   unsigned rm = INSTR (20, 16);
9387   unsigned rn = INSTR (9, 5);
9388   unsigned rd = INSTR (4, 0);
9389 
9390   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9391   aarch64_set_reg_u64
9392     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9393      & shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9394 }
9395 
9396 /* 32 bit AND shifted register setting flags.  */
9397 static void
9398 ands32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9399 {
9400   unsigned rm = INSTR (20, 16);
9401   unsigned rn = INSTR (9, 5);
9402   unsigned rd = INSTR (4, 0);
9403 
9404   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9405   uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
9406 			       shift, count);
9407 
9408   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9409   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9410   set_flags_for_binop32 (cpu, value1 & value2);
9411 }
9412 
9413 /* 64 bit AND shifted register setting flags.  */
9414 static void
9415 ands64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9416 {
9417   unsigned rm = INSTR (20, 16);
9418   unsigned rn = INSTR (9, 5);
9419   unsigned rd = INSTR (4, 0);
9420 
9421   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9422   uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
9423 			       shift, count);
9424 
9425   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9426   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9427   set_flags_for_binop64 (cpu, value1 & value2);
9428 }
9429 
9430 /* 32 bit BIC shifted register.  */
9431 static void
9432 bic32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9433 {
9434   unsigned rm = INSTR (20, 16);
9435   unsigned rn = INSTR (9, 5);
9436   unsigned rd = INSTR (4, 0);
9437 
9438   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9439   aarch64_set_reg_u64
9440     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9441      & ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9442 }
9443 
9444 /* 64 bit BIC shifted register.  */
9445 static void
9446 bic64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9447 {
9448   unsigned rm = INSTR (20, 16);
9449   unsigned rn = INSTR (9, 5);
9450   unsigned rd = INSTR (4, 0);
9451 
9452   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9453   aarch64_set_reg_u64
9454     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9455      & ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9456 }
9457 
9458 /* 32 bit BIC shifted register setting flags.  */
9459 static void
9460 bics32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9461 {
9462   unsigned rm = INSTR (20, 16);
9463   unsigned rn = INSTR (9, 5);
9464   unsigned rd = INSTR (4, 0);
9465 
9466   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9467   uint32_t value2 = ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
9468 				 shift, count);
9469 
9470   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9471   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9472   set_flags_for_binop32 (cpu, value1 & value2);
9473 }
9474 
9475 /* 64 bit BIC shifted register setting flags.  */
9476 static void
9477 bics64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9478 {
9479   unsigned rm = INSTR (20, 16);
9480   unsigned rn = INSTR (9, 5);
9481   unsigned rd = INSTR (4, 0);
9482 
9483   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9484   uint64_t value2 = ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
9485 				 shift, count);
9486 
9487   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9488   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9489   set_flags_for_binop64 (cpu, value1 & value2);
9490 }
9491 
9492 /* 32 bit EON shifted register.  */
9493 static void
9494 eon32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9495 {
9496   unsigned rm = INSTR (20, 16);
9497   unsigned rn = INSTR (9, 5);
9498   unsigned rd = INSTR (4, 0);
9499 
9500   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9501   aarch64_set_reg_u64
9502     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9503      ^ ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9504 }
9505 
9506 /* 64 bit EON shifted register.  */
9507 static void
9508 eon64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9509 {
9510   unsigned rm = INSTR (20, 16);
9511   unsigned rn = INSTR (9, 5);
9512   unsigned rd = INSTR (4, 0);
9513 
9514   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9515   aarch64_set_reg_u64
9516     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9517      ^ ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9518 }
9519 
9520 /* 32 bit EOR shifted register.  */
9521 static void
9522 eor32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9523 {
9524   unsigned rm = INSTR (20, 16);
9525   unsigned rn = INSTR (9, 5);
9526   unsigned rd = INSTR (4, 0);
9527 
9528   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9529   aarch64_set_reg_u64
9530     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9531      ^ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9532 }
9533 
9534 /* 64 bit EOR shifted register.  */
9535 static void
9536 eor64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9537 {
9538   unsigned rm = INSTR (20, 16);
9539   unsigned rn = INSTR (9, 5);
9540   unsigned rd = INSTR (4, 0);
9541 
9542   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9543   aarch64_set_reg_u64
9544     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9545      ^ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9546 }
9547 
9548 /* 32 bit ORR shifted register.  */
9549 static void
9550 orr32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9551 {
9552   unsigned rm = INSTR (20, 16);
9553   unsigned rn = INSTR (9, 5);
9554   unsigned rd = INSTR (4, 0);
9555 
9556   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9557   aarch64_set_reg_u64
9558     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9559      | shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9560 }
9561 
9562 /* 64 bit ORR shifted register.  */
9563 static void
9564 orr64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9565 {
9566   unsigned rm = INSTR (20, 16);
9567   unsigned rn = INSTR (9, 5);
9568   unsigned rd = INSTR (4, 0);
9569 
9570   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9571   aarch64_set_reg_u64
9572     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9573      | shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9574 }
9575 
9576 /* 32 bit ORN shifted register.  */
9577 static void
9578 orn32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9579 {
9580   unsigned rm = INSTR (20, 16);
9581   unsigned rn = INSTR (9, 5);
9582   unsigned rd = INSTR (4, 0);
9583 
9584   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9585   aarch64_set_reg_u64
9586     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9587      | ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9588 }
9589 
9590 /* 64 bit ORN shifted register.  */
9591 static void
9592 orn64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9593 {
9594   unsigned rm = INSTR (20, 16);
9595   unsigned rn = INSTR (9, 5);
9596   unsigned rd = INSTR (4, 0);
9597 
9598   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9599   aarch64_set_reg_u64
9600     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9601      | ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9602 }
9603 
9604 static void
9605 dexLogicalImmediate (sim_cpu *cpu)
9606 {
9607   /* assert instr[28,23] = 100100
9608      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
9609      instr[30,29] = op : 0 ==> AND, 1 ==> ORR, 2 ==> EOR, 3 ==> ANDS
9610      instr[22] = N : used to construct immediate mask
9611      instr[21,16] = immr
9612      instr[15,10] = imms
9613      instr[9,5] = Rn
9614      instr[4,0] = Rd  */
9615 
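  /* The 13 bit index N:immr:imms (instr[22,10]) selects a bitmask
     from the precomputed LITable; encodings that do not describe a
     valid mask decode to zero there and halt below.  Illustrative
     example (ours, not from the source): AND w0, w1, #0xff has
     N = 0, immr = 000000, imms = 000111, so index = 0x007 and the
     table yields 0xff replicated in each 32 bit element.  */
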
9616   /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
9617   uint32_t size = INSTR (31, 31);
9618   uint32_t N = INSTR (22, 22);
9619   /* uint32_t immr = INSTR (21, 16);  */
9620   /* uint32_t imms = INSTR (15, 10);  */
9621   uint32_t index = INSTR (22, 10);
9622   uint64_t bimm64 = LITable [index];
9623   uint32_t dispatch = INSTR (30, 29);
9624 
9625   if (~size & N)
9626     HALT_UNALLOC;
9627 
9628   if (!bimm64)
9629     HALT_UNALLOC;
9630 
9631   if (size == 0)
9632     {
9633       uint32_t bimm = (uint32_t) bimm64;
9634 
9635       switch (dispatch)
9636 	{
9637 	case 0: and32 (cpu, bimm); return;
9638 	case 1: orr32 (cpu, bimm); return;
9639 	case 2: eor32 (cpu, bimm); return;
9640 	case 3: ands32 (cpu, bimm); return;
9641 	}
9642     }
9643   else
9644     {
9645       switch (dispatch)
9646 	{
9647 	case 0: and64 (cpu, bimm64); return;
9648 	case 1: orr64 (cpu, bimm64); return;
9649 	case 2: eor64 (cpu, bimm64); return;
9650 	case 3: ands64 (cpu, bimm64); return;
9651 	}
9652     }
9653   HALT_UNALLOC;
9654 }
9655 
9656 /* Immediate move.
9657    The uimm argument is a 16 bit value to be inserted into the
9658    target register.  The pos argument locates the 16 bit word in
9659    the dest register, i.e. it is in {0, 1} for 32 bit and in
9660    {0, 1, 2, 3} for 64 bit.
9661    N.B. the register arg may not be SP, so it should be
9662    accessed using the setGZRegisterXXX accessors.  */
9663 
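/* Illustrative examples (ours, not from the source), with X0/W0 as
   the destination:
     movz64 (cpu, 0x1234, 2)                 : X0 = 0x0000123400000000
     movn32 (cpu, 0x0001, 0)                 : W0 = 0xfffffffe
     movk64 (cpu, 0x1234, 1) on X0 == ~0ULL  : X0 = 0xffffffff1234ffff  */
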
9664 /* 32 bit move 16 bit immediate zero remaining shorts.  */
9665 static void
9666 movz32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9667 {
9668   unsigned rd = INSTR (4, 0);
9669 
9670   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9671   aarch64_set_reg_u64 (cpu, rd, NO_SP, val << (pos * 16));
9672 }
9673 
9674 /* 64 bit move 16 bit immediate zero remaining shorts.  */
9675 static void
9676 movz64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9677 {
9678   unsigned rd = INSTR (4, 0);
9679 
9680   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9681   aarch64_set_reg_u64 (cpu, rd, NO_SP, ((uint64_t) val) << (pos * 16));
9682 }
9683 
9684 /* 32 bit move 16 bit immediate negated.  */
9685 static void
9686 movn32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9687 {
9688   unsigned rd = INSTR (4, 0);
9689 
9690   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9691   aarch64_set_reg_u64 (cpu, rd, NO_SP, ((val << (pos * 16)) ^ 0xffffffffU));
9692 }
9693 
9694 /* 64 bit move 16 bit immediate negated.  */
9695 static void
9696 movn64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9697 {
9698   unsigned rd = INSTR (4, 0);
9699 
9700   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9701   aarch64_set_reg_u64
9702     (cpu, rd, NO_SP, ((((uint64_t) val) << (pos * 16))
9703 		      ^ 0xffffffffffffffffULL));
9704 }
9705 
9706 /* 32 bit move 16 bit immediate keep remaining shorts.  */
9707 static void
9708 movk32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9709 {
9710   unsigned rd = INSTR (4, 0);
9711   uint32_t current = aarch64_get_reg_u32 (cpu, rd, NO_SP);
9712   uint32_t value = val << (pos * 16);
9713   uint32_t mask = ~(0xffffU << (pos * 16));
9714 
9715   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9716   aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask)));
9717 }
9718 
9719 /* 64 bit move 16 bit immediate keep remaining shorts.  */
9720 static void
9721 movk64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9722 {
9723   unsigned rd = INSTR (4, 0);
9724   uint64_t current = aarch64_get_reg_u64 (cpu, rd, NO_SP);
9725   uint64_t value = (uint64_t) val << (pos * 16);
9726   uint64_t mask = ~(0xffffULL << (pos * 16));
9727 
9728   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9729   aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask)));
9730 }
9731 
9732 static void
9733 dexMoveWideImmediate (sim_cpu *cpu)
9734 {
9735   /* assert instr[28:23] = 100101
9736      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
9737      instr[30,29] = op : 0 ==> MOVN, 1 ==> UNALLOC, 2 ==> MOVZ, 3 ==> MOVK
9738      instr[22,21] = shift : 00 == LSL#0, 01 = LSL#16, 10 = LSL#32, 11 = LSL#48
9739      instr[20,5] = uimm16
9740      instr[4,0] = Rd  */
9741 
9742   /* N.B. the (multiple of 16) shift is applied by the called routine,
9743      we just pass the multiplier.  */
9744 
9745   uint32_t imm;
9746   uint32_t size = INSTR (31, 31);
9747   uint32_t op = INSTR (30, 29);
9748   uint32_t shift = INSTR (22, 21);
9749 
9750   /* A 32 bit move can only shift by 0 or 1 lot of 16 bits;
9751      anything else is an unallocated instruction.  */
9752   if (size == 0 && (shift > 1))
9753     HALT_UNALLOC;
9754 
9755   if (op == 1)
9756     HALT_UNALLOC;
9757 
9758   imm = INSTR (20, 5);
9759 
9760   if (size == 0)
9761     {
9762       if (op == 0)
9763 	movn32 (cpu, imm, shift);
9764       else if (op == 2)
9765 	movz32 (cpu, imm, shift);
9766       else
9767 	movk32 (cpu, imm, shift);
9768     }
9769   else
9770     {
9771       if (op == 0)
9772 	movn64 (cpu, imm, shift);
9773       else if (op == 2)
9774 	movz64 (cpu, imm, shift);
9775       else
9776 	movk64 (cpu, imm, shift);
9777     }
9778 }
9779 
9780 /* Bitfield operations.
9781    These take a pair of bit positions r and s which are in {0..31}
9782    or {0..63} depending on the instruction word size.
9783    N.B register args may not be SP.  */
9784 
9785 /* OK, we start with ubfm, which just needs to pick
9786    some bits out of the source, zero the rest and write
9787    the result to the dest.  Just two logical shifts are needed.  */
9788 
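/* Illustrative example (ours, not from the source): ubfm32 with
   r = 4, s = 7 (the UBFX w0, w1, #4, #4 alias) extracts bits 7:4.
   For Wn = 0x0000abcd:
     value = 0x0000abcd << (31 - 7)      = 0xcd000000
     value = 0xcd000000 >> (31 + 4 - 7)  = 0x0000000c
   so Wd = 0xc.  */
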
9789 /* 32 bit bitfield move, left and right of affected zeroed
9790    If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
9791 static void
9792 ubfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
9793 {
9794   unsigned rd;
9795   unsigned rn = INSTR (9, 5);
9796   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9797 
9798   /* Pick either s+1-r or s+1 consecutive bits out of the original word.  */
9799   if (r <= s)
9800     {
9801       /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
9802          We want only bits s:xxx:r at the bottom of the word
9803          so we LSL bit s up to bit 31 i.e. by 31 - s
9804          and then we LSR to bring bit 31 down to bit s - r
9805 	 i.e. by 31 + r - s.  */
9806       value <<= 31 - s;
9807       value >>= 31 + r - s;
9808     }
9809   else
9810     {
9811       /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0
9812          We want only bits s:xxx:0 starting at bit 31-(r-1)
9813          so we LSL bit s up to bit 31 i.e. by 31 - s
9814          and then we LSL to bring bit 31 down to 31-(r-1)+s
9815 	 i.e. by r - (s + 1).  */
9816       value <<= 31 - s;
9817       value >>= r - (s + 1);
9818     }
9819 
9820   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9821   rd = INSTR (4, 0);
9822   aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
9823 }
9824 
9825 /* 64 bit bitfield move, left and right of affected zeroed
9826    If r <= s Xd<s-r:0> = Xn<s:r> else Xd<64+s-r,64-r> = Xn<s:0>.  */
9827 static void
9828 ubfm (sim_cpu *cpu, uint32_t r, uint32_t s)
9829 {
9830   unsigned rd;
9831   unsigned rn = INSTR (9, 5);
9832   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9833 
9834   if (r <= s)
9835     {
9836       /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
9837          We want only bits s:xxx:r at the bottom of the word.
9838          So we LSL bit s up to bit 63 i.e. by 63 - s
9839          and then we LSR to bring bit 63 down to bit s - r
9840 	 i.e. by 63 + r - s.  */
9841       value <<= 63 - s;
9842       value >>= 63 + r - s;
9843     }
9844   else
9845     {
9846       /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0.
9847          We want only bits s:xxx:0 starting at bit 63-(r-1).
9848          So we LSL bit s up to bit 63 i.e. by 63 - s
9849          and then we LSL to bring bit 63 down to 63-(r-1)+s
9850 	 i.e. by r - (s + 1).  */
9851       value <<= 63 - s;
9852       value >>= r - (s + 1);
9853     }
9854 
9855   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9856   rd = INSTR (4, 0);
9857   aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
9858 }
9859 
9860 /* The signed versions need to insert sign bits
9861    on the left of the inserted bit field, so we do
9862    much the same as the unsigned version, except we
9863    use an arithmetic shift right -- this just means
9864    we need to operate on signed values.  */
9865 
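/* Illustrative example (ours, not from the source): sbfm32 with
   r = 0, s = 7 is the SXTB alias.  For Wn = 0x00000080:
     value = (int32_t) 0x80 << (31 - 7)  = 0x80000000
     value >> 24 (arithmetic)            = 0xffffff80
   so Wd = 0xffffff80, the sign-extended byte.  */
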
9866 /* 32 bit bitfield move, left of affected sign-extended, right zeroed.  */
9867 /* If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
9868 static void
9869 sbfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
9870 {
9871   unsigned rd;
9872   unsigned rn = INSTR (9, 5);
9873   /* As per ubfm32, but use an ASR instead of an LSR.  */
9874   int32_t value = aarch64_get_reg_s32 (cpu, rn, NO_SP);
9875 
9876   if (r <= s)
9877     {
9878       value <<= 31 - s;
9879       value >>= 31 + r - s;
9880     }
9881   else
9882     {
9883       value <<= 31 - s;
9884       value >>= r - (s + 1);
9885     }
9886 
9887   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9888   rd = INSTR (4, 0);
9889   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
9890 }
9891 
9892 /* 64 bit bitfield move, left of affected sign-extended, right zeroed.  */
9893 /* If r <= s Xd<s-r:0> = Xn<s:r> else Xd<64+s-r,64-r> = Xn<s:0>.  */
9894 static void
9895 sbfm (sim_cpu *cpu, uint32_t r, uint32_t s)
9896 {
9897   unsigned rd;
9898   unsigned rn = INSTR (9, 5);
9899   /* As per ubfm, but use an ASR instead of an LSR.  */
9900   int64_t value = aarch64_get_reg_s64 (cpu, rn, NO_SP);
9901 
9902   if (r <= s)
9903     {
9904       value <<= 63 - s;
9905       value >>= 63 + r - s;
9906     }
9907   else
9908     {
9909       value <<= 63 - s;
9910       value >>= r - (s + 1);
9911     }
9912 
9913   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9914   rd = INSTR (4, 0);
9915   aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
9916 }
9917 
9918 /* Finally, these versions leave non-affected bits
9919    as is, so we need to generate the bits as per
9920    ubfm and also generate a mask to pick the
9921    bits from the original and computed values.  */
9922 
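/* Illustrative example (ours, not from the source): bfm32 with
   r = 0, s = 7 (the BFXIL w0, w1, #0, #8 alias) gives
   mask = 0x000000ff, so for Wd = 0x11223344 and Wn = 0x000000aa
     (0x11223344 & ~0x000000ff) | 0xaa = 0x112233aa
   and only the low byte of Wd is replaced.  */
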
9923 /* 32 bit bitfield move, non-affected bits left as is.
9924    If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
9925 static void
9926 bfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
9927 {
9928   unsigned rn = INSTR (9, 5);
9929   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9930   uint32_t mask = -1;
9931   unsigned rd;
9932   uint32_t value2;
9933 
9934   /* Pick either s+1-r or s+1 consecutive bits out of the original word.  */
9935   if (r <= s)
9936     {
9937       /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
9938          We want only bits s:xxx:r at the bottom of the word
9939          so we LSL bit s up to bit 31 i.e. by 31 - s
9940          and then we LSR to bring bit 31 down to bit s - r
9941 	 i.e. by 31 + r - s.  */
9942       value <<= 31 - s;
9943       value >>= 31 + r - s;
9944       /* the mask must include the same bits.  */
9945       mask <<= 31 - s;
9946       mask >>= 31 + r - s;
9947     }
9948   else
9949     {
9950       /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0.
9951          We want only bits s:xxx:0 starting at bit 31-(r-1)
9952          so we LSL bit s up to bit 31 i.e. by 31 - s
9953          and then we LSL to bring bit 31 down to 31-(r-1)+s
9954 	 i.e. by r - (s + 1).  */
9955       value <<= 31 - s;
9956       value >>= r - (s + 1);
9957       /* The mask must include the same bits.  */
9958       mask <<= 31 - s;
9959       mask >>= r - (s + 1);
9960     }
9961 
9962   rd = INSTR (4, 0);
9963   value2 = aarch64_get_reg_u32 (cpu, rd, NO_SP);
9964 
9965   value2 &= ~mask;
9966   value2 |= value;
9967 
9968   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9969   aarch64_set_reg_u64 (cpu, rd, NO_SP, value2);  /* value2 == (Wd & ~mask) | value.  */
9971 }
9972 
9973 /* 64 bit bitfield move, non-affected bits left as is.
9974    If r <= s Xd<s-r:0> = Xn<s:r> else Xd<64+s-r,64-r> = Xn<s:0>.  */
9975 static void
9976 bfm (sim_cpu *cpu, uint32_t r, uint32_t s)
9977 {
9978   unsigned rd;
9979   unsigned rn = INSTR (9, 5);
9980   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9981   uint64_t mask = 0xffffffffffffffffULL;
9982 
9983   if (r <= s)
9984     {
9985       /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
9986          We want only bits s:xxx:r at the bottom of the word
9987          so we LSL bit s up to bit 63 i.e. by 63 - s
9988          and then we LSR to bring bit 63 down to bit s - r
9989 	 i.e. by 63 + r - s.  */
9990       value <<= 63 - s;
9991       value >>= 63 + r - s;
9992       /* The mask must include the same bits.  */
9993       mask <<= 63 - s;
9994       mask >>= 63 + r - s;
9995     }
9996   else
9997     {
9998       /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0
9999          We want only bits s:xxx:0 starting at bit 63-(r-1)
10000          so we LSL bit s up to bit 63 i.e. by 63 - s
10001          and then we LSL to bring bit 63 down to 63-(r-1)+s
10002 	 i.e. by r - (s + 1).  */
10003       value <<= 63 - s;
10004       value >>= r - (s + 1);
10005       /* The mask must include the same bits.  */
10006       mask <<= 63 - s;
10007       mask >>= r - (s + 1);
10008     }
10009 
10010   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10011   rd = INSTR (4, 0);
10012   aarch64_set_reg_u64
10013     (cpu, rd, NO_SP, (aarch64_get_reg_u64 (cpu, rd, NO_SP) & ~mask) | value);
10014 }
10015 
10016 static void
10017 dexBitfieldImmediate (sim_cpu *cpu)
10018 {
10019   /* assert instr[28:23] = 100110
10020      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
10021      instr[30,29] = op : 0 ==> SBFM, 1 ==> BFM, 2 ==> UBFM, 3 ==> UNALLOC
10022      instr[22] = N : must be 0 for 32 bit, 1 for 64 bit, otherwise UNALLOC
10023      instr[21,16] = immr : 0xxxxx for 32 bit, xxxxxx for 64 bit
10024      instr[15,10] = imms :  0xxxxx for 32 bit, xxxxxx for 64 bit
10025      instr[9,5] = Rn
10026      instr[4,0] = Rd  */
10027 
10028   /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
10029   uint32_t dispatch;
10030   uint32_t imms;
10031   uint32_t size = INSTR (31, 31);
10032   uint32_t N = INSTR (22, 22);
10033   /* 32 bit operations must have immr[5] = 0 and imms[5] = 0,
10034      or else we have an UNALLOC.  */
10035   uint32_t immr = INSTR (21, 16);
10036 
10037   if (~size & N)
10038     HALT_UNALLOC;
10039 
10040   if (!size && uimm (immr, 5, 5))
10041     HALT_UNALLOC;
10042 
10043   imms = INSTR (15, 10);
10044   if (!size && uimm (imms, 5, 5))
10045     HALT_UNALLOC;
10046 
10047   /* Switch on combined size and op.  */
10048   dispatch = INSTR (31, 29);
10049   switch (dispatch)
10050     {
10051     case 0: sbfm32 (cpu, immr, imms); return;
10052     case 1: bfm32 (cpu, immr, imms); return;
10053     case 2: ubfm32 (cpu, immr, imms); return;
10054     case 4: sbfm (cpu, immr, imms); return;
10055     case 5: bfm (cpu, immr, imms); return;
10056     case 6: ubfm (cpu, immr, imms); return;
10057     default: HALT_UNALLOC;
10058     }
10059 }
10060 
10061 static void
10062 do_EXTR_32 (sim_cpu *cpu)
10063 {
10064   /* instr[31:21] = 00010011100
10065      instr[20,16] = Rm
10066      instr[15,10] = imms :  0xxxxx for 32 bit
10067      instr[9,5]   = Rn
10068      instr[4,0]   = Rd  */
10069   unsigned rm   = INSTR (20, 16);
10070   unsigned imms = INSTR (15, 10) & 31;
10071   unsigned rn   = INSTR ( 9,  5);
10072   unsigned rd   = INSTR ( 4,  0);
10073   uint64_t val1;
10074   uint64_t val2;
10075 
10076   val1 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
10077   val1 >>= imms;
10078   val2 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
10079   val2 <<= (32 - imms);  /* Any Rn bits above bit 31 are masked off below.  */
10080 
10081   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10082   aarch64_set_reg_u64 (cpu, rd, NO_SP, (val1 | val2) & 0xffffffffULL);
10083 }
10084 
10085 static void
10086 do_EXTR_64 (sim_cpu *cpu)
10087 {
10088   /* instr[31:21] = 10010011100
10089      instr[20,16] = Rm
10090      instr[15,10] = imms
10091      instr[9,5]   = Rn
10092      instr[4,0]   = Rd  */
10093   unsigned rm   = INSTR (20, 16);
10094   unsigned imms = INSTR (15, 10) & 63;
10095   unsigned rn   = INSTR ( 9,  5);
10096   unsigned rd   = INSTR ( 4,  0);
10097   uint64_t val;
10098 
10099   val = aarch64_get_reg_u64 (cpu, rm, NO_SP);
10100   val >>= imms;
10101   val |= imms ? (aarch64_get_reg_u64 (cpu, rn, NO_SP) << (64 - imms)) : 0; /* << 64 is undefined.  */
10102 
10103   aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
10104 }
10105 
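/* Illustrative example (ours, not from the source): EXTR with
   Rn == Rm is the ROR alias.  do_EXTR_32 with imms = 8 and
   Wn == Wm == 0x11223344 computes
     (0x11223344 >> 8) | (0x11223344 << 24) = 0x44112233
   i.e. a rotate right by 8; the final 32 bit mask keeps the top
   half of the X register zero, as a 32 bit operation requires.  */
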
10106 static void
10107 dexExtractImmediate (sim_cpu *cpu)
10108 {
10109   /* assert instr[28:23] = 100111
10110      instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
10111      instr[30,29] = op21 : 0 ==> EXTR, 1,2,3 ==> UNALLOC
10112      instr[22]    = N : must be 0 for 32 bit, 1 for 64 bit or UNALLOC
10113      instr[21]    = op0 : must be 0 or UNALLOC
10114      instr[20,16] = Rm
10115      instr[15,10] = imms :  0xxxxx for 32 bit, xxxxxx for 64 bit
10116      instr[9,5]   = Rn
10117      instr[4,0]   = Rd  */
10118 
10119   /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
10120   /* 64 bit operations must have N = 1 or else we have an UNALLOC.  */
10121   uint32_t dispatch;
10122   uint32_t size = INSTR (31, 31);
10123   uint32_t N = INSTR (22, 22);
10124   /* 32 bit operations must have imms[5] = 0
10125      or else we have an UNALLOC.  */
10126   uint32_t imms = INSTR (15, 10);
10127 
10128   if (size ^ N)
10129     HALT_UNALLOC;
10130 
10131   if (!size && uimm (imms, 5, 5))
10132     HALT_UNALLOC;
10133 
10134   /* Switch on combined size and op.  */
10135   dispatch = INSTR (31, 29);
10136 
10137   if (dispatch == 0)
10138     do_EXTR_32 (cpu);
10139 
10140   else if (dispatch == 4)
10141     do_EXTR_64 (cpu);
10142 
10143   else if (dispatch == 1)
10144     HALT_NYI;
10145   else
10146     HALT_UNALLOC;
10147 }
10148 
10149 static void
10150 dexDPImm (sim_cpu *cpu)
10151 {
10152   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
10153      assert  group == GROUP_DPIMM_1000 || group == GROUP_DPIMM_1001
10154      bits [25,23] of a DPImm are the secondary dispatch vector.  */
10155   uint32_t group2 = dispatchDPImm (aarch64_get_instr (cpu));
10156 
10157   switch (group2)
10158     {
10159     case DPIMM_PCADR_000:
10160     case DPIMM_PCADR_001:
10161       dexPCRelAddressing (cpu);
10162       return;
10163 
10164     case DPIMM_ADDSUB_010:
10165     case DPIMM_ADDSUB_011:
10166       dexAddSubtractImmediate (cpu);
10167       return;
10168 
10169     case DPIMM_LOG_100:
10170       dexLogicalImmediate (cpu);
10171       return;
10172 
10173     case DPIMM_MOV_101:
10174       dexMoveWideImmediate (cpu);
10175       return;
10176 
10177     case DPIMM_BITF_110:
10178       dexBitfieldImmediate (cpu);
10179       return;
10180 
10181     case DPIMM_EXTR_111:
10182       dexExtractImmediate (cpu);
10183       return;
10184 
10185     default:
10186       /* Should never reach here.  */
10187       HALT_NYI;
10188     }
10189 }
10190 
10191 static void
10192 dexLoadUnscaledImmediate (sim_cpu *cpu)
10193 {
10194   /* instr[29,24] == 111_00
10195      instr[21] == 0
10196      instr[11,10] == 00
10197      instr[31,30] = size
10198      instr[26] = V
10199      instr[23,22] = opc
10200      instr[20,12] = simm9
10201      instr[9,5] = rn may be SP.  */
10202   /* unsigned rt = INSTR (4, 0);  */
10203   uint32_t V = INSTR (26, 26);
10204   uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10205   int32_t imm = simm32 (aarch64_get_instr (cpu), 20, 12);
10206 
10207   if (!V)
10208     {
10209       /* GReg operations.  */
10210       switch (dispatch)
10211 	{
10212 	case 0:	 sturb (cpu, imm); return;
10213 	case 1:	 ldurb32 (cpu, imm); return;
10214 	case 2:	 ldursb64 (cpu, imm); return;
10215 	case 3:	 ldursb32 (cpu, imm); return;
10216 	case 4:	 sturh (cpu, imm); return;
10217 	case 5:	 ldurh32 (cpu, imm); return;
10218 	case 6:	 ldursh64 (cpu, imm); return;
10219 	case 7:	 ldursh32 (cpu, imm); return;
10220 	case 8:	 stur32 (cpu, imm); return;
10221 	case 9:	 ldur32 (cpu, imm); return;
10222 	case 10: ldursw (cpu, imm); return;
10223 	case 12: stur64 (cpu, imm); return;
10224 	case 13: ldur64 (cpu, imm); return;
10225 
10226 	case 14:
10227 	  /* PRFUM NYI.  */
10228 	  HALT_NYI;
10229 
10230 	default:
10231 	case 11:
10232 	case 15:
10233 	  HALT_UNALLOC;
10234 	}
10235     }
10236 
10237   /* FReg operations.  */
10238   switch (dispatch)
10239     {
10240     case 2:  fsturq (cpu, imm); return;
10241     case 3:  fldurq (cpu, imm); return;
10242     case 8:  fsturs (cpu, imm); return;
10243     case 9:  fldurs (cpu, imm); return;
10244     case 12: fsturd (cpu, imm); return;
10245     case 13: fldurd (cpu, imm); return;
10246 
10247     case 0: /* STUR 8 bit FP.  */
10248     case 1: /* LDUR 8 bit FP.  */
10249     case 4: /* STUR 16 bit FP.  */
10250     case 5: /* LDUR 16 bit FP.  */
10251       HALT_NYI;
10252 
10253     default:
10254     case 6:
10255     case 7:
10256     case 10:
10257     case 11:
10258     case 14:
10259     case 15:
10260       HALT_UNALLOC;
10261     }
10262 }
10263 
10264 /*  N.B. A preliminary note regarding all the ldrs<x>32
10265     instructions
10266 
10267    The signed value loaded by these instructions is cast to unsigned
10268    before being assigned to aarch64_get_reg_u64 (cpu, N) i.e. to the
10269    64 bit element of the GReg union.  This performs a 32 bit sign extension
10270    (as required) but avoids 64 bit sign extension, thus ensuring that the
10271    top half of the register word is zero.  This is what the spec demands
10272    when a 32 bit load occurs.  */
10273 
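/* Illustrative example (ours, not from the source): a 32 bit ldrsb
   of the byte 0x80 must leave the X register holding
   0x00000000ffffff80 -- sign extension up to bit 31, zeroes above.  */
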
10274 /* 32 bit load sign-extended byte scaled unsigned 12 bit.  */
10275 static void
10276 ldrsb32_abs (sim_cpu *cpu, uint32_t offset)
10277 {
10278   unsigned int rn = INSTR (9, 5);
10279   unsigned int rt = INSTR (4, 0);
10280 
10281   /* The target register may not be SP but the source may be.
10282      There is no scaling required for a byte load.  */
10283   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
10284   aarch64_set_reg_u64 (cpu, rt, NO_SP,
10285 		       (int64_t) aarch64_get_mem_s8 (cpu, address));
10286 }
10287 
10288 /* 32 bit load sign-extended byte scaled or unscaled zero-
10289    or sign-extended 32-bit register offset.  */
10290 static void
10291 ldrsb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10292 {
10293   unsigned int rm = INSTR (20, 16);
10294   unsigned int rn = INSTR (9, 5);
10295   unsigned int rt = INSTR (4, 0);
10296 
10297   /* rn may reference SP, rm and rt must reference ZR.  */
10298 
10299   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10300   int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10301 				 extension);
10302 
10303   /* There is no scaling required for a byte load.  */
10304   aarch64_set_reg_u64
10305     (cpu, rt, NO_SP, (int64_t) aarch64_get_mem_s8 (cpu, address
10306 						   + displacement));
10307 }
10308 
10309 /* 32 bit load sign-extended byte unscaled signed 9 bit with
10310    pre- or post-writeback.  */
10311 static void
10312 ldrsb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10313 {
10314   uint64_t address;
10315   unsigned int rn = INSTR (9, 5);
10316   unsigned int rt = INSTR (4, 0);
10317 
10318   if (rn == rt && wb != NoWriteBack)
10319     HALT_UNALLOC;
10320 
10321   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10322 
10323   if (wb == Pre)
10324       address += offset;
10325 
10326   aarch64_set_reg_u64 (cpu, rt, NO_SP,
10327 		       (int64_t) aarch64_get_mem_s8 (cpu, address));
10328 
10329   if (wb == Post)
10330     address += offset;
10331 
10332   if (wb != NoWriteBack)
10333     aarch64_set_reg_u64 (cpu, rn, NO_SP, address);
10334 }
10335 
10336 /* 8 bit store scaled.  */
10337 static void
10338 fstrb_abs (sim_cpu *cpu, uint32_t offset)
10339 {
10340   unsigned st = INSTR (4, 0);
10341   unsigned rn = INSTR (9, 5);
10342 
10343   aarch64_set_mem_u8 (cpu,
10344 		      aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
10345 		      aarch64_get_vec_u8 (cpu, st, 0));
10346 }
10347 
10348 /* 8 bit store scaled or unscaled zero- or
10349    sign-extended 8-bit register offset.  */
10350 static void
10351 fstrb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10352 {
10353   unsigned rm = INSTR (20, 16);
10354   unsigned rn = INSTR (9, 5);
10355   unsigned st = INSTR (4, 0);
10356 
10357   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10358   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10359 			       extension);
10360   uint64_t  displacement = scaling == Scaled ? extended : 0;
10361 
10362   aarch64_set_mem_u8
10363     (cpu, address + displacement, aarch64_get_vec_u8 (cpu, st, 0));
10364 }
10365 
10366 /* 16 bit store scaled.  */
10367 static void
10368 fstrh_abs (sim_cpu *cpu, uint32_t offset)
10369 {
10370   unsigned st = INSTR (4, 0);
10371   unsigned rn = INSTR (9, 5);
10372 
10373   aarch64_set_mem_u16
10374     (cpu,
10375      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16),
10376      aarch64_get_vec_u16 (cpu, st, 0));
10377 }
10378 
10379 /* 16 bit store scaled or unscaled zero-
10380    or sign-extended 16-bit register offset.  */
10381 static void
10382 fstrh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10383 {
10384   unsigned rm = INSTR (20, 16);
10385   unsigned rn = INSTR (9, 5);
10386   unsigned st = INSTR (4, 0);
10387 
10388   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10389   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10390 			       extension);
10391   uint64_t  displacement = OPT_SCALE (extended, 16, scaling);
10392 
10393   aarch64_set_mem_u16
10394     (cpu, address + displacement, aarch64_get_vec_u16 (cpu, st, 0));
10395 }
10396 
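/* Illustrative note (our assumption about the OPT_SCALE helper,
   defined earlier in this file): for str h0, [x1, x2, lsl #1] the
   extended index is shifted left by 1, i.e. scaled by the 2 byte
   access size; the unscaled form applies no shift.  */
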
10397 /* 32 bit store scaled unsigned 12 bit.  */
10398 static void
10399 fstrs_abs (sim_cpu *cpu, uint32_t offset)
10400 {
10401   unsigned st = INSTR (4, 0);
10402   unsigned rn = INSTR (9, 5);
10403 
10404   aarch64_set_mem_u32
10405     (cpu,
10406      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32),
10407      aarch64_get_vec_u32 (cpu, st, 0));
10408 }
10409 
10410 /* 32 bit store unscaled signed 9 bit with pre- or post-writeback.  */
10411 static void
10412 fstrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10413 {
10414   unsigned rn = INSTR (9, 5);
10415   unsigned st = INSTR (4, 0);
10416 
10417   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10418 
10419   if (wb != Post)
10420     address += offset;
10421 
10422   aarch64_set_mem_u32 (cpu, address, aarch64_get_vec_u32 (cpu, st, 0));
10423 
10424   if (wb == Post)
10425     address += offset;
10426 
10427   if (wb != NoWriteBack)
10428     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10429 }
10430 
10431 /* 32 bit store scaled or unscaled zero-
10432    or sign-extended 32-bit register offset.  */
10433 static void
10434 fstrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10435 {
10436   unsigned rm = INSTR (20, 16);
10437   unsigned rn = INSTR (9, 5);
10438   unsigned st = INSTR (4, 0);
10439 
10440   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10441   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10442 			       extension);
10443   uint64_t  displacement = OPT_SCALE (extended, 32, scaling);
10444 
10445   aarch64_set_mem_u32
10446     (cpu, address + displacement, aarch64_get_vec_u32 (cpu, st, 0));
10447 }
10448 
10449 /* 64 bit store scaled unsigned 12 bit.  */
10450 static void
10451 fstrd_abs (sim_cpu *cpu, uint32_t offset)
10452 {
10453   unsigned st = INSTR (4, 0);
10454   unsigned rn = INSTR (9, 5);
10455 
10456   aarch64_set_mem_u64
10457     (cpu,
10458      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64),
10459      aarch64_get_vec_u64 (cpu, st, 0));
10460 }
10461 
10462 /* 64 bit store unscaled signed 9 bit with pre- or post-writeback.  */
10463 static void
10464 fstrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10465 {
10466   unsigned rn = INSTR (9, 5);
10467   unsigned st = INSTR (4, 0);
10468 
10469   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10470 
10471   if (wb != Post)
10472     address += offset;
10473 
10474   aarch64_set_mem_u64 (cpu, address, aarch64_get_vec_u64 (cpu, st, 0));
10475 
10476   if (wb == Post)
10477     address += offset;
10478 
10479   if (wb != NoWriteBack)
10480     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10481 }
10482 
10483 /* 64 bit store scaled or unscaled zero-
10484    or sign-extended 32-bit register offset.  */
10485 static void
10486 fstrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10487 {
10488   unsigned rm = INSTR (20, 16);
10489   unsigned rn = INSTR (9, 5);
10490   unsigned st = INSTR (4, 0);
10491 
10492   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10493   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10494 			       extension);
10495   uint64_t  displacement = OPT_SCALE (extended, 64, scaling);
10496 
10497   aarch64_set_mem_u64
10498     (cpu, address + displacement, aarch64_get_vec_u64 (cpu, st, 0));
10499 }
10500 
10501 /* 128 bit store scaled unsigned 12 bit.  */
10502 static void
10503 fstrq_abs (sim_cpu *cpu, uint32_t offset)
10504 {
10505   FRegister a;
10506   unsigned st = INSTR (4, 0);
10507   unsigned rn = INSTR (9, 5);
10508   uint64_t addr;
10509 
10510   aarch64_get_FP_long_double (cpu, st, & a);
10511 
10512   addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128);
10513   aarch64_set_mem_long_double (cpu, addr, a);
10514 }
10515 
10516 /* 128 bit store unscaled signed 9 bit with pre- or post-writeback.  */
10517 static void
10518 fstrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10519 {
10520   FRegister a;
10521   unsigned rn = INSTR (9, 5);
10522   unsigned st = INSTR (4, 0);
10523   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10524 
10525   if (wb != Post)
10526     address += offset;
10527 
10528   aarch64_get_FP_long_double (cpu, st, & a);
10529   aarch64_set_mem_long_double (cpu, address, a);
10530 
10531   if (wb == Post)
10532     address += offset;
10533 
10534   if (wb != NoWriteBack)
10535     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10536 }
10537 
10538 /* 128 bit store scaled or unscaled zero-
10539    or sign-extended 32-bit register offset.  */
10540 static void
10541 fstrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10542 {
10543   unsigned rm = INSTR (20, 16);
10544   unsigned rn = INSTR (9, 5);
10545   unsigned st = INSTR (4, 0);
10546 
10547   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10548   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10549 			       extension);
10550   uint64_t  displacement = OPT_SCALE (extended, 128, scaling);
10551 
10552   FRegister a;
10553 
10554   aarch64_get_FP_long_double (cpu, st, & a);
10555   aarch64_set_mem_long_double (cpu, address + displacement, a);
10556 }
10557 
10558 static void
10559 dexLoadImmediatePrePost (sim_cpu *cpu)
10560 {
10561   /* instr[31,30] = size
10562      instr[29,27] = 111
10563      instr[26]    = V
10564      instr[25,24] = 00
10565      instr[23,22] = opc
10566      instr[21]    = 0
10567      instr[20,12] = simm9
10568      instr[11]    = wb : 0 ==> Post, 1 ==> Pre
10569      instr[10]    = 0
10570      instr[9,5]   = Rn may be SP.
10571      instr[4,0]   = Rt */
10572 
10573   uint32_t  V        = INSTR (26, 26);
10574   uint32_t  dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10575   int32_t   imm      = simm32 (aarch64_get_instr (cpu), 20, 12);
10576   WriteBack wb       = INSTR (11, 11);
10577 
10578   if (!V)
10579     {
10580       /* GReg operations.  */
10581       switch (dispatch)
10582 	{
10583 	case 0:	 strb_wb (cpu, imm, wb); return;
10584 	case 1:	 ldrb32_wb (cpu, imm, wb); return;
10585 	case 2:	 ldrsb_wb (cpu, imm, wb); return;
10586 	case 3:	 ldrsb32_wb (cpu, imm, wb); return;
10587 	case 4:	 strh_wb (cpu, imm, wb); return;
10588 	case 5:	 ldrh32_wb (cpu, imm, wb); return;
10589 	case 6:	 ldrsh64_wb (cpu, imm, wb); return;
10590 	case 7:	 ldrsh32_wb (cpu, imm, wb); return;
10591 	case 8:	 str32_wb (cpu, imm, wb); return;
10592 	case 9:	 ldr32_wb (cpu, imm, wb); return;
10593 	case 10: ldrsw_wb (cpu, imm, wb); return;
10594 	case 12: str_wb (cpu, imm, wb); return;
10595 	case 13: ldr_wb (cpu, imm, wb); return;
10596 
10597 	default:
10598 	case 11:
10599 	case 14:
10600 	case 15:
10601 	  HALT_UNALLOC;
10602 	}
10603     }
10604 
10605   /* FReg operations.  */
10606   switch (dispatch)
10607     {
10608     case 2:  fstrq_wb (cpu, imm, wb); return;
10609     case 3:  fldrq_wb (cpu, imm, wb); return;
10610     case 8:  fstrs_wb (cpu, imm, wb); return;
10611     case 9:  fldrs_wb (cpu, imm, wb); return;
10612     case 12: fstrd_wb (cpu, imm, wb); return;
10613     case 13: fldrd_wb (cpu, imm, wb); return;
10614 
10615     case 0:	  /* STUR 8 bit FP.  */
10616     case 1:	  /* LDUR 8 bit FP.  */
10617     case 4:	  /* STUR 16 bit FP.  */
10618     case 5:	  /* LDUR 16 bit FP.  */
10619       HALT_NYI;
10620 
10621     default:
10622     case 6:
10623     case 7:
10624     case 10:
10625     case 11:
10626     case 14:
10627     case 15:
10628       HALT_UNALLOC;
10629     }
10630 }
10631 
10632 static void
10633 dexLoadRegisterOffset (sim_cpu *cpu)
10634 {
10635   /* instr[31,30] = size
10636      instr[29,27] = 111
10637      instr[26]    = V
10638      instr[25,24] = 00
10639      instr[23,22] = opc
10640      instr[21]    = 1
10641      instr[20,16] = rm
10642      instr[15,13] = option : 010 ==> UXTW, 011 ==> UXTX/LSL,
10643                              110 ==> SXTW, 111 ==> SXTX,
10644                              otherwise ==> RESERVED
10645      instr[12]    = scaled
10646      instr[11,10] = 10
10647      instr[9,5]   = rn
10648      instr[4,0]   = rt.  */
10649 
10650   uint32_t  V = INSTR (26, 26);
10651   uint32_t  dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10652   Scaling   scale = INSTR (12, 12);
10653   Extension extensionType = INSTR (15, 13);
10654 
10655   /* Check for illegal extension types.  */
10656   if (uimm (extensionType, 1, 1) == 0)
10657     HALT_UNALLOC;
10658 
10659   if (extensionType == UXTX || extensionType == SXTX)
10660     extensionType = NoExtension;
10661 
10662   if (!V)
10663     {
10664       /* GReg operations.  */
10665       switch (dispatch)
10666 	{
10667 	case 0:	 strb_scale_ext (cpu, scale, extensionType); return;
10668 	case 1:	 ldrb32_scale_ext (cpu, scale, extensionType); return;
10669 	case 2:	 ldrsb_scale_ext (cpu, scale, extensionType); return;
10670 	case 3:	 ldrsb32_scale_ext (cpu, scale, extensionType); return;
10671 	case 4:	 strh_scale_ext (cpu, scale, extensionType); return;
10672 	case 5:	 ldrh32_scale_ext (cpu, scale, extensionType); return;
10673 	case 6:	 ldrsh_scale_ext (cpu, scale, extensionType); return;
10674 	case 7:	 ldrsh32_scale_ext (cpu, scale, extensionType); return;
10675 	case 8:	 str32_scale_ext (cpu, scale, extensionType); return;
10676 	case 9:	 ldr32_scale_ext (cpu, scale, extensionType); return;
10677 	case 10: ldrsw_scale_ext (cpu, scale, extensionType); return;
10678 	case 12: str_scale_ext (cpu, scale, extensionType); return;
10679 	case 13: ldr_scale_ext (cpu, scale, extensionType); return;
10680 	case 14: prfm_scale_ext (cpu, scale, extensionType); return;
10681 
10682 	default:
10683 	case 11:
10684 	case 15:
10685 	  HALT_UNALLOC;
10686 	}
10687     }
10688 
10689   /* FReg operations.  */
10690   switch (dispatch)
10691     {
10692     case 1: /* LDUR 8 bit FP.  */
10693       HALT_NYI;
10694     case 3:  fldrq_scale_ext (cpu, scale, extensionType); return;
10695     case 5: /* LDUR 16 bit FP.  */
10696       HALT_NYI;
10697     case 9:  fldrs_scale_ext (cpu, scale, extensionType); return;
10698     case 13: fldrd_scale_ext (cpu, scale, extensionType); return;
10699 
10700     case 0:  fstrb_scale_ext (cpu, scale, extensionType); return;
10701     case 2:  fstrq_scale_ext (cpu, scale, extensionType); return;
10702     case 4:  fstrh_scale_ext (cpu, scale, extensionType); return;
10703     case 8:  fstrs_scale_ext (cpu, scale, extensionType); return;
10704     case 12: fstrd_scale_ext (cpu, scale, extensionType); return;
10705 
10706     default:
10707     case 6:
10708     case 7:
10709     case 10:
10710     case 11:
10711     case 14:
10712     case 15:
10713       HALT_UNALLOC;
10714     }
10715 }
10716 
10717 static void
10718 dexLoadUnsignedImmediate (sim_cpu *cpu)
10719 {
10720   /* instr[29,24] == 111_01
10721      instr[31,30] = size
10722      instr[26]    = V
10723      instr[23,22] = opc
10724      instr[21,10] = uimm12 : unsigned immediate offset
10725      instr[9,5]   = rn may be SP.
10726      instr[4,0]   = rt.  */
10727 
10728   uint32_t V = INSTR (26,26);
10729   uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10730   uint32_t imm = INSTR (21, 10);
10731 
10732   if (!V)
10733     {
10734       /* GReg operations.  */
10735       switch (dispatch)
10736 	{
10737 	case 0:  strb_abs (cpu, imm); return;
10738 	case 1:  ldrb32_abs (cpu, imm); return;
10739 	case 2:  ldrsb_abs (cpu, imm); return;
10740 	case 3:  ldrsb32_abs (cpu, imm); return;
10741 	case 4:  strh_abs (cpu, imm); return;
10742 	case 5:  ldrh32_abs (cpu, imm); return;
10743 	case 6:  ldrsh_abs (cpu, imm); return;
10744 	case 7:  ldrsh32_abs (cpu, imm); return;
10745 	case 8:  str32_abs (cpu, imm); return;
10746 	case 9:  ldr32_abs (cpu, imm); return;
10747 	case 10: ldrsw_abs (cpu, imm); return;
10748 	case 12: str_abs (cpu, imm); return;
10749 	case 13: ldr_abs (cpu, imm); return;
10750 	case 14: prfm_abs (cpu, imm); return;
10751 
10752 	default:
10753 	case 11:
10754 	case 15:
10755 	  HALT_UNALLOC;
10756 	}
10757     }
10758 
10759   /* FReg operations.  */
10760   switch (dispatch)
10761     {
10762     case 0:  fstrb_abs (cpu, imm); return;
10763     case 4:  fstrh_abs (cpu, imm); return;
10764     case 8:  fstrs_abs (cpu, imm); return;
10765     case 12: fstrd_abs (cpu, imm); return;
10766     case 2:  fstrq_abs (cpu, imm); return;
10767 
10768     case 1:  fldrb_abs (cpu, imm); return;
10769     case 5:  fldrh_abs (cpu, imm); return;
10770     case 9:  fldrs_abs (cpu, imm); return;
10771     case 13: fldrd_abs (cpu, imm); return;
10772     case 3:  fldrq_abs (cpu, imm); return;
10773 
10774     default:
10775     case 6:
10776     case 7:
10777     case 10:
10778     case 11:
10779     case 14:
10780     case 15:
10781       HALT_UNALLOC;
10782     }
10783 }
10784 
10785 static void
10786 dexLoadExclusive (sim_cpu *cpu)
10787 {
10788   /* assert instr[29:24] = 001000;
10789      instr[31,30] = size
10790      instr[23] = 0 if exclusive
10791      instr[22] = L : 1 if load, 0 if store
10792      instr[21] = 1 if pair
10793      instr[20,16] = Rs
10794      instr[15] = o0 : 1 if ordered
10795      instr[14,10] = Rt2
10796      instr[9,5] = Rn
10797      instr[4,0] = Rt.  */
10798 
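  /* Illustrative example (ours, not from the source):
     ldxr x0, [x1] has L = 1 and pair = 0, so INSTR (22, 21) == 2;
     stxr w2, x0, [x1] has L = 0 and pair = 0, giving 0.  */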
10799   switch (INSTR (22, 21))
10800     {
10801     case 2:   ldxr (cpu); return;
10802     case 0:   stxr (cpu); return;
10803     default:  HALT_NYI;
10804     }
10805 }
10806 
10807 static void
10808 dexLoadOther (sim_cpu *cpu)
10809 {
10810   uint32_t dispatch;
10811 
10812   /* instr[29,25] = 111_0
10813      instr[24] == 0 ==> dispatch, 1 ==> ldst reg unsigned immediate
10814      instr[21] and instr[11,10] combine to form the secondary dispatch.  */
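  /* Illustrative example (ours, not from the source): a post-index
     ldr x0, [x1], #8 has instr[24] = 0, instr[21] = 0 and
     instr[11,10] = 01, so the secondary dispatch value is 1 and it
     is handled by dexLoadImmediatePrePost.  */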
10815   if (INSTR (24, 24))
10816     {
10817       dexLoadUnsignedImmediate (cpu);
10818       return;
10819     }
10820 
10821   dispatch = ((INSTR (21, 21) << 2) | INSTR (11, 10));
10822   switch (dispatch)
10823     {
10824     case 0: dexLoadUnscaledImmediate (cpu); return;
10825     case 1: dexLoadImmediatePrePost (cpu); return;
10826     case 3: dexLoadImmediatePrePost (cpu); return;
10827     case 6: dexLoadRegisterOffset (cpu); return;
10828 
10829     default:
10830     case 2:
10831     case 4:
10832     case 5:
10833     case 7:
10834       HALT_NYI;
10835     }
10836 }
10837 
10838 static void
10839 store_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10840 {
10841   unsigned rn = INSTR (14, 10);
10842   unsigned rd = INSTR (9, 5);
10843   unsigned rm = INSTR (4, 0);
10844   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10845 
10846   if ((rn == rd || rm == rd) && wb != NoWriteBack)
10847     HALT_UNALLOC; /* Writeback when a data register equals the base is unpredictable.  */
10848 
10849   offset <<= 2;
10850 
10851   if (wb != Post)
10852     address += offset;
10853 
10854   aarch64_set_mem_u32 (cpu, address,
10855 		       aarch64_get_reg_u32 (cpu, rm, NO_SP));
10856   aarch64_set_mem_u32 (cpu, address + 4,
10857 		       aarch64_get_reg_u32 (cpu, rn, NO_SP));
10858 
10859   if (wb == Post)
10860     address += offset;
10861 
10862   if (wb != NoWriteBack)
10863     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10864 }
10865 
10866 static void
10867 store_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10868 {
10869   unsigned rn = INSTR (14, 10);
10870   unsigned rd = INSTR (9, 5);
10871   unsigned rm = INSTR (4, 0);
10872   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10873 
10874   if ((rn == rd || rm == rd) && wb != NoWriteBack)
10875     HALT_UNALLOC; /* Writeback when a data register equals the base is unpredictable.  */
10876 
10877   offset <<= 3;
10878 
10879   if (wb != Post)
10880     address += offset;
10881 
10882   aarch64_set_mem_u64 (cpu, address,
10883 		       aarch64_get_reg_u64 (cpu, rm, NO_SP));
10884   aarch64_set_mem_u64 (cpu, address + 8,
10885 		       aarch64_get_reg_u64 (cpu, rn, NO_SP));
10886 
10887   if (wb == Post)
10888     address += offset;
10889 
10890   if (wb != NoWriteBack)
10891     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10892 }
10893 
10894 static void
10895 load_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10896 {
10897   unsigned rn = INSTR (14, 10);
10898   unsigned rd = INSTR (9, 5);
10899   unsigned rm = INSTR (4, 0);
10900   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10901 
10902   /* A load pair naming the same target register twice is
10902      unpredictable; treat it as unalloc so we never do it.  */
10903   if (rn == rm)
10904     HALT_UNALLOC;
10905 
10906   offset <<= 2;
10907 
10908   if (wb != Post)
10909     address += offset;
10910 
10911   aarch64_set_reg_u64 (cpu, rm, SP_OK, aarch64_get_mem_u32 (cpu, address));
10912   aarch64_set_reg_u64 (cpu, rn, SP_OK, aarch64_get_mem_u32 (cpu, address + 4));
10913 
10914   if (wb == Post)
10915     address += offset;
10916 
10917   if (wb != NoWriteBack)
10918     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10919 }
10920 
10921 static void
10922 load_pair_s32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10923 {
10924   unsigned rn = INSTR (14, 10);
10925   unsigned rd = INSTR (9, 5);
10926   unsigned rm = INSTR (4, 0);
10927   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10928 
10929   /* A load pair naming the same target register twice is
10929      unpredictable; treat it as unalloc so we never do it.  */
10930   if (rn == rm)
10931     HALT_UNALLOC;
10932 
10933   offset <<= 2;
10934 
10935   if (wb != Post)
10936     address += offset;
10937 
10938   aarch64_set_reg_s64 (cpu, rm, SP_OK, aarch64_get_mem_s32 (cpu, address));
10939   aarch64_set_reg_s64 (cpu, rn, SP_OK, aarch64_get_mem_s32 (cpu, address + 4));
10940 
10941   if (wb == Post)
10942     address += offset;
10943 
10944   if (wb != NoWriteBack)
10945     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10946 }
10947 
10948 static void
10949 load_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10950 {
10951   unsigned rn = INSTR (14, 10);
10952   unsigned rd = INSTR (9, 5);
10953   unsigned rm = INSTR (4, 0);
10954   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10955 
10956   /* A load pair naming the same target register twice is
10956      unpredictable; treat it as unalloc so we never do it.  */
10957   if (rn == rm)
10958     HALT_UNALLOC;
10959 
10960   offset <<= 3;
10961 
10962   if (wb != Post)
10963     address += offset;
10964 
10965   aarch64_set_reg_u64 (cpu, rm, SP_OK, aarch64_get_mem_u64 (cpu, address));
10966   aarch64_set_reg_u64 (cpu, rn, SP_OK, aarch64_get_mem_u64 (cpu, address + 8));
10967 
10968   if (wb == Post)
10969     address += offset;
10970 
10971   if (wb != NoWriteBack)
10972     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10973 }
10974 
10975 static void
10976 dex_load_store_pair_gr (sim_cpu *cpu)
10977 {
10978   /* instr[31,30] = size (10=> 64-bit, 01=> signed 32-bit, 00=> 32-bit)
10979      instr[29,25] = instruction encoding: 101_0
10980      instr[26]    = V : 1 if fp 0 if gp
10981      instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
10982      instr[22]    = load/store (1=> load)
10983      instr[21,15] = signed, scaled, offset
10984      instr[14,10] = Rn
10985      instr[ 9, 5] = Rd
10986      instr[ 4, 0] = Rm.  */
10987 
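  /* The simm7 offset is scaled by the access size in the helpers
     (offset <<= 2 for 32 bit pairs, <<= 3 for 64 bit pairs).
     Illustrative example (ours, not from the source):
     stp x1, x2, [sp, #16] encodes simm7 = 2, and store_pair_u64
     shifts it left by 3 to recover the byte offset 16.  */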
10988   uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22));
10989   int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
10990 
10991   switch (dispatch)
10992     {
10993     case 2: store_pair_u32 (cpu, offset, Post); return;
10994     case 3: load_pair_u32  (cpu, offset, Post); return;
10995     case 4: store_pair_u32 (cpu, offset, NoWriteBack); return;
10996     case 5: load_pair_u32  (cpu, offset, NoWriteBack); return;
10997     case 6: store_pair_u32 (cpu, offset, Pre); return;
10998     case 7: load_pair_u32  (cpu, offset, Pre); return;
10999 
11000     case 11: load_pair_s32  (cpu, offset, Post); return;
11001     case 13: load_pair_s32  (cpu, offset, NoWriteBack); return;
11002     case 15: load_pair_s32  (cpu, offset, Pre); return;
11003 
11004     case 18: store_pair_u64 (cpu, offset, Post); return;
11005     case 19: load_pair_u64  (cpu, offset, Post); return;
11006     case 20: store_pair_u64 (cpu, offset, NoWriteBack); return;
11007     case 21: load_pair_u64  (cpu, offset, NoWriteBack); return;
11008     case 22: store_pair_u64 (cpu, offset, Pre); return;
11009     case 23: load_pair_u64  (cpu, offset, Pre); return;
11010 
11011     default:
11012       HALT_UNALLOC;
11013     }
11014 }
11015 
11016 static void
11017 store_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
11018 {
11019   unsigned rn = INSTR (14, 10);
11020   unsigned rd = INSTR (9, 5);
11021   unsigned rm = INSTR (4, 0);
11022   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11023 
11024   offset <<= 2;
11025 
11026   if (wb != Post)
11027     address += offset;
11028 
11029   aarch64_set_mem_u32 (cpu, address,     aarch64_get_vec_u32 (cpu, rm, 0));
11030   aarch64_set_mem_u32 (cpu, address + 4, aarch64_get_vec_u32 (cpu, rn, 0));
11031 
11032   if (wb == Post)
11033     address += offset;
11034 
11035   if (wb != NoWriteBack)
11036     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11037 }
11038 
11039 static void
11040 store_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11041 {
11042   unsigned rn = INSTR (14, 10);
11043   unsigned rd = INSTR (9, 5);
11044   unsigned rm = INSTR (4, 0);
11045   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11046 
11047   offset <<= 3;
11048 
11049   if (wb != Post)
11050     address += offset;
11051 
11052   aarch64_set_mem_u64 (cpu, address,     aarch64_get_vec_u64 (cpu, rm, 0));
11053   aarch64_set_mem_u64 (cpu, address + 8, aarch64_get_vec_u64 (cpu, rn, 0));
11054 
11055   if (wb == Post)
11056     address += offset;
11057 
11058   if (wb != NoWriteBack)
11059     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11060 }
11061 
11062 static void
11063 store_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11064 {
11065   FRegister a;
11066   unsigned rn = INSTR (14, 10);
11067   unsigned rd = INSTR (9, 5);
11068   unsigned rm = INSTR (4, 0);
11069   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11070 
11071   offset <<= 4;
11072 
11073   if (wb != Post)
11074     address += offset;
11075 
11076   aarch64_get_FP_long_double (cpu, rm, & a);
11077   aarch64_set_mem_long_double (cpu, address, a);
11078   aarch64_get_FP_long_double (cpu, rn, & a);
11079   aarch64_set_mem_long_double (cpu, address + 16, a);
11080 
11081   if (wb == Post)
11082     address += offset;
11083 
11084   if (wb != NoWriteBack)
11085     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11086 }
11087 
11088 static void
11089 load_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
11090 {
11091   unsigned rn = INSTR (14, 10);
11092   unsigned rd = INSTR (9, 5);
11093   unsigned rm = INSTR (4, 0);
11094   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11095 
11096   if (rm == rn)
11097     HALT_UNALLOC;
11098 
11099   offset <<= 2;
11100 
11101   if (wb != Post)
11102     address += offset;
11103 
11104   aarch64_set_vec_u32 (cpu, rm, 0, aarch64_get_mem_u32 (cpu, address));
11105   aarch64_set_vec_u32 (cpu, rn, 0, aarch64_get_mem_u32 (cpu, address + 4));
11106 
11107   if (wb == Post)
11108     address += offset;
11109 
11110   if (wb != NoWriteBack)
11111     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11112 }
11113 
11114 static void
11115 load_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11116 {
11117   unsigned rn = INSTR (14, 10);
11118   unsigned rd = INSTR (9, 5);
11119   unsigned rm = INSTR (4, 0);
11120   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11121 
11122   if (rm == rn)
11123     HALT_UNALLOC;
11124 
11125   offset <<= 3;
11126 
11127   if (wb != Post)
11128     address += offset;
11129 
11130   aarch64_set_vec_u64 (cpu, rm, 0, aarch64_get_mem_u64 (cpu, address));
11131   aarch64_set_vec_u64 (cpu, rn, 0, aarch64_get_mem_u64 (cpu, address + 8));
11132 
11133   if (wb == Post)
11134     address += offset;
11135 
11136   if (wb != NoWriteBack)
11137     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11138 }
11139 
11140 static void
11141 load_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11142 {
11143   FRegister a;
11144   unsigned rn = INSTR (14, 10);
11145   unsigned rd = INSTR (9, 5);
11146   unsigned rm = INSTR (4, 0);
11147   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11148 
11149   if (rm == rn)
11150     HALT_UNALLOC;
11151 
11152   offset <<= 4;
11153 
11154   if (wb != Post)
11155     address += offset;
11156 
11157   aarch64_get_mem_long_double (cpu, address, & a);
11158   aarch64_set_FP_long_double (cpu, rm, a);
11159   aarch64_get_mem_long_double (cpu, address + 16, & a);
11160   aarch64_set_FP_long_double (cpu, rn, a);
11161 
11162   if (wb == Post)
11163     address += offset;
11164 
11165   if (wb != NoWriteBack)
11166     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11167 }
11168 
11169 static void
11170 dex_load_store_pair_fp (sim_cpu *cpu)
11171 {
11172   /* instr[31,30] = size (10=> 128-bit, 01=> 64-bit, 00=> 32-bit)
11173      instr[29,25] = instruction encoding
11174      instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
11175      instr[22]    = load/store (1=> load)
11176      instr[21,15] = signed, scaled, offset
11177      instr[14,10] = Rn
11178      instr[ 9, 5] = Rd
11179      instr[ 4, 0] = Rm  */
11180 
11181   uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22));
11182   int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
11183 
11184   switch (dispatch)
11185     {
11186     case 2: store_pair_float (cpu, offset, Post); return;
11187     case 3: load_pair_float  (cpu, offset, Post); return;
11188     case 4: store_pair_float (cpu, offset, NoWriteBack); return;
11189     case 5: load_pair_float  (cpu, offset, NoWriteBack); return;
11190     case 6: store_pair_float (cpu, offset, Pre); return;
11191     case 7: load_pair_float  (cpu, offset, Pre); return;
11192 
11193     case 10: store_pair_double (cpu, offset, Post); return;
11194     case 11: load_pair_double  (cpu, offset, Post); return;
11195     case 12: store_pair_double (cpu, offset, NoWriteBack); return;
11196     case 13: load_pair_double  (cpu, offset, NoWriteBack); return;
11197     case 14: store_pair_double (cpu, offset, Pre); return;
11198     case 15: load_pair_double  (cpu, offset, Pre); return;
11199 
11200     case 18: store_pair_long_double (cpu, offset, Post); return;
11201     case 19: load_pair_long_double  (cpu, offset, Post); return;
11202     case 20: store_pair_long_double (cpu, offset, NoWriteBack); return;
11203     case 21: load_pair_long_double  (cpu, offset, NoWriteBack); return;
11204     case 22: store_pair_long_double (cpu, offset, Pre); return;
11205     case 23: load_pair_long_double  (cpu, offset, Pre); return;
11206 
11207     default:
11208       HALT_UNALLOC;
11209     }
11210 }
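
/* For example, a pre-indexed LDP of two double-precision registers has
   size == 01 and instr[24,22] == 111, so the dispatch value above is
   (1 << 3) | 7 == 15 and the instruction is handled by
   load_pair_double (..., Pre).  */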
11211 
static inline unsigned
vec_reg (unsigned v, unsigned o)
{
  /* The register index wraps modulo the 32 vector registers.  */
  return (v + o) & 0x1F;
}
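
/* For example, an LD4 whose first register is v30 wraps around the
   register file: vec_reg (30, 1) is v31, while vec_reg (30, 2) and
   vec_reg (30, 3) wrap to v0 and v1.  */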
11217 
11218 /* Load multiple N-element structures to N consecutive registers.  */
11219 static void
11220 vec_load (sim_cpu *cpu, uint64_t address, unsigned N)
11221 {
11222   int      all  = INSTR (30, 30);
11223   unsigned size = INSTR (11, 10);
11224   unsigned vd   = INSTR (4, 0);
11225   unsigned i;
11226 
11227   switch (size)
11228     {
11229     case 0: /* 8-bit operations.  */
11230       if (all)
11231 	for (i = 0; i < (16 * N); i++)
11232 	  aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15,
11233 			      aarch64_get_mem_u8 (cpu, address + i));
11234       else
11235 	for (i = 0; i < (8 * N); i++)
11236 	  aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7,
11237 			      aarch64_get_mem_u8 (cpu, address + i));
11238       return;
11239 
11240     case 1: /* 16-bit operations.  */
11241       if (all)
11242 	for (i = 0; i < (8 * N); i++)
11243 	  aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7,
11244 			       aarch64_get_mem_u16 (cpu, address + i * 2));
11245       else
11246 	for (i = 0; i < (4 * N); i++)
11247 	  aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3,
11248 			       aarch64_get_mem_u16 (cpu, address + i * 2));
11249       return;
11250 
11251     case 2: /* 32-bit operations.  */
11252       if (all)
11253 	for (i = 0; i < (4 * N); i++)
11254 	  aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3,
11255 			       aarch64_get_mem_u32 (cpu, address + i * 4));
11256       else
11257 	for (i = 0; i < (2 * N); i++)
11258 	  aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1,
11259 			       aarch64_get_mem_u32 (cpu, address + i * 4));
11260       return;
11261 
11262     case 3: /* 64-bit operations.  */
11263       if (all)
11264 	for (i = 0; i < (2 * N); i++)
11265 	  aarch64_set_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1,
11266 			       aarch64_get_mem_u64 (cpu, address + i * 8));
11267       else
11268 	for (i = 0; i < N; i++)
11269 	  aarch64_set_vec_u64 (cpu, vec_reg (vd, i), 0,
11270 			       aarch64_get_mem_u64 (cpu, address + i * 8));
11271       return;
11272     }
11273 }
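
/* Note that vec_load copies consecutive memory elements into
   consecutive register lanes: an LD2 of 16-bit elements, for example,
   fills v<d>[0..7] and then v<d+1>[0..7] in memory order, whereas an
   architectural LD2 would de-interleave even-numbered elements into
   v<d> and odd-numbered elements into v<d+1>.  */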
11274 
/* LD4: load multiple 4-element structures to four consecutive registers.  */
11276 static void
11277 LD4 (sim_cpu *cpu, uint64_t address)
11278 {
11279   vec_load (cpu, address, 4);
11280 }
11281 
11282 /* LD3: load multiple 3-element structures to three consecutive registers.  */
11283 static void
11284 LD3 (sim_cpu *cpu, uint64_t address)
11285 {
11286   vec_load (cpu, address, 3);
11287 }
11288 
11289 /* LD2: load multiple 2-element structures to two consecutive registers.  */
11290 static void
11291 LD2 (sim_cpu *cpu, uint64_t address)
11292 {
11293   vec_load (cpu, address, 2);
11294 }
11295 
11296 /* Load multiple 1-element structures into one register.  */
11297 static void
11298 LD1_1 (sim_cpu *cpu, uint64_t address)
11299 {
11300   int      all  = INSTR (30, 30);
11301   unsigned size = INSTR (11, 10);
11302   unsigned vd   = INSTR (4, 0);
11303   unsigned i;
11304 
11305   switch (size)
11306     {
11307     case 0:
11308       /* LD1 {Vd.16b}, addr, #16 */
11309       /* LD1 {Vd.8b}, addr, #8 */
11310       for (i = 0; i < (all ? 16 : 8); i++)
11311 	aarch64_set_vec_u8 (cpu, vd, i,
11312 			    aarch64_get_mem_u8 (cpu, address + i));
11313       return;
11314 
11315     case 1:
11316       /* LD1 {Vd.8h}, addr, #16 */
11317       /* LD1 {Vd.4h}, addr, #8 */
11318       for (i = 0; i < (all ? 8 : 4); i++)
11319 	aarch64_set_vec_u16 (cpu, vd, i,
11320 			     aarch64_get_mem_u16 (cpu, address + i * 2));
11321       return;
11322 
11323     case 2:
11324       /* LD1 {Vd.4s}, addr, #16 */
11325       /* LD1 {Vd.2s}, addr, #8 */
11326       for (i = 0; i < (all ? 4 : 2); i++)
11327 	aarch64_set_vec_u32 (cpu, vd, i,
11328 			     aarch64_get_mem_u32 (cpu, address + i * 4));
11329       return;
11330 
11331     case 3:
11332       /* LD1 {Vd.2d}, addr, #16 */
11333       /* LD1 {Vd.1d}, addr, #8 */
11334       for (i = 0; i < (all ? 2 : 1); i++)
11335 	aarch64_set_vec_u64 (cpu, vd, i,
11336 			     aarch64_get_mem_u64 (cpu, address + i * 8));
11337       return;
11338     }
11339 }
11340 
11341 /* Load multiple 1-element structures into two registers.  */
11342 static void
11343 LD1_2 (sim_cpu *cpu, uint64_t address)
11344 {
  /* FIXME: This is identical to the LD2 version because vec_load
     performs no de-interleaving (see the note following vec_load).
     Architecturally LD1 with two registers loads elements
     consecutively while LD2 de-interleaves them, so the two should
     differ.  */
11348   vec_load (cpu, address, 2);
11349 }
11350 
11351 /* Load multiple 1-element structures into three registers.  */
11352 static void
11353 LD1_3 (sim_cpu *cpu, uint64_t address)
11354 {
  /* FIXME: This is identical to the LD3 version because vec_load
     performs no de-interleaving (see the note following vec_load).
     Architecturally LD1 with three registers loads elements
     consecutively while LD3 de-interleaves them, so the two should
     differ.  */
11358   vec_load (cpu, address, 3);
11359 }
11360 
11361 /* Load multiple 1-element structures into four registers.  */
11362 static void
11363 LD1_4 (sim_cpu *cpu, uint64_t address)
11364 {
  /* FIXME: This is identical to the LD4 version because vec_load
     performs no de-interleaving (see the note following vec_load).
     Architecturally LD1 with four registers loads elements
     consecutively while LD4 de-interleaves them, so the two should
     differ.  */
11368   vec_load (cpu, address, 4);
11369 }
11370 
11371 /* Store multiple N-element structures to N consecutive registers.  */
11372 static void
11373 vec_store (sim_cpu *cpu, uint64_t address, unsigned N)
11374 {
11375   int      all  = INSTR (30, 30);
11376   unsigned size = INSTR (11, 10);
11377   unsigned vd   = INSTR (4, 0);
11378   unsigned i;
11379 
11380   switch (size)
11381     {
11382     case 0: /* 8-bit operations.  */
11383       if (all)
11384 	for (i = 0; i < (16 * N); i++)
11385 	  aarch64_set_mem_u8
11386 	    (cpu, address + i,
11387 	     aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15));
11388       else
11389 	for (i = 0; i < (8 * N); i++)
11390 	  aarch64_set_mem_u8
11391 	    (cpu, address + i,
11392 	     aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7));
11393       return;
11394 
11395     case 1: /* 16-bit operations.  */
11396       if (all)
11397 	for (i = 0; i < (8 * N); i++)
11398 	  aarch64_set_mem_u16
11399 	    (cpu, address + i * 2,
11400 	     aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7));
11401       else
11402 	for (i = 0; i < (4 * N); i++)
11403 	  aarch64_set_mem_u16
11404 	    (cpu, address + i * 2,
11405 	     aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3));
11406       return;
11407 
11408     case 2: /* 32-bit operations.  */
11409       if (all)
11410 	for (i = 0; i < (4 * N); i++)
11411 	  aarch64_set_mem_u32
11412 	    (cpu, address + i * 4,
11413 	     aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3));
11414       else
11415 	for (i = 0; i < (2 * N); i++)
11416 	  aarch64_set_mem_u32
11417 	    (cpu, address + i * 4,
11418 	     aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1));
11419       return;
11420 
11421     case 3: /* 64-bit operations.  */
11422       if (all)
11423 	for (i = 0; i < (2 * N); i++)
11424 	  aarch64_set_mem_u64
11425 	    (cpu, address + i * 8,
11426 	     aarch64_get_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1));
11427       else
11428 	for (i = 0; i < N; i++)
11429 	  aarch64_set_mem_u64
11430 	    (cpu, address + i * 8,
11431 	     aarch64_get_vec_u64 (cpu, vec_reg (vd, i), 0));
11432       return;
11433     }
11434 }
11435 
/* Store multiple 4-element structures to four consecutive registers.  */
11437 static void
11438 ST4 (sim_cpu *cpu, uint64_t address)
11439 {
11440   vec_store (cpu, address, 4);
11441 }
11442 
11443 /* Store multiple 3-element structures to three consecutive registers.  */
11444 static void
11445 ST3 (sim_cpu *cpu, uint64_t address)
11446 {
11447   vec_store (cpu, address, 3);
11448 }
11449 
11450 /* Store multiple 2-element structures to two consecutive registers.  */
11451 static void
11452 ST2 (sim_cpu *cpu, uint64_t address)
11453 {
11454   vec_store (cpu, address, 2);
11455 }
11456 
11457 /* Store multiple 1-element structures into one register.  */
11458 static void
11459 ST1_1 (sim_cpu *cpu, uint64_t address)
11460 {
11461   int      all  = INSTR (30, 30);
11462   unsigned size = INSTR (11, 10);
11463   unsigned vd   = INSTR (4, 0);
11464   unsigned i;
11465 
11466   switch (size)
11467     {
11468     case 0:
11469       for (i = 0; i < (all ? 16 : 8); i++)
11470 	aarch64_set_mem_u8 (cpu, address + i,
11471 			    aarch64_get_vec_u8 (cpu, vd, i));
11472       return;
11473 
11474     case 1:
11475       for (i = 0; i < (all ? 8 : 4); i++)
11476 	aarch64_set_mem_u16 (cpu, address + i * 2,
11477 			     aarch64_get_vec_u16 (cpu, vd, i));
11478       return;
11479 
11480     case 2:
11481       for (i = 0; i < (all ? 4 : 2); i++)
11482 	aarch64_set_mem_u32 (cpu, address + i * 4,
11483 			     aarch64_get_vec_u32 (cpu, vd, i));
11484       return;
11485 
11486     case 3:
11487       for (i = 0; i < (all ? 2 : 1); i++)
11488 	aarch64_set_mem_u64 (cpu, address + i * 8,
11489 			     aarch64_get_vec_u64 (cpu, vd, i));
11490       return;
11491     }
11492 }
11493 
11494 /* Store multiple 1-element structures into two registers.  */
11495 static void
11496 ST1_2 (sim_cpu *cpu, uint64_t address)
11497 {
  /* FIXME: This is identical to the ST2 version because vec_store
     performs no interleaving.  Architecturally ST1 with two registers
     stores elements consecutively while ST2 interleaves them, so the
     two should differ.  */
11501   vec_store (cpu, address, 2);
11502 }
11503 
11504 /* Store multiple 1-element structures into three registers.  */
11505 static void
11506 ST1_3 (sim_cpu *cpu, uint64_t address)
11507 {
  /* FIXME: This is identical to the ST3 version because vec_store
     performs no interleaving.  Architecturally ST1 with three
     registers stores elements consecutively while ST3 interleaves
     them, so the two should differ.  */
11511   vec_store (cpu, address, 3);
11512 }
11513 
11514 /* Store multiple 1-element structures into four registers.  */
11515 static void
11516 ST1_4 (sim_cpu *cpu, uint64_t address)
11517 {
  /* FIXME: This is identical to the ST4 version because vec_store
     performs no interleaving.  Architecturally ST1 with four
     registers stores elements consecutively while ST4 interleaves
     them, so the two should differ.  */
11521   vec_store (cpu, address, 4);
11522 }
11523 
11524 static void
11525 do_vec_LDnR (sim_cpu *cpu, uint64_t address)
11526 {
11527   /* instr[31]    = 0
11528      instr[30]    = element selector 0=>half, 1=>all elements
11529      instr[29,24] = 00 1101
11530      instr[23]    = 0=>simple, 1=>post
11531      instr[22]    = 1
11532      instr[21]    = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1)
11533      instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11534                       11111 (immediate post inc)
11535      instr[15,14] = 11
11536      instr[13]    = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1)
11537      instr[12]    = 0
11538      instr[11,10] = element size 00=> byte(b), 01=> half(h),
11539                                  10=> word(s), 11=> double(d)
11540      instr[9,5]   = address
11541      instr[4,0]   = Vd  */
11542 
11543   unsigned full = INSTR (30, 30);
11544   unsigned vd = INSTR (4, 0);
11545   unsigned size = INSTR (11, 10);
11546   int i;
11547 
11548   NYI_assert (29, 24, 0x0D);
11549   NYI_assert (22, 22, 1);
11550   NYI_assert (15, 14, 3);
11551   NYI_assert (12, 12, 0);
11552 
11553   switch ((INSTR (13, 13) << 1) | INSTR (21, 21))
11554     {
11555     case 0: /* LD1R.  */
11556       switch (size)
11557 	{
11558 	case 0:
11559 	  {
11560 	    uint8_t val = aarch64_get_mem_u8 (cpu, address);
11561 	    for (i = 0; i < (full ? 16 : 8); i++)
11562 	      aarch64_set_vec_u8 (cpu, vd, i, val);
11563 	    break;
11564 	  }
11565 
11566 	case 1:
11567 	  {
11568 	    uint16_t val = aarch64_get_mem_u16 (cpu, address);
11569 	    for (i = 0; i < (full ? 8 : 4); i++)
11570 	      aarch64_set_vec_u16 (cpu, vd, i, val);
11571 	    break;
11572 	  }
11573 
11574 	case 2:
11575 	  {
11576 	    uint32_t val = aarch64_get_mem_u32 (cpu, address);
11577 	    for (i = 0; i < (full ? 4 : 2); i++)
11578 	      aarch64_set_vec_u32 (cpu, vd, i, val);
11579 	    break;
11580 	  }
11581 
11582 	case 3:
11583 	  {
11584 	    uint64_t val = aarch64_get_mem_u64 (cpu, address);
11585 	    for (i = 0; i < (full ? 2 : 1); i++)
11586 	      aarch64_set_vec_u64 (cpu, vd, i, val);
11587 	    break;
11588 	  }
11589 
11590 	default:
11591 	  HALT_UNALLOC;
11592 	}
11593       break;
11594 
11595     case 1: /* LD2R.  */
11596       switch (size)
11597 	{
11598 	case 0:
11599 	  {
11600 	    uint8_t val1 = aarch64_get_mem_u8 (cpu, address);
11601 	    uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1);
11602 
11603 	    for (i = 0; i < (full ? 16 : 8); i++)
11604 	      {
		aarch64_set_vec_u8 (cpu, vd, i, val1);
		aarch64_set_vec_u8 (cpu, vd + 1, i, val2);
11607 	      }
11608 	    break;
11609 	  }
11610 
11611 	case 1:
11612 	  {
11613 	    uint16_t val1 = aarch64_get_mem_u16 (cpu, address);
11614 	    uint16_t val2 = aarch64_get_mem_u16 (cpu, address + 2);
11615 
11616 	    for (i = 0; i < (full ? 8 : 4); i++)
11617 	      {
		aarch64_set_vec_u16 (cpu, vd, i, val1);
		aarch64_set_vec_u16 (cpu, vd + 1, i, val2);
11620 	      }
11621 	    break;
11622 	  }
11623 
11624 	case 2:
11625 	  {
11626 	    uint32_t val1 = aarch64_get_mem_u32 (cpu, address);
11627 	    uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4);
11628 
11629 	    for (i = 0; i < (full ? 4 : 2); i++)
11630 	      {
		aarch64_set_vec_u32 (cpu, vd, i, val1);
		aarch64_set_vec_u32 (cpu, vd + 1, i, val2);
11633 	      }
11634 	    break;
11635 	  }
11636 
11637 	case 3:
11638 	  {
11639 	    uint64_t val1 = aarch64_get_mem_u64 (cpu, address);
11640 	    uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8);
11641 
11642 	    for (i = 0; i < (full ? 2 : 1); i++)
11643 	      {
		aarch64_set_vec_u64 (cpu, vd, i, val1);
		aarch64_set_vec_u64 (cpu, vd + 1, i, val2);
11646 	      }
11647 	    break;
11648 	  }
11649 
11650 	default:
11651 	  HALT_UNALLOC;
11652 	}
11653       break;
11654 
11655     case 2: /* LD3R.  */
11656       switch (size)
11657 	{
11658 	case 0:
11659 	  {
11660 	    uint8_t val1 = aarch64_get_mem_u8 (cpu, address);
11661 	    uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1);
11662 	    uint8_t val3 = aarch64_get_mem_u8 (cpu, address + 2);
11663 
11664 	    for (i = 0; i < (full ? 16 : 8); i++)
11665 	      {
		aarch64_set_vec_u8 (cpu, vd, i, val1);
		aarch64_set_vec_u8 (cpu, vd + 1, i, val2);
		aarch64_set_vec_u8 (cpu, vd + 2, i, val3);
11669 	      }
11670 	  }
11671 	  break;
11672 
11673 	case 1:
11674 	  {
	    uint16_t val1 = aarch64_get_mem_u16 (cpu, address);
	    uint16_t val2 = aarch64_get_mem_u16 (cpu, address + 2);
	    uint16_t val3 = aarch64_get_mem_u16 (cpu, address + 4);

	    for (i = 0; i < (full ? 8 : 4); i++)
	      {
		aarch64_set_vec_u16 (cpu, vd, i, val1);
		aarch64_set_vec_u16 (cpu, vd + 1, i, val2);
		aarch64_set_vec_u16 (cpu, vd + 2, i, val3);
11684 	      }
11685 	  }
11686 	  break;
11687 
11688 	case 2:
11689 	  {
11690 	    uint32_t val1 = aarch64_get_mem_u32 (cpu, address);
11691 	    uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4);
11692 	    uint32_t val3 = aarch64_get_mem_u32 (cpu, address + 8);
11693 
11694 	    for (i = 0; i < (full ? 4 : 2); i++)
11695 	      {
		aarch64_set_vec_u32 (cpu, vd, i, val1);
		aarch64_set_vec_u32 (cpu, vd + 1, i, val2);
		aarch64_set_vec_u32 (cpu, vd + 2, i, val3);
11699 	      }
11700 	  }
11701 	  break;
11702 
11703 	case 3:
11704 	  {
11705 	    uint64_t val1 = aarch64_get_mem_u64 (cpu, address);
11706 	    uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8);
11707 	    uint64_t val3 = aarch64_get_mem_u64 (cpu, address + 16);
11708 
11709 	    for (i = 0; i < (full ? 2 : 1); i++)
11710 	      {
		aarch64_set_vec_u64 (cpu, vd, i, val1);
		aarch64_set_vec_u64 (cpu, vd + 1, i, val2);
		aarch64_set_vec_u64 (cpu, vd + 2, i, val3);
11714 	      }
11715 	  }
11716 	  break;
11717 
11718 	default:
11719 	  HALT_UNALLOC;
11720 	}
11721       break;
11722 
11723     case 3: /* LD4R.  */
11724       switch (size)
11725 	{
11726 	case 0:
11727 	  {
11728 	    uint8_t val1 = aarch64_get_mem_u8 (cpu, address);
11729 	    uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1);
11730 	    uint8_t val3 = aarch64_get_mem_u8 (cpu, address + 2);
11731 	    uint8_t val4 = aarch64_get_mem_u8 (cpu, address + 3);
11732 
11733 	    for (i = 0; i < (full ? 16 : 8); i++)
11734 	      {
		aarch64_set_vec_u8 (cpu, vd, i, val1);
		aarch64_set_vec_u8 (cpu, vd + 1, i, val2);
		aarch64_set_vec_u8 (cpu, vd + 2, i, val3);
		aarch64_set_vec_u8 (cpu, vd + 3, i, val4);
11739 	      }
11740 	  }
11741 	  break;
11742 
11743 	case 1:
11744 	  {
	    uint16_t val1 = aarch64_get_mem_u16 (cpu, address);
	    uint16_t val2 = aarch64_get_mem_u16 (cpu, address + 2);
	    uint16_t val3 = aarch64_get_mem_u16 (cpu, address + 4);
	    uint16_t val4 = aarch64_get_mem_u16 (cpu, address + 6);

	    for (i = 0; i < (full ? 8 : 4); i++)
	      {
		aarch64_set_vec_u16 (cpu, vd, i, val1);
		aarch64_set_vec_u16 (cpu, vd + 1, i, val2);
		aarch64_set_vec_u16 (cpu, vd + 2, i, val3);
		aarch64_set_vec_u16 (cpu, vd + 3, i, val4);
11756 	      }
11757 	  }
11758 	  break;
11759 
11760 	case 2:
11761 	  {
11762 	    uint32_t val1 = aarch64_get_mem_u32 (cpu, address);
11763 	    uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4);
11764 	    uint32_t val3 = aarch64_get_mem_u32 (cpu, address + 8);
11765 	    uint32_t val4 = aarch64_get_mem_u32 (cpu, address + 12);
11766 
11767 	    for (i = 0; i < (full ? 4 : 2); i++)
11768 	      {
		aarch64_set_vec_u32 (cpu, vd, i, val1);
		aarch64_set_vec_u32 (cpu, vd + 1, i, val2);
		aarch64_set_vec_u32 (cpu, vd + 2, i, val3);
		aarch64_set_vec_u32 (cpu, vd + 3, i, val4);
11773 	      }
11774 	  }
11775 	  break;
11776 
11777 	case 3:
11778 	  {
11779 	    uint64_t val1 = aarch64_get_mem_u64 (cpu, address);
11780 	    uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8);
11781 	    uint64_t val3 = aarch64_get_mem_u64 (cpu, address + 16);
11782 	    uint64_t val4 = aarch64_get_mem_u64 (cpu, address + 24);
11783 
11784 	    for (i = 0; i < (full ? 2 : 1); i++)
11785 	      {
		aarch64_set_vec_u64 (cpu, vd, i, val1);
		aarch64_set_vec_u64 (cpu, vd + 1, i, val2);
		aarch64_set_vec_u64 (cpu, vd + 2, i, val3);
		aarch64_set_vec_u64 (cpu, vd + 3, i, val4);
11790 	      }
11791 	  }
11792 	  break;
11793 
11794 	default:
11795 	  HALT_UNALLOC;
11796 	}
11797       break;
11798 
11799     default:
11800       HALT_UNALLOC;
11801     }
11802 }
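
/* For example, LD2R {v0.8h, v1.8h}, [x0] reads one pair of half-words
   from [x0] and replicates the first across all eight lanes of v0 and
   the second across all eight lanes of v1.  */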
11803 
11804 static void
11805 do_vec_load_store (sim_cpu *cpu)
11806 {
11807   /* {LD|ST}<N>   {Vd..Vd+N}, vaddr
11808 
11809      instr[31]    = 0
11810      instr[30]    = element selector 0=>half, 1=>all elements
11811      instr[29,25] = 00110
11812      instr[24]    = ?
11813      instr[23]    = 0=>simple, 1=>post
11814      instr[22]    = 0=>store, 1=>load
11815      instr[21]    = 0 (LDn) / small(0)-large(1) selector (LDnR)
11816      instr[20,16] = 00000 (simple), Vinc (reg-post-inc, no SP),
11817                     11111 (immediate post inc)
11818      instr[15,12] = elements and destinations.  eg for load:
11819                      0000=>LD4 => load multiple 4-element to
11820 		     four consecutive registers
11821                      0100=>LD3 => load multiple 3-element to
11822 		     three consecutive registers
11823                      1000=>LD2 => load multiple 2-element to
11824 		     two consecutive registers
11825                      0010=>LD1 => load multiple 1-element to
11826 		     four consecutive registers
11827                      0110=>LD1 => load multiple 1-element to
11828 		     three consecutive registers
11829                      1010=>LD1 => load multiple 1-element to
11830 		     two consecutive registers
11831                      0111=>LD1 => load multiple 1-element to
11832 		     one register
                     1100=>LD1R,LD2R
                     1110=>LD3R,LD4R
11835      instr[11,10] = element size 00=> byte(b), 01=> half(h),
11836                                  10=> word(s), 11=> double(d)
11837      instr[9,5]   = Vn, can be SP
11838      instr[4,0]   = Vd  */
11839 
11840   int post;
11841   int load;
11842   unsigned vn;
11843   uint64_t address;
11844   int type;
11845 
11846   if (INSTR (31, 31) != 0 || INSTR (29, 25) != 0x06)
11847     HALT_NYI;
11848 
11849   type = INSTR (15, 12);
  if (type != 0xE && type != 0xC && INSTR (21, 21) != 0)
11851     HALT_NYI;
11852 
11853   post = INSTR (23, 23);
11854   load = INSTR (22, 22);
11855   vn = INSTR (9, 5);
11856   address = aarch64_get_reg_u64 (cpu, vn, SP_OK);
11857 
11858   if (post)
11859     {
11860       unsigned vm = INSTR (20, 16);
11861 
11862       if (vm == R31)
11863 	{
11864 	  unsigned sizeof_operation;
11865 
11866 	  switch (type)
11867 	    {
11868 	    case 0: sizeof_operation = 32; break;
11869 	    case 4: sizeof_operation = 24; break;
11870 	    case 8: sizeof_operation = 16; break;
11871 
11872 	    case 0xC:
11873 	      sizeof_operation = INSTR (21, 21) ? 2 : 1;
11874 	      sizeof_operation <<= INSTR (11, 10);
11875 	      break;
11876 
11877 	    case 0xE:
	      sizeof_operation = INSTR (21, 21) ? 4 : 3;
11879 	      sizeof_operation <<= INSTR (11, 10);
11880 	      break;
11881 
11882 	    case 7:
11883 	      /* One register, immediate offset variant.  */
11884 	      sizeof_operation = 8;
11885 	      break;
11886 
11887 	    case 10:
11888 	      /* Two registers, immediate offset variant.  */
11889 	      sizeof_operation = 16;
11890 	      break;
11891 
11892 	    case 6:
11893 	      /* Three registers, immediate offset variant.  */
11894 	      sizeof_operation = 24;
11895 	      break;
11896 
11897 	    case 2:
11898 	      /* Four registers, immediate offset variant.  */
11899 	      sizeof_operation = 32;
11900 	      break;
11901 
11902 	    default:
11903 	      HALT_UNALLOC;
11904 	    }
11905 
	  /* Replicating loads (LDnR, types 0xC and 0xE) transfer the
	     same number of bytes whatever the Q bit; only the
	     structure forms double up for 128-bit vectors.  */
	  if (INSTR (30, 30) && type != 0xC && type != 0xE)
	    sizeof_operation *= 2;
11908 
11909 	  aarch64_set_reg_u64 (cpu, vn, SP_OK, address + sizeof_operation);
11910 	}
11911       else
11912 	aarch64_set_reg_u64 (cpu, vn, SP_OK,
11913 			     address + aarch64_get_reg_u64 (cpu, vm, NO_SP));
11914     }
11915   else
11916     {
11917       NYI_assert (20, 16, 0);
11918     }
11919 
11920   if (load)
11921     {
11922       switch (type)
11923 	{
11924 	case 0:  LD4 (cpu, address); return;
11925 	case 4:  LD3 (cpu, address); return;
11926 	case 8:  LD2 (cpu, address); return;
11927 	case 2:  LD1_4 (cpu, address); return;
11928 	case 6:  LD1_3 (cpu, address); return;
11929 	case 10: LD1_2 (cpu, address); return;
11930 	case 7:  LD1_1 (cpu, address); return;
11931 
11932 	case 0xE:
11933 	case 0xC: do_vec_LDnR (cpu, address); return;
11934 
11935 	default:
11936 	  HALT_NYI;
11937 	}
11938     }
11939 
11940   /* Stores.  */
11941   switch (type)
11942     {
11943     case 0:  ST4 (cpu, address); return;
11944     case 4:  ST3 (cpu, address); return;
11945     case 8:  ST2 (cpu, address); return;
11946     case 2:  ST1_4 (cpu, address); return;
11947     case 6:  ST1_3 (cpu, address); return;
11948     case 10: ST1_2 (cpu, address); return;
11949     case 7:  ST1_1 (cpu, address); return;
11950     default:
11951       HALT_NYI;
11952     }
11953 }
11954 
11955 static void
11956 dexLdSt (sim_cpu *cpu)
11957 {
11958   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
11959      assert  group == GROUP_LDST_0100 || group == GROUP_LDST_0110 ||
11960              group == GROUP_LDST_1100 || group == GROUP_LDST_1110
11961      bits [29,28:26] of a LS are the secondary dispatch vector.  */
11962   uint32_t group2 = dispatchLS (aarch64_get_instr (cpu));
11963 
11964   switch (group2)
11965     {
11966     case LS_EXCL_000:
11967       dexLoadExclusive (cpu); return;
11968 
11969     case LS_LIT_010:
11970     case LS_LIT_011:
11971       dexLoadLiteral (cpu); return;
11972 
11973     case LS_OTHER_110:
11974     case LS_OTHER_111:
11975       dexLoadOther (cpu); return;
11976 
11977     case LS_ADVSIMD_001:
11978       do_vec_load_store (cpu); return;
11979 
11980     case LS_PAIR_100:
11981       dex_load_store_pair_gr (cpu); return;
11982 
11983     case LS_PAIR_101:
11984       dex_load_store_pair_fp (cpu); return;
11985 
11986     default:
11987       /* Should never reach here.  */
11988       HALT_NYI;
11989     }
11990 }
11991 
11992 /* Specific decode and execute for group Data Processing Register.  */
11993 
11994 static void
11995 dexLogicalShiftedRegister (sim_cpu *cpu)
11996 {
11997   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
11998      instr[30,29] = op
     instr[28,24] = 01010
12000      instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> ROR
12001      instr[21]    = N
12002      instr[20,16] = Rm
12003      instr[15,10] = count : must be 0xxxxx for 32 bit
12004      instr[9,5]   = Rn
12005      instr[4,0]   = Rd  */
12006 
12007   uint32_t size      = INSTR (31, 31);
12008   Shift    shiftType = INSTR (23, 22);
12009   uint32_t count     = INSTR (15, 10);
12010 
  /* 32 bit operations must have count[5] == 0,
     otherwise the instruction is UNALLOC.  */
12013   if (size == 0 && uimm (count, 5, 5))
12014     HALT_UNALLOC;
12015 
12016   /* Dispatch on size:op:N.  */
12017   switch ((INSTR (31, 29) << 1) | INSTR (21, 21))
12018     {
12019     case 0: and32_shift  (cpu, shiftType, count); return;
12020     case 1: bic32_shift  (cpu, shiftType, count); return;
12021     case 2: orr32_shift  (cpu, shiftType, count); return;
12022     case 3: orn32_shift  (cpu, shiftType, count); return;
12023     case 4: eor32_shift  (cpu, shiftType, count); return;
12024     case 5: eon32_shift  (cpu, shiftType, count); return;
12025     case 6: ands32_shift (cpu, shiftType, count); return;
12026     case 7: bics32_shift (cpu, shiftType, count); return;
12027     case 8: and64_shift  (cpu, shiftType, count); return;
12028     case 9: bic64_shift  (cpu, shiftType, count); return;
12029     case 10:orr64_shift  (cpu, shiftType, count); return;
12030     case 11:orn64_shift  (cpu, shiftType, count); return;
12031     case 12:eor64_shift  (cpu, shiftType, count); return;
12032     case 13:eon64_shift  (cpu, shiftType, count); return;
12033     case 14:ands64_shift (cpu, shiftType, count); return;
12034     case 15:bics64_shift (cpu, shiftType, count); return;
12035     }
12036 }
12037 
12038 /* 32 bit conditional select.  */
12039 static void
12040 csel32 (sim_cpu *cpu, CondCode cc)
12041 {
12042   unsigned rm = INSTR (20, 16);
12043   unsigned rn = INSTR (9, 5);
12044   unsigned rd = INSTR (4, 0);
12045 
12046   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12047 		       testConditionCode (cpu, cc)
12048 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12049 		       : aarch64_get_reg_u32 (cpu, rm, NO_SP));
12050 }
12051 
12052 /* 64 bit conditional select.  */
12053 static void
12054 csel64 (sim_cpu *cpu, CondCode cc)
12055 {
12056   unsigned rm = INSTR (20, 16);
12057   unsigned rn = INSTR (9, 5);
12058   unsigned rd = INSTR (4, 0);
12059 
12060   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12061 		       testConditionCode (cpu, cc)
12062 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12063 		       : aarch64_get_reg_u64 (cpu, rm, NO_SP));
12064 }
12065 
12066 /* 32 bit conditional increment.  */
12067 static void
12068 csinc32 (sim_cpu *cpu, CondCode cc)
12069 {
12070   unsigned rm = INSTR (20, 16);
12071   unsigned rn = INSTR (9, 5);
12072   unsigned rd = INSTR (4, 0);
12073 
12074   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12075 		       testConditionCode (cpu, cc)
12076 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12077 		       : aarch64_get_reg_u32 (cpu, rm, NO_SP) + 1);
12078 }
12079 
12080 /* 64 bit conditional increment.  */
12081 static void
12082 csinc64 (sim_cpu *cpu, CondCode cc)
12083 {
12084   unsigned rm = INSTR (20, 16);
12085   unsigned rn = INSTR (9, 5);
12086   unsigned rd = INSTR (4, 0);
12087 
12088   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12089 		       testConditionCode (cpu, cc)
12090 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12091 		       : aarch64_get_reg_u64 (cpu, rm, NO_SP) + 1);
12092 }
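
/* N.B. the assembler alias CSET Rd, <cond> is encoded as
   CSINC Rd, ZR, ZR with the condition inverted, so it is handled by
   the two functions above.  */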
12093 
12094 /* 32 bit conditional invert.  */
12095 static void
12096 csinv32 (sim_cpu *cpu, CondCode cc)
12097 {
12098   unsigned rm = INSTR (20, 16);
12099   unsigned rn = INSTR (9, 5);
12100   unsigned rd = INSTR (4, 0);
12101 
12102   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12103 		       testConditionCode (cpu, cc)
12104 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12105 		       : ~ aarch64_get_reg_u32 (cpu, rm, NO_SP));
12106 }
12107 
12108 /* 64 bit conditional invert.  */
12109 static void
12110 csinv64 (sim_cpu *cpu, CondCode cc)
12111 {
12112   unsigned rm = INSTR (20, 16);
12113   unsigned rn = INSTR (9, 5);
12114   unsigned rd = INSTR (4, 0);
12115 
12116   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12117 		       testConditionCode (cpu, cc)
12118 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12119 		       : ~ aarch64_get_reg_u64 (cpu, rm, NO_SP));
12120 }
12121 
12122 /* 32 bit conditional negate.  */
12123 static void
12124 csneg32 (sim_cpu *cpu, CondCode cc)
12125 {
12126   unsigned rm = INSTR (20, 16);
12127   unsigned rn = INSTR (9, 5);
12128   unsigned rd = INSTR (4, 0);
12129 
12130   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12131 		       testConditionCode (cpu, cc)
12132 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12133 		       : - aarch64_get_reg_u32 (cpu, rm, NO_SP));
12134 }
12135 
12136 /* 64 bit conditional negate.  */
12137 static void
12138 csneg64 (sim_cpu *cpu, CondCode cc)
12139 {
12140   unsigned rm = INSTR (20, 16);
12141   unsigned rn = INSTR (9, 5);
12142   unsigned rd = INSTR (4, 0);
12143 
12144   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12145 		       testConditionCode (cpu, cc)
12146 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12147 		       : - aarch64_get_reg_u64 (cpu, rm, NO_SP));
12148 }
12149 
12150 static void
12151 dexCondSelect (sim_cpu *cpu)
12152 {
  /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30]    = op
     instr[29]    = S : 0 ==> ok, 1 ==> UNALLOC
     instr[28,21] = 11010100
     instr[20,16] = Rm
     instr[15,12] = cond
     instr[11,10] = op2
     instr[30]:instr[11,10] = op:op2 : 000 ==> CSEL, 001 ==> CSINC,
                                       100 ==> CSINV, 101 ==> CSNEG,
                                       _1_ ==> UNALLOC  */
12161 
12162   CondCode cc = INSTR (15, 12);
12163   uint32_t S = INSTR (29, 29);
12164   uint32_t op2 = INSTR (11, 10);
12165 
12166   if (S == 1)
12167     HALT_UNALLOC;
12168 
12169   if (op2 & 0x2)
12170     HALT_UNALLOC;
12171 
12172   switch ((INSTR (31, 30) << 1) | op2)
12173     {
12174     case 0: csel32  (cpu, cc); return;
12175     case 1: csinc32 (cpu, cc); return;
12176     case 2: csinv32 (cpu, cc); return;
12177     case 3: csneg32 (cpu, cc); return;
12178     case 4: csel64  (cpu, cc); return;
12179     case 5: csinc64 (cpu, cc); return;
12180     case 6: csinv64 (cpu, cc); return;
12181     case 7: csneg64 (cpu, cc); return;
12182     }
12183 }
12184 
12185 /* Some helpers for counting leading 1 or 0 bits.  */
12186 
12187 /* Counts the number of leading bits which are the same
12188    in a 32 bit value in the range 1 to 32.  */
12189 static uint32_t
12190 leading32 (uint32_t value)
12191 {
12192   int32_t mask= 0xffff0000;
12193   uint32_t count= 16; /* Counts number of bits set in mask.  */
12194   uint32_t lo = 1;    /* Lower bound for number of sign bits.  */
12195   uint32_t hi = 32;   /* Upper bound for number of sign bits.  */
12196 
12197   while (lo + 1 < hi)
12198     {
12199       int32_t test = (value & mask);
12200 
12201       if (test == 0 || test == mask)
12202 	{
12203 	  lo = count;
12204 	  count = (lo + hi) / 2;
12205 	  mask >>= (count - lo);
12206 	}
12207       else
12208 	{
12209 	  hi = count;
12210 	  count = (lo + hi) / 2;
12211 	  mask <<= hi - count;
12212 	}
12213     }
12214 
12215   if (lo != hi)
12216     {
12217       int32_t test;
12218 
12219       mask >>= 1;
12220       test = (value & mask);
12221 
12222       if (test == 0 || test == mask)
12223 	count = hi;
12224       else
12225 	count = lo;
12226     }
12227 
12228   return count;
12229 }
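
/* For reference, the binary search above computes the same result as
   the straightforward linear scan below.  The scan is illustrative
   only and is excluded from the build.  */
#if 0
static uint32_t
leading32_linear (uint32_t value)
{
  uint32_t top = value >> 31;	/* The sign (top) bit.  */
  uint32_t count = 0;
  int i;

  /* Count how many bits, scanning down from bit 31, match the top
     bit.  */
  for (i = 31; i >= 0 && ((value >> i) & 1) == top; i--)
    count++;

  return count;
}
#endif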
12230 
12231 /* Counts the number of leading bits which are the same
12232    in a 64 bit value in the range 1 to 64.  */
12233 static uint64_t
12234 leading64 (uint64_t value)
12235 {
12236   int64_t mask= 0xffffffff00000000LL;
12237   uint64_t count = 32; /* Counts number of bits set in mask.  */
12238   uint64_t lo = 1;     /* Lower bound for number of sign bits.  */
12239   uint64_t hi = 64;    /* Upper bound for number of sign bits.  */
12240 
12241   while (lo + 1 < hi)
12242     {
12243       int64_t test = (value & mask);
12244 
12245       if (test == 0 || test == mask)
12246 	{
12247 	  lo = count;
12248 	  count = (lo + hi) / 2;
12249 	  mask >>= (count - lo);
12250 	}
12251       else
12252 	{
12253 	  hi = count;
12254 	  count = (lo + hi) / 2;
12255 	  mask <<= hi - count;
12256 	}
12257     }
12258 
12259   if (lo != hi)
12260     {
12261       int64_t test;
12262 
12263       mask >>= 1;
12264       test = (value & mask);
12265 
12266       if (test == 0 || test == mask)
12267 	count = hi;
12268       else
12269 	count = lo;
12270     }
12271 
12272   return count;
12273 }
12274 
12275 /* Bit operations.  */
12276 /* N.B register args may not be SP.  */
12277 
12278 /* 32 bit count leading sign bits.  */
12279 static void
12280 cls32 (sim_cpu *cpu)
12281 {
12282   unsigned rn = INSTR (9, 5);
12283   unsigned rd = INSTR (4, 0);
12284 
12285   /* N.B. the result needs to exclude the leading bit.  */
12286   aarch64_set_reg_u64
12287     (cpu, rd, NO_SP, leading32 (aarch64_get_reg_u32 (cpu, rn, NO_SP)) - 1);
12288 }
12289 
12290 /* 64 bit count leading sign bits.  */
12291 static void
12292 cls64 (sim_cpu *cpu)
12293 {
12294   unsigned rn = INSTR (9, 5);
12295   unsigned rd = INSTR (4, 0);
12296 
12297   /* N.B. the result needs to exclude the leading bit.  */
12298   aarch64_set_reg_u64
12299     (cpu, rd, NO_SP, leading64 (aarch64_get_reg_u64 (cpu, rn, NO_SP)) - 1);
12300 }
12301 
12302 /* 32 bit count leading zero bits.  */
12303 static void
12304 clz32 (sim_cpu *cpu)
12305 {
12306   unsigned rn = INSTR (9, 5);
12307   unsigned rd = INSTR (4, 0);
12308   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12309 
12310   /* if the sign (top) bit is set then the count is 0.  */
12311   if (pick32 (value, 31, 31))
12312     aarch64_set_reg_u64 (cpu, rd, NO_SP, 0L);
12313   else
12314     aarch64_set_reg_u64 (cpu, rd, NO_SP, leading32 (value));
12315 }
12316 
12317 /* 64 bit count leading zero bits.  */
12318 static void
12319 clz64 (sim_cpu *cpu)
12320 {
12321   unsigned rn = INSTR (9, 5);
12322   unsigned rd = INSTR (4, 0);
12323   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12324 
12325   /* if the sign (top) bit is set then the count is 0.  */
12326   if (pick64 (value, 63, 63))
12327     aarch64_set_reg_u64 (cpu, rd, NO_SP, 0L);
12328   else
12329     aarch64_set_reg_u64 (cpu, rd, NO_SP, leading64 (value));
12330 }
12331 
12332 /* 32 bit reverse bits.  */
12333 static void
12334 rbit32 (sim_cpu *cpu)
12335 {
12336   unsigned rn = INSTR (9, 5);
12337   unsigned rd = INSTR (4, 0);
12338   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12339   uint32_t result = 0;
12340   int i;
12341 
12342   for (i = 0; i < 32; i++)
12343     {
12344       result <<= 1;
12345       result |= (value & 1);
12346       value >>= 1;
12347     }
12348   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12349 }
12350 
12351 /* 64 bit reverse bits.  */
12352 static void
12353 rbit64 (sim_cpu *cpu)
12354 {
12355   unsigned rn = INSTR (9, 5);
12356   unsigned rd = INSTR (4, 0);
12357   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12358   uint64_t result = 0;
12359   int i;
12360 
12361   for (i = 0; i < 64; i++)
12362     {
12363       result <<= 1;
12364       result |= (value & 1UL);
12365       value >>= 1;
12366     }
12367   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12368 }
12369 
12370 /* 32 bit reverse bytes.  */
12371 static void
12372 rev32 (sim_cpu *cpu)
12373 {
12374   unsigned rn = INSTR (9, 5);
12375   unsigned rd = INSTR (4, 0);
12376   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12377   uint32_t result = 0;
12378   int i;
12379 
12380   for (i = 0; i < 4; i++)
12381     {
12382       result <<= 8;
12383       result |= (value & 0xff);
12384       value >>= 8;
12385     }
12386   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12387 }
12388 
12389 /* 64 bit reverse bytes.  */
12390 static void
12391 rev64 (sim_cpu *cpu)
12392 {
12393   unsigned rn = INSTR (9, 5);
12394   unsigned rd = INSTR (4, 0);
12395   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12396   uint64_t result = 0;
12397   int i;
12398 
12399   for (i = 0; i < 8; i++)
12400     {
12401       result <<= 8;
12402       result |= (value & 0xffULL);
12403       value >>= 8;
12404     }
12405   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12406 }
12407 
12408 /* 32 bit reverse shorts.  */
/* N.B. this reverses the order of the bytes in each half word.  */
12410 static void
12411 revh32 (sim_cpu *cpu)
12412 {
12413   unsigned rn = INSTR (9, 5);
12414   unsigned rd = INSTR (4, 0);
12415   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12416   uint32_t result = 0;
12417   int i;
12418 
12419   for (i = 0; i < 2; i++)
12420     {
12421       result <<= 8;
12422       result |= (value & 0x00ff00ff);
12423       value >>= 8;
12424     }
12425   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12426 }
12427 
12428 /* 64 bit reverse shorts.  */
/* N.B. this reverses the order of the bytes in each half word.  */
12430 static void
12431 revh64 (sim_cpu *cpu)
12432 {
12433   unsigned rn = INSTR (9, 5);
12434   unsigned rd = INSTR (4, 0);
12435   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12436   uint64_t result = 0;
12437   int i;
12438 
12439   for (i = 0; i < 2; i++)
12440     {
12441       result <<= 8;
12442       result |= (value & 0x00ff00ff00ff00ffULL);
12443       value >>= 8;
12444     }
12445   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12446 }
12447 
12448 static void
12449 dexDataProc1Source (sim_cpu *cpu)
12450 {
  /* instr[30]    = 1
     instr[28,21] = 11010110
12453      instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
12454      instr[29]    = S : 0 ==> ok, 1 ==> UNALLOC
12455      instr[20,16] = opcode2 : 00000 ==> ok, ow ==> UNALLOC
     instr[15,10] = opcode : 000000 ==> RBIT, 000001 ==> REV16,
                             000010 ==> REV (REV32 when size == 1),
                             000011 ==> REV (64 bit only),
                             000100 ==> CLZ, 000101 ==> CLS
                             ow ==> UNALLOC
12460      instr[9,5]   = rn : may not be SP
12461      instr[4,0]   = rd : may not be SP.  */
12462 
12463   uint32_t S = INSTR (29, 29);
12464   uint32_t opcode2 = INSTR (20, 16);
12465   uint32_t opcode = INSTR (15, 10);
12466   uint32_t dispatch = ((INSTR (31, 31) << 3) | opcode);
12467 
12468   if (S == 1)
12469     HALT_UNALLOC;
12470 
12471   if (opcode2 != 0)
12472     HALT_UNALLOC;
12473 
12474   if (opcode & 0x38)
12475     HALT_UNALLOC;
12476 
12477   switch (dispatch)
12478     {
12479     case 0: rbit32 (cpu); return;
12480     case 1: revh32 (cpu); return;
12481     case 2: rev32 (cpu); return;
12482     case 4: clz32 (cpu); return;
12483     case 5: cls32 (cpu); return;
12484     case 8: rbit64 (cpu); return;
12485     case 9: revh64 (cpu); return;
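    /* FIXME: case 10 is REV32 applied to a 64 bit register; it should
       reverse the bytes within each 32 bit word, but rev32 as called
       here only transforms the low word and clears the high word.  */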
12486     case 10:rev32 (cpu); return;
12487     case 11:rev64 (cpu); return;
12488     case 12:clz64 (cpu); return;
12489     case 13:cls64 (cpu); return;
12490     default: HALT_UNALLOC;
12491     }
12492 }
12493 
12494 /* Variable shift.
12495    Shifts by count supplied in register.
12496    N.B register args may not be SP.
12497    These all use the shifted auxiliary function for
12498    simplicity and clarity.  Writing the actual shift
12499    inline would avoid a branch and so be faster but
12500    would also necessitate getting signs right.  */
12501 
12502 /* 32 bit arithmetic shift right.  */
12503 static void
12504 asrv32 (sim_cpu *cpu)
12505 {
12506   unsigned rm = INSTR (20, 16);
12507   unsigned rn = INSTR (9, 5);
12508   unsigned rd = INSTR (4, 0);
12509 
12510   aarch64_set_reg_u64
12511     (cpu, rd, NO_SP,
12512      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), ASR,
12513 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12514 }
12515 
12516 /* 64 bit arithmetic shift right.  */
12517 static void
12518 asrv64 (sim_cpu *cpu)
12519 {
12520   unsigned rm = INSTR (20, 16);
12521   unsigned rn = INSTR (9, 5);
12522   unsigned rd = INSTR (4, 0);
12523 
12524   aarch64_set_reg_u64
12525     (cpu, rd, NO_SP,
12526      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), ASR,
12527 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12528 }
12529 
12530 /* 32 bit logical shift left.  */
12531 static void
12532 lslv32 (sim_cpu *cpu)
12533 {
12534   unsigned rm = INSTR (20, 16);
12535   unsigned rn = INSTR (9, 5);
12536   unsigned rd = INSTR (4, 0);
12537 
12538   aarch64_set_reg_u64
12539     (cpu, rd, NO_SP,
12540      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), LSL,
12541 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12542 }
12543 
/* 64 bit logical shift left.  */
12545 static void
12546 lslv64 (sim_cpu *cpu)
12547 {
12548   unsigned rm = INSTR (20, 16);
12549   unsigned rn = INSTR (9, 5);
12550   unsigned rd = INSTR (4, 0);
12551 
12552   aarch64_set_reg_u64
12553     (cpu, rd, NO_SP,
12554      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), LSL,
12555 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12556 }
12557 
12558 /* 32 bit logical shift right.  */
12559 static void
12560 lsrv32 (sim_cpu *cpu)
12561 {
12562   unsigned rm = INSTR (20, 16);
12563   unsigned rn = INSTR (9, 5);
12564   unsigned rd = INSTR (4, 0);
12565 
12566   aarch64_set_reg_u64
12567     (cpu, rd, NO_SP,
12568      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), LSR,
12569 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12570 }
12571 
12572 /* 64 bit logical shift right.  */
12573 static void
12574 lsrv64 (sim_cpu *cpu)
12575 {
12576   unsigned rm = INSTR (20, 16);
12577   unsigned rn = INSTR (9, 5);
12578   unsigned rd = INSTR (4, 0);
12579 
12580   aarch64_set_reg_u64
12581     (cpu, rd, NO_SP,
12582      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), LSR,
12583 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12584 }
12585 
12586 /* 32 bit rotate right.  */
12587 static void
12588 rorv32 (sim_cpu *cpu)
12589 {
12590   unsigned rm = INSTR (20, 16);
12591   unsigned rn = INSTR (9, 5);
12592   unsigned rd = INSTR (4, 0);
12593 
12594   aarch64_set_reg_u64
12595     (cpu, rd, NO_SP,
12596      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), ROR,
12597 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12598 }
12599 
12600 /* 64 bit rotate right.  */
12601 static void
12602 rorv64 (sim_cpu *cpu)
12603 {
12604   unsigned rm = INSTR (20, 16);
12605   unsigned rn = INSTR (9, 5);
12606   unsigned rd = INSTR (4, 0);
12607 
12608   aarch64_set_reg_u64
12609     (cpu, rd, NO_SP,
12610      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), ROR,
12611 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12612 }
12613 
12614 
12615 /* divide.  */
12616 
12617 /* 32 bit signed divide.  */
12618 static void
sdiv32 (sim_cpu *cpu)
12620 {
12621   unsigned rm = INSTR (20, 16);
12622   unsigned rn = INSTR (9, 5);
12623   unsigned rd = INSTR (4, 0);
12624   /* N.B. the pseudo-code does the divide using 64 bit data.  */
  /* C division truncates towards zero, as the architecture requires.  */
12626   int64_t dividend = aarch64_get_reg_s32 (cpu, rn, NO_SP);
12627   int64_t divisor = aarch64_get_reg_s32 (cpu, rm, NO_SP);
12628 
12629   aarch64_set_reg_s64 (cpu, rd, NO_SP,
12630 		       divisor ? ((int32_t) (dividend / divisor)) : 0);
12631 }
12632 
12633 /* 64 bit signed divide.  */
12634 static void
sdiv64 (sim_cpu *cpu)
12636 {
12637   unsigned rm = INSTR (20, 16);
12638   unsigned rn = INSTR (9, 5);
12639   unsigned rd = INSTR (4, 0);
12640 
  /* C division truncates towards zero, as the architecture requires.
     The architecture also defines INT64_MIN / -1 to wrap back to
     INT64_MIN, but in C that quotient is undefined behaviour, so it
     is special-cased here.  */
  int64_t dividend = aarch64_get_reg_s64 (cpu, rn, NO_SP);
  int64_t divisor = aarch64_get_reg_s64 (cpu, rm, NO_SP);

  if (divisor == 0)
    aarch64_set_reg_s64 (cpu, rd, NO_SP, 0);
  else if (dividend == INT64_MIN && divisor == -1)
    aarch64_set_reg_s64 (cpu, rd, NO_SP, INT64_MIN);
  else
    aarch64_set_reg_s64 (cpu, rd, NO_SP, dividend / divisor);
12647 }
12648 
12649 /* 32 bit unsigned divide.  */
12650 static void
12651 udiv32 (sim_cpu *cpu)
12652 {
12653   unsigned rm = INSTR (20, 16);
12654   unsigned rn = INSTR (9, 5);
12655   unsigned rd = INSTR (4, 0);
12656 
12657   /* N.B. the pseudo-code does the divide using 64 bit data.  */
12658   uint64_t dividend = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12659   uint64_t divisor  = aarch64_get_reg_u32 (cpu, rm, NO_SP);
12660 
12661   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12662 		       divisor ? (uint32_t) (dividend / divisor) : 0);
12663 }
12664 
12665 /* 64 bit unsigned divide.  */
12666 static void
12667 udiv64 (sim_cpu *cpu)
12668 {
12669   unsigned rm = INSTR (20, 16);
12670   unsigned rn = INSTR (9, 5);
12671   unsigned rd = INSTR (4, 0);
12672 
  /* Unsigned division truncates towards zero, as required.  */
12674   uint64_t divisor = aarch64_get_reg_u64 (cpu, rm, NO_SP);
12675 
12676   aarch64_set_reg_u64
12677     (cpu, rd, NO_SP,
12678      divisor ? (aarch64_get_reg_u64 (cpu, rn, NO_SP) / divisor) : 0);
12679 }
12680 
12681 static void
12682 dexDataProc2Source (sim_cpu *cpu)
12683 {
12684   /* assert instr[30] == 0
12685      instr[28,21] == 11010110
12686      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12687      instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
     instr[15,10] = opcode : 000010 ==> UDIV, 000011 ==> SDIV,
12689                              001000 ==> LSLV, 001001 ==> LSRV
12690                              001010 ==> ASRV, 001011 ==> RORV
12691                              ow ==> UNALLOC.  */
12692 
12693   uint32_t dispatch;
12694   uint32_t S = INSTR (29, 29);
12695   uint32_t opcode = INSTR (15, 10);
12696 
12697   if (S == 1)
12698     HALT_UNALLOC;
12699 
12700   if (opcode & 0x34)
12701     HALT_UNALLOC;
12702 
12703   dispatch = (  (INSTR (31, 31) << 3)
12704 	      | (uimm (opcode, 3, 3) << 2)
12705 	      |  uimm (opcode, 1, 0));
12706   switch (dispatch)
12707     {
12708     case 2:  udiv32 (cpu); return;
    case 3:  sdiv32 (cpu); return;
12710     case 4:  lslv32 (cpu); return;
12711     case 5:  lsrv32 (cpu); return;
12712     case 6:  asrv32 (cpu); return;
12713     case 7:  rorv32 (cpu); return;
12714     case 10: udiv64 (cpu); return;
    case 11: sdiv64 (cpu); return;
12716     case 12: lslv64 (cpu); return;
12717     case 13: lsrv64 (cpu); return;
12718     case 14: asrv64 (cpu); return;
12719     case 15: rorv64 (cpu); return;
12720     default: HALT_UNALLOC;
12721     }
12722 }
12723 
12724 
12725 /* Multiply.  */
12726 
12727 /* 32 bit multiply and add.  */
12728 static void
12729 madd32 (sim_cpu *cpu)
12730 {
12731   unsigned rm = INSTR (20, 16);
12732   unsigned ra = INSTR (14, 10);
12733   unsigned rn = INSTR (9, 5);
12734   unsigned rd = INSTR (4, 0);
12735 
12736   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12737   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12738 		       aarch64_get_reg_u32 (cpu, ra, NO_SP)
12739 		       + aarch64_get_reg_u32 (cpu, rn, NO_SP)
12740 		       * aarch64_get_reg_u32 (cpu, rm, NO_SP));
12741 }
12742 
12743 /* 64 bit multiply and add.  */
12744 static void
12745 madd64 (sim_cpu *cpu)
12746 {
12747   unsigned rm = INSTR (20, 16);
12748   unsigned ra = INSTR (14, 10);
12749   unsigned rn = INSTR (9, 5);
12750   unsigned rd = INSTR (4, 0);
12751 
12752   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12753   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12754 		       aarch64_get_reg_u64 (cpu, ra, NO_SP)
12755 		       + (aarch64_get_reg_u64 (cpu, rn, NO_SP)
12756 			  * aarch64_get_reg_u64 (cpu, rm, NO_SP)));
12757 }
12758 
12759 /* 32 bit multiply and sub.  */
12760 static void
12761 msub32 (sim_cpu *cpu)
12762 {
12763   unsigned rm = INSTR (20, 16);
12764   unsigned ra = INSTR (14, 10);
12765   unsigned rn = INSTR (9, 5);
12766   unsigned rd = INSTR (4, 0);
12767 
12768   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12769   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12770 		       aarch64_get_reg_u32 (cpu, ra, NO_SP)
12771 		       - aarch64_get_reg_u32 (cpu, rn, NO_SP)
12772 		       * aarch64_get_reg_u32 (cpu, rm, NO_SP));
12773 }
12774 
12775 /* 64 bit multiply and sub.  */
12776 static void
12777 msub64 (sim_cpu *cpu)
12778 {
12779   unsigned rm = INSTR (20, 16);
12780   unsigned ra = INSTR (14, 10);
12781   unsigned rn = INSTR (9, 5);
12782   unsigned rd = INSTR (4, 0);
12783 
12784   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12785   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12786 		       aarch64_get_reg_u64 (cpu, ra, NO_SP)
12787 		       - aarch64_get_reg_u64 (cpu, rn, NO_SP)
12788 		       * aarch64_get_reg_u64 (cpu, rm, NO_SP));
12789 }
12790 
12791 /* Signed multiply add long -- source, source2 : 32 bit, source3 : 64 bit.  */
12792 static void
12793 smaddl (sim_cpu *cpu)
12794 {
12795   unsigned rm = INSTR (20, 16);
12796   unsigned ra = INSTR (14, 10);
12797   unsigned rn = INSTR (9, 5);
12798   unsigned rd = INSTR (4, 0);
12799 
12800   /* N.B. we need to multiply the signed 32 bit values in rn, rm to
12801      obtain a 64 bit product.  */
12802   aarch64_set_reg_s64
12803     (cpu, rd, NO_SP,
12804      aarch64_get_reg_s64 (cpu, ra, NO_SP)
12805      + ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
12806      * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP)));
12807 }
12808 
12809 /* Signed multiply sub long -- source, source2 : 32 bit, source3 : 64 bit.  */
12810 static void
12811 smsubl (sim_cpu *cpu)
12812 {
12813   unsigned rm = INSTR (20, 16);
12814   unsigned ra = INSTR (14, 10);
12815   unsigned rn = INSTR (9, 5);
12816   unsigned rd = INSTR (4, 0);
12817 
12818   /* N.B. we need to multiply the signed 32 bit values in rn, rm to
12819      obtain a 64 bit product.  */
12820   aarch64_set_reg_s64
12821     (cpu, rd, NO_SP,
12822      aarch64_get_reg_s64 (cpu, ra, NO_SP)
12823      - ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
12824      * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP)));
12825 }
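
/* For example, with w<n> == 0xffffffff (i.e. -1) and w<m> == 2, the
   casts above yield the sign-extended product -2
   (0xfffffffffffffffe), not the 0x1fffffffe that a plain unsigned
   64 bit multiply of the register contents would give.  */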
12826 
12827 /* Integer Multiply/Divide.  */
12828 
12829 /* First some macros and a helper function.  */
12830 /* Macros to test or access elements of 64 bit words.  */
12831 
12832 /* Mask used to access lo 32 bits of 64 bit unsigned int.  */
12833 #define LOW_WORD_MASK ((1ULL << 32) - 1)
12834 /* Return the lo 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int.  */
12835 #define lowWordToU64(_value_u64) ((_value_u64) & LOW_WORD_MASK)
12836 /* Return the hi 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int.  */
12837 #define highWordToU64(_value_u64) ((_value_u64) >> 32)
12838 
/* Offset of sign bit in 64 bit signed integer.  */
#define SIGN_SHIFT_U64 63
/* The sign bit itself -- also identifies the minimum negative int value.  */
#define SIGN_BIT_U64 (1ULL << SIGN_SHIFT_U64)
/* Return true if a 64 bit signed int presented as an unsigned int is the
   most negative value.  */
#define isMinimumU64(_value_u64) ((_value_u64) == SIGN_BIT_U64)
/* Return true (non-zero) if a 64 bit signed int presented as an unsigned
   int has its sign bit set.  */
#define isSignSetU64(_value_u64) ((_value_u64) & SIGN_BIT_U64)
/* Return 1L or -1L according to whether a 64 bit signed int presented as
   an unsigned int has its sign bit clear or set.  */
#define signOfU64(_value_u64) (1L + (((_value_u64) >> SIGN_SHIFT_U64) * -2L))
12852 /* Clear the sign bit of a 64 bit signed int presented as an unsigned int.  */
12853 #define clearSignU64(_value_u64) ((_value_u64) &= ~SIGN_BIT_U64)
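
/* For example, highWordToU64 (0x0123456789abcdefULL) == 0x01234567
   and lowWordToU64 (0x0123456789abcdefULL) == 0x89abcdef.  */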
12854 
/* Multiply two 64 bit ints and return
   the hi 64 bits of the 128 bit product.  */
12857 
12858 static uint64_t
12859 mul64hi (uint64_t value1, uint64_t value2)
12860 {
12861   uint64_t resultmid1;
12862   uint64_t result;
12863   uint64_t value1_lo = lowWordToU64 (value1);
  uint64_t value1_hi = highWordToU64 (value1);
12865   uint64_t value2_lo = lowWordToU64 (value2);
12866   uint64_t value2_hi = highWordToU64 (value2);
12867 
12868   /* Cross-multiply and collect results.  */
12869   uint64_t xproductlo = value1_lo * value2_lo;
12870   uint64_t xproductmid1 = value1_lo * value2_hi;
12871   uint64_t xproductmid2 = value1_hi * value2_lo;
12872   uint64_t xproducthi = value1_hi * value2_hi;
12873   uint64_t carry = 0;
12874   /* Start accumulating 64 bit results.  */
12875   /* Drop bottom half of lowest cross-product.  */
12876   uint64_t resultmid = xproductlo >> 32;
12877   /* Add in middle products.  */
12878   resultmid = resultmid + xproductmid1;
12879 
12880   /* Check for overflow.  */
12881   if (resultmid < xproductmid1)
12882     /* Carry over 1 into top cross-product.  */
12883     carry++;
12884 
12885   resultmid1  = resultmid + xproductmid2;
12886 
12887   /* Check for overflow.  */
12888   if (resultmid1 < xproductmid2)
12889     /* Carry over 1 into top cross-product.  */
12890     carry++;
12891 
12892   /* Drop lowest 32 bits of middle cross-product.  */
12893   result = resultmid1 >> 32;
12894 
  /* Add in the top cross-product and any carry.  */
12896   result += xproducthi + carry;
12897 
12898   return result;
12899 }
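
/* Where the compiler provides a 128 bit integer type (as GCC and
   clang do via __int128), mul64hi can be cross-checked against the
   full product.  The reference version below is illustrative only
   and is excluded from the build.  */
#if 0
static uint64_t
mul64hi_ref (uint64_t value1, uint64_t value2)
{
  /* Form the full 128 bit product and keep only the top 64 bits.  */
  return (uint64_t) (((unsigned __int128) value1 * value2) >> 64);
}
#endif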
12900 
12901 /* Signed multiply high, source, source2 :
12902    64 bit, dest <-- high 64-bit of result.  */
12903 static void
12904 smulh (sim_cpu *cpu)
12905 {
12906   uint64_t uresult;
12907   int64_t  result;
12908   unsigned rm = INSTR (20, 16);
12909   unsigned rn = INSTR (9, 5);
12910   unsigned rd = INSTR (4, 0);
12911   GReg     ra = INSTR (14, 10);
12912   int64_t  value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12913   int64_t  value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
12914   uint64_t uvalue1;
12915   uint64_t uvalue2;
12916   int64_t  signum = 1;
12917 
12918   if (ra != R31)
12919     HALT_UNALLOC;
12920 
  /* Convert to unsigned and use the unsigned mul64hi routine,
     then fix the sign up afterwards.  */
12923   if (value1 < 0)
12924     {
12925       signum *= -1L;
      uvalue1 = - (uint64_t) value1;
12927     }
12928   else
12929     {
12930       uvalue1 = value1;
12931     }
12932 
12933   if (value2 < 0)
12934     {
12935       signum *= -1L;
      uvalue2 = - (uint64_t) value2;
12937     }
12938   else
12939     {
12940       uvalue2 = value2;
12941     }
12942 
12943   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  uresult = mul64hi (uvalue1, uvalue2);
  if (signum < 0)
    /* Negate the 128 bit product: the high half is inverted, plus a
       carry in from the negation of the low half, which is non-zero
       exactly when the low half is zero.  */
    result = ~uresult + (uvalue1 * uvalue2 == 0);
  else
    result = uresult;
12947 
12948   aarch64_set_reg_s64 (cpu, rd, NO_SP, result);
12949 }
12950 
12951 /* Unsigned multiply add long -- source, source2 :
12952    32 bit, source3 : 64 bit.  */
12953 static void
12954 umaddl (sim_cpu *cpu)
12955 {
12956   unsigned rm = INSTR (20, 16);
12957   unsigned ra = INSTR (14, 10);
12958   unsigned rn = INSTR (9, 5);
12959   unsigned rd = INSTR (4, 0);
12960 
12961   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* N.B. we need to multiply the unsigned 32 bit values in rn, rm to
12963      obtain a 64 bit product.  */
12964   aarch64_set_reg_u64
12965     (cpu, rd, NO_SP,
12966      aarch64_get_reg_u64 (cpu, ra, NO_SP)
12967      + ((uint64_t) aarch64_get_reg_u32 (cpu, rn, NO_SP))
12968      * ((uint64_t) aarch64_get_reg_u32 (cpu, rm, NO_SP)));
12969 }
12970 
12971 /* Unsigned multiply sub long -- source, source2 : 32 bit, source3 : 64 bit.  */
12972 static void
12973 umsubl (sim_cpu *cpu)
12974 {
12975   unsigned rm = INSTR (20, 16);
12976   unsigned ra = INSTR (14, 10);
12977   unsigned rn = INSTR (9, 5);
12978   unsigned rd = INSTR (4, 0);
12979 
12980   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* N.B. we need to multiply the unsigned 32 bit values in rn, rm to
12982      obtain a 64 bit product.  */
12983   aarch64_set_reg_u64
12984     (cpu, rd, NO_SP,
12985      aarch64_get_reg_u64 (cpu, ra, NO_SP)
12986      - ((uint64_t) aarch64_get_reg_u32 (cpu, rn, NO_SP))
12987      * ((uint64_t) aarch64_get_reg_u32 (cpu, rm, NO_SP)));
12988 }
12989 
12990 /* Unsigned multiply high, source, source2 :
12991    64 bit, dest <-- high 64-bit of result.  */
12992 static void
12993 umulh (sim_cpu *cpu)
12994 {
12995   unsigned rm = INSTR (20, 16);
12996   unsigned rn = INSTR (9, 5);
12997   unsigned rd = INSTR (4, 0);
12998   GReg     ra = INSTR (14, 10);
12999 
13000   if (ra != R31)
13001     HALT_UNALLOC;
13002 
13003   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13004   aarch64_set_reg_u64 (cpu, rd, NO_SP,
13005 		       mul64hi (aarch64_get_reg_u64 (cpu, rn, NO_SP),
13006 				aarch64_get_reg_u64 (cpu, rm, NO_SP)));
13007 }
13008 
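      /* Worked contrast with smulh (added for exposition): for X-register
         bit patterns rn = 0xFFFFFFFFFFFFFFFF and rm = 2, umulh yields 1
         (unsigned product 2**65 - 2), while smulh yields
         0xFFFFFFFFFFFFFFFF (signed product -2).  */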
13009 static void
13010 dexDataProc3Source (sim_cpu *cpu)
13011 {
13012   /* assert instr[28,24] == 11011.  */
13013   /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit (for rd at least)
13014      instr[30,29] = op54 : 00 ==> ok, ow ==> UNALLOC
13015      instr[23,21] = op31 : 111 ==> UNALLOC, o2 ==> ok
13016      instr[15] = o0 : 0/1 ==> ok
13017      instr[23,21:15] ==> op : 0000 ==> MADD, 0001 ==> MSUB,     (32/64 bit)
13018                               0010 ==> SMADDL, 0011 ==> SMSUBL, (64 bit only)
13019                               0100 ==> SMULH,                   (64 bit only)
13020                               1010 ==> UMADDL, 1011 ==> UNSUBL, (64 bit only)
13021                               1010 ==> UMADDL, 1011 ==> UMSUBL, (64 bit only)
13022                               ow ==> UNALLOC.  */
13023 
13024   uint32_t dispatch;
13025   uint32_t size = INSTR (31, 31);
13026   uint32_t op54 = INSTR (30, 29);
13027   uint32_t op31 = INSTR (23, 21);
13028   uint32_t o0 = INSTR (15, 15);
13029 
13030   if (op54 != 0)
13031     HALT_UNALLOC;
13032 
13033   if (size == 0)
13034     {
13035       if (op31 != 0)
13036 	HALT_UNALLOC;
13037 
13038       if (o0 == 0)
13039 	madd32 (cpu);
13040       else
13041 	msub32 (cpu);
13042       return;
13043     }
13044 
13045   dispatch = (op31 << 1) | o0;
13046 
13047   switch (dispatch)
13048     {
13049     case 0:  madd64 (cpu); return;
13050     case 1:  msub64 (cpu); return;
13051     case 2:  smaddl (cpu); return;
13052     case 3:  smsubl (cpu); return;
13053     case 4:  smulh (cpu); return;
13054     case 10: umaddl (cpu); return;
13055     case 11: umsubl (cpu); return;
13056     case 12: umulh (cpu); return;
13057     default: HALT_UNALLOC;
13058     }
13059 }
13060 
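      /* Worked example (added for exposition): the dispatch value packs
         op31 (instr[23,21]) and o0 (instr[15]) into a four bit selector.
         UMADDL has op31 = 101 and o0 = 0, so dispatch = (5 << 1) | 0 = 10,
         which selects the umaddl case above.  */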
13061 static void
13062 dexDPReg (sim_cpu *cpu)
13063 {
13064   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
13065      assert  group == GROUP_DPREG_0101 || group == GROUP_DPREG_1101
13066      bits [28:24:21] of a DPReg are the secondary dispatch vector.  */
13067   uint32_t group2 = dispatchDPReg (aarch64_get_instr (cpu));
13068 
13069   switch (group2)
13070     {
13071     case DPREG_LOG_000:
13072     case DPREG_LOG_001:
13073       dexLogicalShiftedRegister (cpu); return;
13074 
13075     case DPREG_ADDSHF_010:
13076       dexAddSubtractShiftedRegister (cpu); return;
13077 
13078     case DPREG_ADDEXT_011:
13079       dexAddSubtractExtendedRegister (cpu); return;
13080 
13081     case DPREG_ADDCOND_100:
13082       {
13083 	/* This set bundles a variety of different operations.
13084 	   Check for each of the following in turn.  */
13085 	/* 1) add/sub with carry.  */
13086 	uint32_t mask1 = 0x1FE00000U;
13087 	uint32_t val1  = 0x1A000000U;
13088 	/* 2) cond compare register/immediate.  */
13089 	uint32_t mask2 = 0x1FE00000U;
13090 	uint32_t val2  = 0x1A400000U;
13091 	/* 3) cond select.  */
13092 	uint32_t mask3 = 0x1FE00000U;
13093 	uint32_t val3  = 0x1A800000U;
13094 	/* 4) data proc 1/2 source.  */
13095 	uint32_t mask4 = 0x1FE00000U;
13096 	uint32_t val4  = 0x1AC00000U;
13097 
13098 	if ((aarch64_get_instr (cpu) & mask1) == val1)
13099 	  dexAddSubtractWithCarry (cpu);
13100 
13101 	else if ((aarch64_get_instr (cpu) & mask2) == val2)
13102 	  CondCompare (cpu);
13103 
13104 	else if ((aarch64_get_instr (cpu) & mask3) == val3)
13105 	  dexCondSelect (cpu);
13106 
13107 	else if ((aarch64_get_instr (cpu) & mask4) == val4)
13108 	  {
13109 	    /* Bit 30 is clear for data proc 2 source
13110 	       and set for data proc 1 source.  */
13111 	    if (aarch64_get_instr (cpu)  & (1U << 30))
13112 	      dexDataProc1Source (cpu);
13113 	    else
13114 	      dexDataProc2Source (cpu);
13115 	  }
13116 
13117 	else
13118 	  /* Should not reach here.  */
13119 	  HALT_NYI;
13120 
13121 	return;
13122       }
13123 
13124     case DPREG_3SRC_110:
13125       dexDataProc3Source (cpu); return;
13126 
13127     case DPREG_UNALLOC_101:
13128       HALT_UNALLOC;
13129 
13130     case DPREG_3SRC_111:
13131       dexDataProc3Source (cpu); return;
13132 
13133     default:
13134       /* Should never reach here.  */
13135       HALT_NYI;
13136     }
13137 }
13138 
13139 /* Unconditional Branch immediate.
13140    Offset is a PC-relative byte offset in the range +/- 128MiB.
13141    The offset is assumed to have already been scaled by the decode
13142    routine from a word offset to a byte offset.  */
13143 
13144 /* Unconditional branch.  */
13145 static void
13146 buc (sim_cpu *cpu, int32_t offset)
13147 {
13148   aarch64_set_next_PC_by_offset (cpu, offset);
13149 }
13150 
13151 static unsigned stack_depth = 0;
13152 
13153 /* Unconditional branch and link -- writes return PC to LR.  */
13154 static void
13155 bl (sim_cpu *cpu, int32_t offset)
13156 {
13157   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13158   aarch64_save_LR (cpu);
13159   aarch64_set_next_PC_by_offset (cpu, offset);
13160 
13161   if (TRACE_BRANCH_P (cpu))
13162     {
13163       ++ stack_depth;
13164       TRACE_BRANCH (cpu,
13165 		    " %*scall %" PRIx64 " [%s]"
13166 		    " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
13167 		    stack_depth, " ", aarch64_get_next_PC (cpu),
13168 		    aarch64_get_func (aarch64_get_next_PC (cpu)),
13169 		    aarch64_get_reg_u64 (cpu, 0, NO_SP),
13170 		    aarch64_get_reg_u64 (cpu, 1, NO_SP),
13171 		    aarch64_get_reg_u64 (cpu, 2, NO_SP)
13172 		    );
13173     }
13174 }
13175 
13176 /* Unconditional Branch register.
13177    Branch/return address is in source register.  */
13178 
13179 /* Unconditional branch.  */
13180 static void
13181 br (sim_cpu *cpu)
13182 {
13183   unsigned rn = INSTR (9, 5);
13184   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13185   aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13186 }
13187 
13188 /* Unconditional branch and link -- writes return PC to LR.  */
13189 static void
13190 blr (sim_cpu *cpu)
13191 {
13192   unsigned rn = INSTR (9, 5);
13193 
13194   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13195   /* The pseudo code in the spec says we update LR before fetching
13196      the value from rn.  */
13197   aarch64_save_LR (cpu);
13198   aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13199 
13200   if (TRACE_BRANCH_P (cpu))
13201     {
13202       ++ stack_depth;
13203       TRACE_BRANCH (cpu,
13204 		    " %*scall %" PRIx64 " [%s]"
13205 		    " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
13206 		    stack_depth, " ", aarch64_get_next_PC (cpu),
13207 		    aarch64_get_func (aarch64_get_next_PC (cpu)),
13208 		    aarch64_get_reg_u64 (cpu, 0, NO_SP),
13209 		    aarch64_get_reg_u64 (cpu, 1, NO_SP),
13210 		    aarch64_get_reg_u64 (cpu, 2, NO_SP)
13211 		    );
13212     }
13213 }
13214 
13215 /* Return -- the assembler defaults the source register to LR.  This is
13216    functionally equivalent to br but, presumably, unlike br it affects
13217    the branch predictor.  */
13218 static void
13219 ret (sim_cpu *cpu)
13220 {
13221   unsigned rn = INSTR (9, 5);
13222   aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13223 
13224   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13225   if (TRACE_BRANCH_P (cpu))
13226     {
13227       TRACE_BRANCH (cpu,
13228 		    " %*sreturn [result: %" PRIx64 "]",
13229 		    stack_depth, " ", aarch64_get_reg_u64 (cpu, 0, NO_SP));
13230       -- stack_depth;
13231     }
13232 }
13233 
13234 /* NOP -- we implement this and call it from the decode in case we
13235    want to intercept it later.  */
13236 
13237 static void
13238 nop (sim_cpu *cpu)
13239 {
13240   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13241 }
13242 
13243 /* Data synchronization barrier.  */
13244 
13245 static void
13246 dsb (sim_cpu *cpu)
13247 {
13248   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13249 }
13250 
13251 /* Data memory barrier.  */
13252 
13253 static void
13254 dmb (sim_cpu *cpu)
13255 {
13256   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13257 }
13258 
13259 /* Instruction synchronization barrier.  */
13260 
13261 static void
13262 isb (sim_cpu *cpu)
13263 {
13264   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13265 }
13266 
13267 static void
13268 dexBranchImmediate (sim_cpu *cpu)
13269 {
13270   /* assert instr[30,26] == 00101
13271      instr[31] ==> 0 == B, 1 == BL
13272      instr[25,0] == imm26 branch offset counted in words.  */
13273 
13274   uint32_t top = INSTR (31, 31);
13275   /* We have a 26 bit signed word offset which we need to pass to the
13276      execute routine as a signed byte offset.  */
13277   int32_t offset = simm32 (aarch64_get_instr (cpu), 25, 0) << 2;
13278 
13279   if (top)
13280     bl (cpu, offset);
13281   else
13282     buc (cpu, offset);
13283 }
13284 
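      /* Worked example (added for exposition): imm26 = 0x2000000 sign
         extends to -0x2000000 words; shifting left by 2 gives the byte
         offset -0x8000000, i.e. -128MiB, the bottom of the branch
         range.  */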
13285 /* Control Flow.  */
13286 
13287 /* Conditional branch
13288 
13289    Offset is a PC-relative byte offset in the range +/- 1MiB.  pos is
13290    a bit position in the range 0 .. 63.
13291 
13292    cc is a CondCode enum value as pulled out of the decode.
13293 
13294    N.B. any offset register (source) can only be Xn or Wn.  */
13295 
13296 static void
13297 bcc (sim_cpu *cpu, int32_t offset, CondCode cc)
13298 {
13299   /* The test returns TRUE if CC is met.  */
13300   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13301   if (testConditionCode (cpu, cc))
13302     aarch64_set_next_PC_by_offset (cpu, offset);
13303 }
13304 
13305 /* 32 bit branch on register non-zero.  */
13306 static void
13307 cbnz32 (sim_cpu *cpu, int32_t offset)
13308 {
13309   unsigned rt = INSTR (4, 0);
13310 
13311   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13312   if (aarch64_get_reg_u32 (cpu, rt, NO_SP) != 0)
13313     aarch64_set_next_PC_by_offset (cpu, offset);
13314 }
13315 
13316 /* 64 bit branch on register non-zero.  */
13317 static void
13318 cbnz (sim_cpu *cpu, int32_t offset)
13319 {
13320   unsigned rt = INSTR (4, 0);
13321 
13322   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13323   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) != 0)
13324     aarch64_set_next_PC_by_offset (cpu, offset);
13325 }
13326 
13327 /* 32 bit branch on register zero.  */
13328 static void
13329 cbz32 (sim_cpu *cpu, int32_t offset)
13330 {
13331   unsigned rt = INSTR (4, 0);
13332 
13333   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13334   if (aarch64_get_reg_u32 (cpu, rt, NO_SP) == 0)
13335     aarch64_set_next_PC_by_offset (cpu, offset);
13336 }
13337 
13338 /* 64 bit branch on register zero.  */
13339 static void
13340 cbz (sim_cpu *cpu, int32_t offset)
13341 {
13342   unsigned rt = INSTR (4, 0);
13343 
13344   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13345   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) == 0)
13346     aarch64_set_next_PC_by_offset (cpu, offset);
13347 }
13348 
13349 /* Branch on register bit test non-zero -- one size fits all.  */
13350 static void
13351 tbnz (sim_cpu *cpu, uint32_t  pos, int32_t offset)
13352 {
13353   unsigned rt = INSTR (4, 0);
13354 
13355   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13356   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos))
13357     aarch64_set_next_PC_by_offset (cpu, offset);
13358 }
13359 
13360 /* Branch on register bit test zero -- one size fits all.  */
13361 static void
13362 tbz (sim_cpu *cpu, uint32_t  pos, int32_t offset)
13363 {
13364   unsigned rt = INSTR (4, 0);
13365 
13366   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13367   if (!(aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos)))
13368     aarch64_set_next_PC_by_offset (cpu, offset);
13369 }
13370 
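      /* Note on the casts above (added): pos ranges up to 63, so the test
         bit must be formed in 64-bit arithmetic; with a plain int
         constant, 1 << pos is undefined for pos >= 31.  E.g. testing bit
         40 requires the mask 0x0000010000000000ULL.  */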
13371 static void
13372 dexCompareBranchImmediate (sim_cpu *cpu)
13373 {
13374   /* instr[30,25] = 01 1010
13375      instr[31]    = size : 0 ==> 32, 1 ==> 64
13376      instr[24]    = op : 0 ==> CBZ, 1 ==> CBNZ
13377      instr[23,5]  = simm19 branch offset counted in words
13378      instr[4,0]   = rt  */
13379 
13380   uint32_t size = INSTR (31, 31);
13381   uint32_t op   = INSTR (24, 24);
13382   int32_t offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
13383 
13384   if (size == 0)
13385     {
13386       if (op == 0)
13387 	cbz32 (cpu, offset);
13388       else
13389 	cbnz32 (cpu, offset);
13390     }
13391   else
13392     {
13393       if (op == 0)
13394 	cbz (cpu, offset);
13395       else
13396 	cbnz (cpu, offset);
13397     }
13398 }
13399 
13400 static void
13401 dexTestBranchImmediate (sim_cpu *cpu)
13402 {
13403   /* instr[31]    = b5 : bit 5 of test bit idx
13404      instr[30,25] = 01 1011
13405      instr[24]    = op : 0 ==> TBZ, 1 == TBNZ
13406      instr[23,19] = b40 : bits 4 to 0 of test bit idx
13407      instr[18,5]  = simm14 : signed offset counted in words
13408      instr[4,0]   = uimm5  */
13409 
13410   uint32_t pos = ((INSTR (31, 31) << 5) | INSTR (23, 19));
13411   int32_t offset = simm32 (aarch64_get_instr (cpu), 18, 5) << 2;
13412 
13413   NYI_assert (30, 25, 0x1b);
13414 
13415   if (INSTR (24, 24) == 0)
13416     tbz (cpu, pos, offset);
13417   else
13418     tbnz (cpu, pos, offset);
13419 }
13420 
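      /* Worked example (added for exposition): "tbnz x3, #40, label" has
         b5 = 1 (instr[31]) and b40 = 01000 (instr[23,19]), giving
         pos = (1 << 5) | 8 = 40; simm14 is scaled by 4 into a byte offset
         just as for the other branch forms.  */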
13421 static void
13422 dexCondBranchImmediate (sim_cpu *cpu)
13423 {
13424   /* instr[31,25] = 010 1010
13425      instr[24]    = op1; op => 00 ==> B.cond
13426      instr[23,5]  = simm19 : signed offset counted in words
13427      instr[4]     = op0
13428      instr[3,0]   = cond  */
13429 
13430   int32_t offset;
13431   uint32_t op = ((INSTR (24, 24) << 1) | INSTR (4, 4));
13432 
13433   NYI_assert (31, 25, 0x2a);
13434 
13435   if (op != 0)
13436     HALT_UNALLOC;
13437 
13438   offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
13439 
13440   bcc (cpu, offset, INSTR (3, 0));
13441 }
13442 
13443 static void
13444 dexBranchRegister (sim_cpu *cpu)
13445 {
13446   /* instr[31,25] = 110 1011
13447      instr[24,21] = op : 0 ==> BR, 1 => BLR, 2 => RET, 4 => ERET, 5 => DRPS
13448      instr[20,16] = op2 : must be 11111
13449      instr[15,10] = op3 : must be 000000
13450      instr[4,0]   = op4 : must be 00000.  */
13451 
13452   uint32_t op = INSTR (24, 21);
13453   uint32_t op2 = INSTR (20, 16);
13454   uint32_t op3 = INSTR (15, 10);
13455   uint32_t op4 = INSTR (4, 0);
13456 
13457   NYI_assert (31, 25, 0x6b);
13458 
13459   if (op2 != 0x1F || op3 != 0 || op4 != 0)
13460     HALT_UNALLOC;
13461 
13462   if (op == 0)
13463     br (cpu);
13464 
13465   else if (op == 1)
13466     blr (cpu);
13467 
13468   else if (op == 2)
13469     ret (cpu);
13470 
13471   else
13472     {
13473       /* ERET and DRPS accept 0b11111 for rn = instr [9,5];
13474 	 anything else is unallocated.  */
13475       uint32_t rn = INSTR (9, 5);
13476 
13477       if (rn != 0x1f)
13478 	HALT_UNALLOC;
13479 
13480       if (op == 4 || op == 5)
13481 	HALT_NYI;
13482 
13483       HALT_UNALLOC;
13484     }
13485 }
13486 
13487 /* FIXME: We should get the Angel SWI values from ../../libgloss/aarch64/svc.h
13488    but this may not be available.  So instead we define the values we need
13489    here.  */
13490 #define AngelSVC_Reason_Open		0x01
13491 #define AngelSVC_Reason_Close		0x02
13492 #define AngelSVC_Reason_Write		0x05
13493 #define AngelSVC_Reason_Read		0x06
13494 #define AngelSVC_Reason_IsTTY		0x09
13495 #define AngelSVC_Reason_Seek		0x0A
13496 #define AngelSVC_Reason_FLen		0x0C
13497 #define AngelSVC_Reason_Remove		0x0E
13498 #define AngelSVC_Reason_Rename		0x0F
13499 #define AngelSVC_Reason_Clock		0x10
13500 #define AngelSVC_Reason_Time		0x11
13501 #define AngelSVC_Reason_System		0x12
13502 #define AngelSVC_Reason_Errno		0x13
13503 #define AngelSVC_Reason_GetCmdLine	0x15
13504 #define AngelSVC_Reason_HeapInfo	0x16
13505 #define AngelSVC_Reason_ReportException 0x18
13506 #define AngelSVC_Reason_Elapsed         0x30
13507 
13508 
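      /* Guest-eye view of the parameter block consumed by the Write case
         below (added sketch; the field layout is inferred from the ptr,
         ptr + 8 and ptr + 16 reads in handle_halt):  */
      #if 0
      struct angel_write_block
      {
        uint64_t fd;    /* File handle: 1 = stdout, 2 = stderr.  */
        uint64_t buf;   /* Guest address of the data to write.  */
        uint64_t len;   /* Number of bytes to write.  */
      };
      #endif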
13509 static void
13510 handle_halt (sim_cpu *cpu, uint32_t val)
13511 {
13512   uint64_t result = 0;
13513 
13514   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13515   if (val != 0xf000)
13516     {
13517       TRACE_SYSCALL (cpu, " HLT [0x%x]", val);
13518       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13519 		       sim_stopped, SIM_SIGTRAP);
13520     }
13521 
13522   /* We have encountered an Angel SVC call.  See if we can process it.  */
13523   switch (aarch64_get_reg_u32 (cpu, 0, NO_SP))
13524     {
13525     case AngelSVC_Reason_HeapInfo:
13526       {
13527 	/* Get the values.  */
13528 	uint64_t stack_top = aarch64_get_stack_start (cpu);
13529 	uint64_t heap_base = aarch64_get_heap_start (cpu);
13530 
13531 	/* Get the pointer  */
13532 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13533 	ptr = aarch64_get_mem_u64 (cpu, ptr);
13534 
13535 	/* Fill in the memory block.  */
13536 	/* Start addr of heap.  */
13537 	aarch64_set_mem_u64 (cpu, ptr +  0, heap_base);
13538 	/* End addr of heap.  */
13539 	aarch64_set_mem_u64 (cpu, ptr +  8, stack_top);
13540 	/* Lowest stack addr.  */
13541 	aarch64_set_mem_u64 (cpu, ptr + 16, heap_base);
13542 	/* Initial stack addr.  */
13543 	aarch64_set_mem_u64 (cpu, ptr + 24, stack_top);
13544 
13545 	TRACE_SYSCALL (cpu, " AngelSVC: Get Heap Info");
13546       }
13547       break;
13548 
13549     case AngelSVC_Reason_Open:
13550       {
13551 	/* Get the pointer  */
13552 	/* uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);  */
13553 	/* FIXME: For now we just assume that we will only be asked
13554 	   to open the standard file descriptors.  */
13555 	static int fd = 0;
13556 	result = fd ++;
13557 
13558 	TRACE_SYSCALL (cpu, " AngelSVC: Open file %d", fd - 1);
13559       }
13560       break;
13561 
13562     case AngelSVC_Reason_Close:
13563       {
13564 	uint64_t fh = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13565 	TRACE_SYSCALL (cpu, " AngelSVC: Close file %d", (int) fh);
13566 	result = 0;
13567       }
13568       break;
13569 
13570     case AngelSVC_Reason_Errno:
13571       result = 0;
13572       TRACE_SYSCALL (cpu, " AngelSVC: Get Errno");
13573       break;
13574 
13575     case AngelSVC_Reason_Clock:
13576       result =
13577 #ifdef CLOCKS_PER_SEC
13578 	(CLOCKS_PER_SEC >= 100)
13579 	? (clock () / (CLOCKS_PER_SEC / 100))
13580 	: ((clock () * 100) / CLOCKS_PER_SEC)
13581 #else
13582 	/* Presume unix... clock() returns microseconds.  */
13583 	(clock () / 10000)
13584 #endif
13585 	;
13586       TRACE_SYSCALL (cpu, " AngelSVC: Get Clock");
13587       break;
13588 
13589     case AngelSVC_Reason_GetCmdLine:
13590       {
13591 	/* Get the pointer  */
13592 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13593 	ptr = aarch64_get_mem_u64 (cpu, ptr);
13594 
13595 	/* FIXME: No command line for now.  */
13596 	aarch64_set_mem_u64 (cpu, ptr, 0);
13597 	TRACE_SYSCALL (cpu, " AngelSVC: Get Command Line");
13598       }
13599       break;
13600 
13601     case AngelSVC_Reason_IsTTY:
13602       result = 1;
13603       TRACE_SYSCALL (cpu, " AngelSVC: IsTTY ?");
13604       break;
13605 
13606     case AngelSVC_Reason_Write:
13607       {
13608 	/* Get the pointer  */
13609 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13610 	/* Get the write control block.  */
13611 	uint64_t fd  = aarch64_get_mem_u64 (cpu, ptr);
13612 	uint64_t buf = aarch64_get_mem_u64 (cpu, ptr + 8);
13613 	uint64_t len = aarch64_get_mem_u64 (cpu, ptr + 16);
13614 
13615 	TRACE_SYSCALL (cpu, "write of %" PRIx64 " bytes from %"
13616 		       PRIx64 " on descriptor %" PRIx64,
13617 		       len, buf, fd);
13618 
13619 	if (len > 1280)
13620 	  {
13621 	    TRACE_SYSCALL (cpu,
13622 			   " AngelSVC: Write: Suspiciously long write: %ld",
13623 			   (long) len);
13624 	    sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13625 			     sim_stopped, SIM_SIGBUS);
13626 	  }
13627 	else if (fd == 1)
13628 	  {
13629 	    printf ("%.*s", (int) len, aarch64_get_mem_ptr (cpu, buf));
13630 	  }
13631 	else if (fd == 2)
13632 	  {
13633 	    TRACE (cpu, 0, "\n");
13634 	    sim_io_eprintf (CPU_STATE (cpu), "%.*s",
13635 			    (int) len, aarch64_get_mem_ptr (cpu, buf));
13636 	    TRACE (cpu, 0, "\n");
13637 	  }
13638 	else
13639 	  {
13640 	    TRACE_SYSCALL (cpu,
13641 			   " AngelSVC: Write: Unexpected file handle: %d",
13642 			   (int) fd);
13643 	    sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13644 			     sim_stopped, SIM_SIGABRT);
13645 	  }
13646       }
13647       break;
13648 
13649     case AngelSVC_Reason_ReportException:
13650       {
13651 	/* Get the pointer  */
13652 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13653 	/* ptr = aarch64_get_mem_u64 (cpu, ptr);  */
13654 	uint64_t type = aarch64_get_mem_u64 (cpu, ptr);
13655 	uint64_t state = aarch64_get_mem_u64 (cpu, ptr + 8);
13656 
13657 	TRACE_SYSCALL (cpu,
13658 		       "Angel Exception: type 0x%" PRIx64 " state %" PRIx64,
13659 		       type, state);
13660 
13661 	if (type == 0x20026)
13662 	  sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13663 			   sim_exited, state);
13664 	else
13665 	  sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13666 			   sim_stopped, SIM_SIGINT);
13667       }
13668       break;
13669 
13670     case AngelSVC_Reason_Read:
13671     case AngelSVC_Reason_FLen:
13672     case AngelSVC_Reason_Seek:
13673     case AngelSVC_Reason_Remove:
13674     case AngelSVC_Reason_Time:
13675     case AngelSVC_Reason_System:
13676     case AngelSVC_Reason_Rename:
13677     case AngelSVC_Reason_Elapsed:
13678     default:
13679       TRACE_SYSCALL (cpu, " HLT [Unknown angel %x]",
13680 		     aarch64_get_reg_u32 (cpu, 0, NO_SP));
13681       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13682 		       sim_stopped, SIM_SIGTRAP);
13683     }
13684 
13685   aarch64_set_reg_u64 (cpu, 0, NO_SP, result);
13686 }
13687 
13688 static void
13689 dexExcpnGen (sim_cpu *cpu)
13690 {
13691   /* instr[31:24] = 11010100
13692      instr[23,21] = opc : 000 ==> GEN EXCPN, 001 ==> BRK
13693                           010 ==> HLT,       101 ==> DBG GEN EXCPN
13694      instr[20,5]  = imm16
13695      instr[4,2]   = opc2 000 ==> OK, ow ==> UNALLOC
13696      instr[1,0]   = LL : discriminates opc  */
13697 
13698   uint32_t opc = INSTR (23, 21);
13699   uint32_t imm16 = INSTR (20, 5);
13700   uint32_t opc2 = INSTR (4, 2);
13701   uint32_t LL;
13702 
13703   NYI_assert (31, 24, 0xd4);
13704 
13705   if (opc2 != 0)
13706     HALT_UNALLOC;
13707 
13708   LL = INSTR (1, 0);
13709 
13710   /* We only implement HLT and BRK for now.  */
13711   if (opc == 1 && LL == 0)
13712     {
13713       TRACE_EVENTS (cpu, " BRK [0x%x]", imm16);
13714       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13715 		       sim_exited, aarch64_get_reg_s32 (cpu, R0, SP_OK));
13716     }
13717 
13718   if (opc == 2 && LL == 0)
13719     handle_halt (cpu, imm16);
13720 
13721   else if (opc == 0 || opc == 5)
13722     HALT_NYI;
13723 
13724   else
13725     HALT_UNALLOC;
13726 }
13727 
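      /* For exposition: "hlt #0xf000" encodes opc = 010 and
         imm16 = 0xf000, which handle_halt treats as an Angel semihosting
         trap; any other HLT immediate stops the simulator with SIGTRAP.
         "brk #imm16" (opc = 001) exits the simulation with R0 as the
         status code.  */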
13728 /* Stub for accessing system registers.  */
13729 
13730 static uint64_t
13731 system_get (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
13732 	    unsigned crm, unsigned op2)
13733 {
13734   if (crn == 0 && op1 == 3 && crm == 0 && op2 == 7)
13735     /* DCZID_EL0 - the Data Cache Zero ID register.
13736        We do not support DC ZVA at the moment, so
13737        we return a value with the disable bit set.
13738        We implement support for the DCZID register since
13739        it is used by the C library's memset function.  */
13740     return ((uint64_t) 1) << 4;
13741 
13742   if (crn == 0 && op1 == 3 && crm == 0 && op2 == 1)
13743     /* Cache Type Register.  */
13744     return 0x80008000UL;
13745 
13746   if (crn == 13 && op1 == 3 && crm == 0 && op2 == 2)
13747     /* TPIDR_EL0 - thread pointer id.  */
13748     return aarch64_get_thread_id (cpu);
13749 
13750   if (op1 == 3 && crm == 4 && op2 == 0)
13751     return aarch64_get_FPCR (cpu);
13752 
13753   if (op1 == 3 && crm == 4 && op2 == 1)
13754     return aarch64_get_FPSR (cpu);
13755 
13756   else if (op1 == 3 && crm == 2 && op2 == 0)
13757     return aarch64_get_CPSR (cpu);
13758 
13759   HALT_NYI;
13760 }
13761 
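      /* For exposition: the DCZID_EL0 value returned above is
         1 << 4 = 0x10, i.e. the DZP (prohibit) bit set with a zero
         block-size field, which tells library routines such as memset
         not to attempt DC ZVA.  */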
13762 static void
13763 system_set (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
13764 	    unsigned crm, unsigned op2, uint64_t val)
13765 {
13766   if (op1 == 3 && crm == 4 && op2 == 0)
13767     aarch64_set_FPCR (cpu, val);
13768 
13769   else if (op1 == 3 && crm == 4 && op2 == 1)
13770     aarch64_set_FPSR (cpu, val);
13771 
13772   else if (op1 == 3 && crm == 2 && op2 == 0)
13773     aarch64_set_CPSR (cpu, val);
13774 
13775   else
13776     HALT_NYI;
13777 }
13778 
13779 static void
13780 do_mrs (sim_cpu *cpu)
13781 {
13782   /* instr[31:20] = 1101 0101 0001 1
13783      instr[19]    = op0
13784      instr[18,16] = op1
13785      instr[15,12] = CRn
13786      instr[11,8]  = CRm
13787      instr[7,5]   = op2
13788      instr[4,0]   = Rt  */
13789   unsigned sys_op0 = INSTR (19, 19) + 2;
13790   unsigned sys_op1 = INSTR (18, 16);
13791   unsigned sys_crn = INSTR (15, 12);
13792   unsigned sys_crm = INSTR (11, 8);
13793   unsigned sys_op2 = INSTR (7, 5);
13794   unsigned rt = INSTR (4, 0);
13795 
13796   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13797   aarch64_set_reg_u64 (cpu, rt, NO_SP,
13798 		       system_get (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2));
13799 }
13800 
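      /* Worked example (added for exposition): "mrs x0, tpidr_el0"
         encodes op0 = 3 (instr[19] = 1), op1 = 3, CRn = 13, CRm = 0,
         op2 = 2 and Rt = 0, so system_get returns the simulated thread
         pointer, which lands in X0.  */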
13801 static void
13802 do_MSR_immediate (sim_cpu *cpu)
13803 {
13804   /* instr[31:19] = 1101 0101 0000 0
13805      instr[18,16] = op1
13806      instr[15,12] = 0100
13807      instr[11,8]  = CRm
13808      instr[7,5]   = op2
13809      instr[4,0]   = 1 1111  */
13810 
13811   unsigned op1 = INSTR (18, 16);
13812   /*unsigned crm = INSTR (11, 8);*/
13813   unsigned op2 = INSTR (7, 5);
13814 
13815   NYI_assert (31, 19, 0x1AA0);
13816   NYI_assert (15, 12, 0x4);
13817   NYI_assert (4,  0,  0x1F);
13818 
13819   if (op1 == 0)
13820     {
13821       if (op2 == 5)
13822 	HALT_NYI; /* set SPSel.  */
13823       else
13824 	HALT_UNALLOC;
13825     }
13826   else if (op1 == 3)
13827     {
13828       if (op2 == 6)
13829 	HALT_NYI; /* set DAIFset.  */
13830       else if (op2 == 7)
13831 	HALT_NYI; /* set DAIFclr.  */
13832       else
13833 	HALT_UNALLOC;
13834     }
13835   else
13836     HALT_UNALLOC;
13837 }
13838 
13839 static void
13840 do_MSR_reg (sim_cpu *cpu)
13841 {
13842   /* instr[31:20] = 1101 0101 0001
13843      instr[19]    = op0
13844      instr[18,16] = op1
13845      instr[15,12] = CRn
13846      instr[11,8]  = CRm
13847      instr[7,5]   = op2
13848      instr[4,0]   = Rt  */
13849 
13850   unsigned sys_op0 = INSTR (19, 19) + 2;
13851   unsigned sys_op1 = INSTR (18, 16);
13852   unsigned sys_crn = INSTR (15, 12);
13853   unsigned sys_crm = INSTR (11, 8);
13854   unsigned sys_op2 = INSTR (7, 5);
13855   unsigned rt = INSTR (4, 0);
13856 
13857   NYI_assert (31, 20, 0xD51);
13858 
13859   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13860   system_set (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2,
13861 	      aarch64_get_reg_u64 (cpu, rt, NO_SP));
13862 }
13863 
13864 static void
13865 do_SYS (sim_cpu *cpu)
13866 {
13867   /* instr[31,19] = 1101 0101 0000 1
13868      instr[18,16] = op1
13869      instr[15,12] = CRn
13870      instr[11,8]  = CRm
13871      instr[7,5]   = op2
13872      instr[4,0]   = Rt  */
13873   NYI_assert (31, 19, 0x1AA1);
13874 
13875   /* FIXME: For now we just silently accept system ops.  */
13876 }
13877 
13878 static void
13879 dexSystem (sim_cpu *cpu)
13880 {
13881   /* instr[31:22] = 1101 01010 0
13882      instr[21]    = L
13883      instr[20,19] = op0
13884      instr[18,16] = op1
13885      instr[15,12] = CRn
13886      instr[11,8]  = CRm
13887      instr[7,5]   = op2
13888      instr[4,0]   = uimm5  */
13889 
13890   /* We are interested in HINT, DSB, DMB and ISB
13891 
13892      Hint #0 encodes NOOP (this is the only hint we care about)
13893      L == 0, op0 == 0, op1 = 011, CRn = 0010, Rt = 11111,
13894      CRm != 0000 OR (CRm == 0000 AND (op2 == 000 OR op2 > 101))
13895 
13896      DSB, DMB, ISB are data synchronization barrier, data memory
13897      barrier and instruction synchronization barrier, respectively, where
13898 
13899      L == 0, op0 == 0, op1 = 011, CRn = 0011, Rt = 11111,
13900      op2 : DSB ==> 100, DMB ==> 101, ISB ==> 110
13901      CRm<3:2> ==> domain, CRm<1:0> ==> types,
13902      domain : 00 ==> OuterShareable, 01 ==> Nonshareable,
13903               10 ==> InerShareable, 11 ==> FullSystem
13904      types :  01 ==> Reads, 10 ==> Writes,
13905               11 ==> All, 00 ==> All (domain == FullSystem).  */
13906 
13907   unsigned rt = INSTR (4, 0);
13908 
13909   NYI_assert (31, 22, 0x354);
13910 
13911   switch (INSTR (21, 12))
13912     {
13913     case 0x032:
13914       if (rt == 0x1F)
13915 	{
13916 	  /* NOP has CRm != 0000 OR
13917 	     (CRm == 0000 AND (op2 == 000 OR op2 > 101)).  */
13918 	  uint32_t crm = INSTR (11, 8);
13919 	  uint32_t op2 = INSTR (7, 5);
13920 
13921 	  if (crm != 0 || (op2 == 0 || op2 > 5))
13922 	    {
13923 	      /* Actually call nop method so we can reimplement it later.  */
13924 	      nop (cpu);
13925 	      return;
13926 	    }
13927 	}
13928       HALT_NYI;
13929 
13930     case 0x033:
13931       {
13932 	uint32_t op2 =  INSTR (7, 5);
13933 
13934 	switch (op2)
13935 	  {
13936 	  case 2: HALT_NYI;
13937 	  case 4: dsb (cpu); return;
13938 	  case 5: dmb (cpu); return;
13939 	  case 6: isb (cpu); return;
13940 	  default: HALT_UNALLOC;
13941 	  }
13942       }
13943 
13944     case 0x3B0:
13945     case 0x3B4:
13946     case 0x3BD:
13947       do_mrs (cpu);
13948       return;
13949 
13950     case 0x0B7:
13951       do_SYS (cpu); /* DC is an alias of SYS.  */
13952       return;
13953 
13954     default:
13955       if (INSTR (21, 20) == 0x1)
13956 	do_MSR_reg (cpu);
13957       else if (INSTR (21, 19) == 0 && INSTR (15, 12) == 0x4)
13958 	do_MSR_immediate (cpu);
13959       else
13960 	HALT_NYI;
13961       return;
13962     }
13963 }
13964 
13965 static void
13966 dexBr (sim_cpu *cpu)
13967 {
13968   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
13969      assert  group == GROUP_BREXSYS_1010 || group == GROUP_BREXSYS_1011
13970      bits [31,29] of a BrExSys are the secondary dispatch vector.  */
13971   uint32_t group2 = dispatchBrExSys (aarch64_get_instr (cpu));
13972 
13973   switch (group2)
13974     {
13975     case BR_IMM_000:
13976       return dexBranchImmediate (cpu);
13977 
13978     case BR_IMMCMP_001:
13979       /* Compare has bit 25 clear while test has it set.  */
13980       if (!INSTR (25, 25))
13981 	dexCompareBranchImmediate (cpu);
13982       else
13983 	dexTestBranchImmediate (cpu);
13984       return;
13985 
13986     case BR_IMMCOND_010:
13987       /* This is a conditional branch if bit 25 is clear otherwise
13988          unallocated.  */
13989       if (!INSTR (25, 25))
13990 	dexCondBranchImmediate (cpu);
13991       else
13992 	HALT_UNALLOC;
13993       return;
13994 
13995     case BR_UNALLOC_011:
13996       HALT_UNALLOC;
13997 
13998     case BR_IMM_100:
13999       dexBranchImmediate (cpu);
14000       return;
14001 
14002     case BR_IMMCMP_101:
14003       /* Compare has bit 25 clear while test has it set.  */
14004       if (!INSTR (25, 25))
14005 	dexCompareBranchImmediate (cpu);
14006       else
14007 	dexTestBranchImmediate (cpu);
14008       return;
14009 
14010     case BR_REG_110:
14011       /* Unconditional branch reg has bit 25 set.  */
14012       if (INSTR (25, 25))
14013 	dexBranchRegister (cpu);
14014 
14015       /* This includes Excpn Gen, System and unalloc operations.
14016          We need to decode the Excpn Gen operation BRK so we can plant
14017          debugger entry points.
14018          Excpn Gen operations have instr [24] = 0.
14019          we need to decode at least one of the System operations NOP
14020          which is an alias for HINT #0.
14021          System operations have instr [24,22] = 100.  */
14022       else if (INSTR (24, 24) == 0)
14023 	dexExcpnGen (cpu);
14024 
14025       else if (INSTR (24, 22) == 4)
14026 	dexSystem (cpu);
14027 
14028       else
14029 	HALT_UNALLOC;
14030 
14031       return;
14032 
14033     case BR_UNALLOC_111:
14034       HALT_UNALLOC;
14035 
14036     default:
14037       /* Should never reach here.  */
14038       HALT_NYI;
14039     }
14040 }
14041 
14042 static void
14043 aarch64_decode_and_execute (sim_cpu *cpu, uint64_t pc)
14044 {
14045   /* We need to check whether gdb wants to break in here.  */
14046   /* checkBreak (cpu);.  */
14047 
14048   uint64_t group = dispatchGroup (aarch64_get_instr (cpu));
14049 
14050   switch (group)
14051     {
14052     case GROUP_PSEUDO_0000:   dexPseudo (cpu); break;
14053     case GROUP_LDST_0100:     dexLdSt (cpu); break;
14054     case GROUP_DPREG_0101:    dexDPReg (cpu); break;
14055     case GROUP_LDST_0110:     dexLdSt (cpu); break;
14056     case GROUP_ADVSIMD_0111:  dexAdvSIMD0 (cpu); break;
14057     case GROUP_DPIMM_1000:    dexDPImm (cpu); break;
14058     case GROUP_DPIMM_1001:    dexDPImm (cpu); break;
14059     case GROUP_BREXSYS_1010:  dexBr (cpu); break;
14060     case GROUP_BREXSYS_1011:  dexBr (cpu); break;
14061     case GROUP_LDST_1100:     dexLdSt (cpu); break;
14062     case GROUP_DPREG_1101:    dexDPReg (cpu); break;
14063     case GROUP_LDST_1110:     dexLdSt (cpu); break;
14064     case GROUP_ADVSIMD_1111:  dexAdvSIMD1 (cpu); break;
14065 
14066     case GROUP_UNALLOC_0001:
14067     case GROUP_UNALLOC_0010:
14068     case GROUP_UNALLOC_0011:
14069       HALT_UNALLOC;
14070 
14071     default:
14072       /* Should never reach here.  */
14073       HALT_NYI;
14074     }
14075 }
14076 
14077 static bfd_boolean
14078 aarch64_step (sim_cpu *cpu)
14079 {
14080   uint64_t pc = aarch64_get_PC (cpu);
14081 
14082   if (pc == TOP_LEVEL_RETURN_PC)
14083     return FALSE;
14084 
14085   aarch64_set_next_PC (cpu, pc + 4);
14086 
14087   /* Code is always little-endian.  */
14088   sim_core_read_buffer (CPU_STATE (cpu), cpu, read_map,
14089 			& aarch64_get_instr (cpu), pc, 4);
14090   aarch64_get_instr (cpu) = endian_le2h_4 (aarch64_get_instr (cpu));
14091 
14092   TRACE_INSN (cpu, " pc = %" PRIx64 " instr = %08x", pc,
14093 	      aarch64_get_instr (cpu));
14094   TRACE_DISASM (cpu, pc);
14095 
14096   aarch64_decode_and_execute (cpu, pc);
14097 
14098   return TRUE;
14099 }
14100 
14101 void
14102 aarch64_run (SIM_DESC sd)
14103 {
14104   sim_cpu *cpu = STATE_CPU (sd, 0);
14105 
14106   while (aarch64_step (cpu))
14107     aarch64_update_PC (cpu);
14108 
14109   sim_engine_halt (sd, NULL, NULL, aarch64_get_PC (cpu),
14110 		   sim_exited, aarch64_get_reg_s32 (cpu, R0, SP_OK));
14111 }
14112 
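      /* Typical driver sequence (added sketch, simplified): aarch64_init
         (below) installs SP and FP, points LR at the TOP_LEVEL_RETURN_PC
         sentinel and sets the entry PC; aarch64_run then steps until a
         return to that sentinel (aarch64_step returns FALSE) or until an
         exit/halt event stops the engine.  */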
14113 void
14114 aarch64_init (sim_cpu *cpu, uint64_t pc)
14115 {
14116   uint64_t sp = aarch64_get_stack_start (cpu);
14117 
14118   /* Install SP, FP and PC and set LR to -20
14119      so we can detect a top-level return.  */
14120   aarch64_set_reg_u64 (cpu, SP, SP_OK, sp);
14121   aarch64_set_reg_u64 (cpu, FP, SP_OK, sp);
14122   aarch64_set_reg_u64 (cpu, LR, SP_OK, TOP_LEVEL_RETURN_PC);
14123   aarch64_set_next_PC (cpu, pc);
14124   aarch64_update_PC (cpu);
14125   aarch64_init_LIT_table ();
14126 }
14127