xref: /netbsd-src/external/gpl3/gcc/dist/libphobos/libdruntime/core/simd.d (revision 0a3071956a3a9fdebdbf7f338cf2d439b45fc728)
// Written in the D programming language.

/**
 * Builtin SIMD intrinsics
 *
 * Source: $(DRUNTIMESRC core/_simd.d)
 *
 * Copyright: Copyright Digital Mars 2012-2020
 * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   $(HTTP digitalmars.com, Walter Bright),
 * Source:    $(DRUNTIMESRC core/_simd.d)
 */
13 
module core.simd;

// Every declaration in this module is pure, nothrow, @safe and @nogc
// unless explicitly annotated otherwise below.
pure:
nothrow:
@safe:
@nogc:
20 
/*******************************
 * Create a vector type.
 *
 * Parameters:
 *      T = one of double[2], float[4], void[16], byte[16], ubyte[16],
 *      short[8], ushort[8], int[4], uint[4], long[2], ulong[2].
 *      For 256 bit vectors,
 *      one of double[4], float[8], void[32], byte[32], ubyte[32],
 *      short[16], ushort[16], int[8], uint[8], long[4], ulong[4]
 */

template Vector(T)
{
    /* __vector is compiler magic, hide it behind a template.
     * The compiler will reject T's that don't work.
     */
    alias __vector(T) Vector;
}
39 
/* Handy aliases
 * Each alias only exists if the target supports vectors of that size,
 * hence the static if guards.
 */
static if (is(Vector!(void[8])))    alias Vector!(void[8])    void8;        ///
static if (is(Vector!(double[1])))  alias Vector!(double[1])  double1;      ///
static if (is(Vector!(float[2])))   alias Vector!(float[2])   float2;       ///
static if (is(Vector!(byte[8])))    alias Vector!(byte[8])    byte8;        ///
static if (is(Vector!(ubyte[8])))   alias Vector!(ubyte[8])   ubyte8;       ///
static if (is(Vector!(short[4])))   alias Vector!(short[4])   short4;       ///
static if (is(Vector!(ushort[4])))  alias Vector!(ushort[4])  ushort4;      ///
static if (is(Vector!(int[2])))     alias Vector!(int[2])     int2;         ///
static if (is(Vector!(uint[2])))    alias Vector!(uint[2])    uint2;        ///
static if (is(Vector!(long[1])))    alias Vector!(long[1])    long1;        ///
static if (is(Vector!(ulong[1])))   alias Vector!(ulong[1])   ulong1;      ///

static if (is(Vector!(void[16])))   alias Vector!(void[16])   void16;       ///
static if (is(Vector!(double[2])))  alias Vector!(double[2])  double2;      ///
static if (is(Vector!(float[4])))   alias Vector!(float[4])   float4;       ///
static if (is(Vector!(byte[16])))   alias Vector!(byte[16])   byte16;       ///
static if (is(Vector!(ubyte[16])))  alias Vector!(ubyte[16])  ubyte16;      ///
static if (is(Vector!(short[8])))   alias Vector!(short[8])   short8;       ///
static if (is(Vector!(ushort[8])))  alias Vector!(ushort[8])  ushort8;      ///
static if (is(Vector!(int[4])))     alias Vector!(int[4])     int4;         ///
static if (is(Vector!(uint[4])))    alias Vector!(uint[4])    uint4;        ///
static if (is(Vector!(long[2])))    alias Vector!(long[2])    long2;        ///
static if (is(Vector!(ulong[2])))   alias Vector!(ulong[2])   ulong2;       ///

static if (is(Vector!(void[32])))   alias Vector!(void[32])   void32;       ///
static if (is(Vector!(double[4])))  alias Vector!(double[4])  double4;      ///
static if (is(Vector!(float[8])))   alias Vector!(float[8])   float8;       ///
static if (is(Vector!(byte[32])))   alias Vector!(byte[32])   byte32;       ///
static if (is(Vector!(ubyte[32])))  alias Vector!(ubyte[32])  ubyte32;      ///
static if (is(Vector!(short[16])))  alias Vector!(short[16])  short16;      ///
static if (is(Vector!(ushort[16]))) alias Vector!(ushort[16]) ushort16;     ///
static if (is(Vector!(int[8])))     alias Vector!(int[8])     int8;         ///
static if (is(Vector!(uint[8])))    alias Vector!(uint[8])    uint8;        ///
static if (is(Vector!(long[4])))    alias Vector!(long[4])    long4;        ///
static if (is(Vector!(ulong[4])))   alias Vector!(ulong[4])   ulong4;       ///

static if (is(Vector!(void[64])))   alias Vector!(void[64])   void64;       ///
static if (is(Vector!(double[8])))  alias Vector!(double[8])  double8;      ///
static if (is(Vector!(float[16])))  alias Vector!(float[16])  float16;      ///
static if (is(Vector!(byte[64])))   alias Vector!(byte[64])   byte64;       ///
static if (is(Vector!(ubyte[64])))  alias Vector!(ubyte[64])  ubyte64;      ///
static if (is(Vector!(short[32])))  alias Vector!(short[32])  short32;      ///
static if (is(Vector!(ushort[32]))) alias Vector!(ushort[32]) ushort32;     ///
static if (is(Vector!(int[16])))    alias Vector!(int[16])    int16;        ///
static if (is(Vector!(uint[16])))   alias Vector!(uint[16])   uint16;       ///
static if (is(Vector!(long[8])))    alias Vector!(long[8])    long8;        ///
static if (is(Vector!(ulong[8])))   alias Vector!(ulong[8])   ulong8;       ///
89 
version (D_SIMD)
{
    /** XMM opcodes that conform to the following:
    *
    *  opcode xmm1,xmm2/mem
    *
    * and do not have side effects (i.e. do not write to memory).
    *
    * Each value encodes the instruction's mandatory prefix (if any)
    * and opcode bytes, e.g. 0xF30F58 is F3 0F 58 (ADDSS).
    */
    enum XMM
    {
        ADDSS = 0xF30F58,
        ADDSD = 0xF20F58,
        ADDPS = 0x000F58,
        ADDPD = 0x660F58,
        PADDB = 0x660FFC,
        PADDW = 0x660FFD,
        PADDD = 0x660FFE,
        PADDQ = 0x660FD4,

        SUBSS = 0xF30F5C,
        SUBSD = 0xF20F5C,
        SUBPS = 0x000F5C,
        SUBPD = 0x660F5C,
        PSUBB = 0x660FF8,
        PSUBW = 0x660FF9,
        PSUBD = 0x660FFA,
        PSUBQ = 0x660FFB,

        MULSS = 0xF30F59,
        MULSD = 0xF20F59,
        MULPS = 0x000F59,
        MULPD = 0x660F59,
        PMULLW = 0x660FD5,

        DIVSS = 0xF30F5E,
        DIVSD = 0xF20F5E,
        DIVPS = 0x000F5E,
        DIVPD = 0x660F5E,

        PAND  = 0x660FDB,
        POR   = 0x660FEB,

        UCOMISS = 0x000F2E,
        UCOMISD = 0x660F2E,

        XORPS = 0x000F57,
        XORPD = 0x660F57,

        // Use STO and LOD instead of MOV to distinguish the direction
        // (Destination is first operand, Source is second operand)
        STOSS  = 0xF30F11,        /// MOVSS xmm1/m32, xmm2
        STOSD  = 0xF20F11,        /// MOVSD xmm1/m64, xmm2
        STOAPS = 0x000F29,        /// MOVAPS xmm2/m128, xmm1
        STOAPD = 0x660F29,        /// MOVAPD xmm2/m128, xmm1
        STODQA = 0x660F7F,        /// MOVDQA xmm2/m128, xmm1
        STOD   = 0x660F7E,        /// MOVD reg/mem64, xmm   66 0F 7E /r
        STOQ   = 0x660FD6,        /// MOVQ xmm2/m64, xmm1

        LODSS  = 0xF30F10,        /// MOVSS xmm1, xmm2/m32
        LODSD  = 0xF20F10,        /// MOVSD xmm1, xmm2/m64
        LODAPS = 0x000F28,        /// MOVAPS xmm1, xmm2/m128
        LODAPD = 0x660F28,        /// MOVAPD xmm1, xmm2/m128
        LODDQA = 0x660F6F,        /// MOVDQA xmm1, xmm2/m128
        LODD   = 0x660F6E,        /// MOVD xmm, reg/mem64   66 0F 6E /r
        LODQ   = 0xF30F7E,        /// MOVQ xmm1, xmm2/m64

        LODDQU   = 0xF30F6F,      /// MOVDQU xmm1, xmm2/mem128  F3 0F 6F /r
        STODQU   = 0xF30F7F,      /// MOVDQU xmm1/mem128, xmm2  F3 0F 7F /r
        MOVDQ2Q  = 0xF20FD6,      /// MOVDQ2Q mmx, xmm          F2 0F D6 /r
        MOVHLPS  = 0x0F12,        /// MOVHLPS xmm1, xmm2        0F 12 /r
        LODHPD   = 0x660F16,      /// MOVHPD xmm1, m64
        STOHPD   = 0x660F17,      /// MOVHPD mem64, xmm1        66 0F 17 /r
        LODHPS   = 0x0F16,        /// MOVHPS xmm1, m64
        STOHPS   = 0x0F17,        /// MOVHPS m64, xmm1
        MOVLHPS  = 0x0F16,        /// MOVLHPS xmm1, xmm2
        LODLPD   = 0x660F12,      /// MOVLPD xmm1, m64
        STOLPD   = 0x660F13,      /// MOVLPD m64, xmm1
        LODLPS   = 0x0F12,        /// MOVLPS xmm1, m64
        STOLPS   = 0x0F13,        /// MOVLPS m64, xmm1
        MOVMSKPD = 0x660F50,      /// MOVMSKPD reg, xmm
        MOVMSKPS = 0x0F50,        /// MOVMSKPS reg, xmm
        MOVNTDQ  = 0x660FE7,      /// MOVNTDQ m128, xmm1
        MOVNTI   = 0x0FC3,        /// MOVNTI m32, r32
        MOVNTPD  = 0x660F2B,      /// MOVNTPD m128, xmm1
        MOVNTPS  = 0x0F2B,        /// MOVNTPS m128, xmm1
        MOVNTQ   = 0x0FE7,        /// MOVNTQ m64, mm
        MOVQ2DQ  = 0xF30FD6,      /// MOVQ2DQ
        LODUPD   = 0x660F10,      /// MOVUPD xmm1, xmm2/m128
        STOUPD   = 0x660F11,      /// MOVUPD xmm2/m128, xmm1
        LODUPS   = 0x0F10,        /// MOVUPS xmm1, xmm2/m128
        STOUPS   = 0x0F11,        /// MOVUPS xmm2/m128, xmm1

        PACKSSDW = 0x660F6B,
        PACKSSWB = 0x660F63,
        PACKUSWB = 0x660F67,
        PADDSB = 0x660FEC,
        PADDSW = 0x660FED,
        PADDUSB = 0x660FDC,
        PADDUSW = 0x660FDD,
        PANDN = 0x660FDF,
        PCMPEQB = 0x660F74,
        PCMPEQD = 0x660F76,
        PCMPEQW = 0x660F75,
        PCMPGTB = 0x660F64,
        PCMPGTD = 0x660F66,
        PCMPGTW = 0x660F65,
        PMADDWD = 0x660FF5,
        PSLLW = 0x660FF1,
        PSLLD = 0x660FF2,
        PSLLQ = 0x660FF3,
        PSRAW = 0x660FE1,
        PSRAD = 0x660FE2,
        PSRLW = 0x660FD1,
        PSRLD = 0x660FD2,
        PSRLQ = 0x660FD3,
        PSUBSB = 0x660FE8,
        PSUBSW = 0x660FE9,
        PSUBUSB = 0x660FD8,
        PSUBUSW = 0x660FD9,
        PUNPCKHBW = 0x660F68,
        PUNPCKHDQ = 0x660F6A,
        PUNPCKHWD = 0x660F69,
        PUNPCKLBW = 0x660F60,
        PUNPCKLDQ = 0x660F62,
        PUNPCKLWD = 0x660F61,
        PXOR = 0x660FEF,
        ANDPD = 0x660F54,
        ANDPS = 0x0F54,
        ANDNPD = 0x660F55,
        ANDNPS = 0x0F55,
        CMPPS = 0x0FC2,
        CMPPD = 0x660FC2,
        CMPSD = 0xF20FC2,
        CMPSS = 0xF30FC2,
        COMISD = 0x660F2F,
        COMISS = 0x0F2F,
        CVTDQ2PD = 0xF30FE6,
        CVTDQ2PS = 0x0F5B,
        CVTPD2DQ = 0xF20FE6,
        CVTPD2PI = 0x660F2D,
        CVTPD2PS = 0x660F5A,
        CVTPI2PD = 0x660F2A,
        CVTPI2PS = 0x0F2A,
        CVTPS2DQ = 0x660F5B,
        CVTPS2PD = 0x0F5A,
        CVTPS2PI = 0x0F2D,
        CVTSD2SI = 0xF20F2D,
        CVTSD2SS = 0xF20F5A,
        CVTSI2SD = 0xF20F2A,
        CVTSI2SS = 0xF30F2A,
        CVTSS2SD = 0xF30F5A,
        CVTSS2SI = 0xF30F2D,
        CVTTPD2PI = 0x660F2C,
        CVTTPD2DQ = 0x660FE6,
        CVTTPS2DQ = 0xF30F5B,
        CVTTPS2PI = 0x0F2C,
        CVTTSD2SI = 0xF20F2C,
        CVTTSS2SI = 0xF30F2C,
        MASKMOVDQU = 0x660FF7,
        MASKMOVQ = 0x0FF7,
        MAXPD = 0x660F5F,
        MAXPS = 0x0F5F,
        MAXSD = 0xF20F5F,
        MAXSS = 0xF30F5F,
        MINPD = 0x660F5D,
        MINPS = 0x0F5D,
        MINSD = 0xF20F5D,
        MINSS = 0xF30F5D,
        ORPD = 0x660F56,
        ORPS = 0x0F56,
        PAVGB = 0x660FE0,
        PAVGW = 0x660FE3,
        PMAXSW = 0x660FEE,
        //PINSRW = 0x660FC4,
        PMAXUB = 0x660FDE,
        PMINSW = 0x660FEA,
        PMINUB = 0x660FDA,
        //PMOVMSKB = 0x660FD7,
        PMULHUW = 0x660FE4,
        PMULHW = 0x660FE5,
        PMULUDQ = 0x660FF4,
        PSADBW = 0x660FF6,
        PUNPCKHQDQ = 0x660F6D,
        PUNPCKLQDQ = 0x660F6C,
        RCPPS = 0x0F53,
        RCPSS = 0xF30F53,
        RSQRTPS = 0x0F52,
        RSQRTSS = 0xF30F52,
        SQRTPD = 0x660F51,
        SHUFPD = 0x660FC6,
        SHUFPS = 0x0FC6,
        SQRTPS = 0x0F51,
        SQRTSD = 0xF20F51,
        SQRTSS = 0xF30F51,
        UNPCKHPD = 0x660F15,
        UNPCKHPS = 0x0F15,
        UNPCKLPD = 0x660F14,
        UNPCKLPS = 0x0F14,

        PSHUFD = 0x660F70,
        PSHUFHW = 0xF30F70,
        PSHUFLW = 0xF20F70,
        PSHUFW = 0x0F70,
        // The top byte of PSLLDQ/PSRLDQ carries the ModRM reg field
        // (/7 and /3) that selects the shift variant of opcode 66 0F 73.
        PSLLDQ = 0x07660F73,
        PSRLDQ = 0x03660F73,

        //PREFETCH = 0x0F18,

        // SSE3 Pentium 4 (Prescott)

        ADDSUBPD = 0x660FD0,
        ADDSUBPS = 0xF20FD0,
        HADDPD   = 0x660F7C,
        HADDPS   = 0xF20F7C,
        HSUBPD   = 0x660F7D,
        HSUBPS   = 0xF20F7D,
        MOVDDUP  = 0xF20F12,
        MOVSHDUP = 0xF30F16,
        MOVSLDUP = 0xF30F12,
        LDDQU    = 0xF20FF0,
        MONITOR  = 0x0F01C8,
        MWAIT    = 0x0F01C9,

        // SSSE3
        PALIGNR = 0x660F3A0F,
        PHADDD = 0x660F3802,
        PHADDW = 0x660F3801,
        PHADDSW = 0x660F3803,
        PABSB = 0x660F381C,
        PABSD = 0x660F381E,
        PABSW = 0x660F381D,
        PSIGNB = 0x660F3808,
        PSIGND = 0x660F380A,
        PSIGNW = 0x660F3809,
        PSHUFB = 0x660F3800,
        PMADDUBSW = 0x660F3804,
        PMULHRSW = 0x660F380B,
        PHSUBD = 0x660F3806,
        PHSUBW = 0x660F3805,
        PHSUBSW = 0x660F3807,

        // SSE4.1

        BLENDPD   = 0x660F3A0D,
        BLENDPS   = 0x660F3A0C,
        BLENDVPD  = 0x660F3815,
        BLENDVPS  = 0x660F3814,
        DPPD      = 0x660F3A41,
        DPPS      = 0x660F3A40,
        EXTRACTPS = 0x660F3A17,
        INSERTPS  = 0x660F3A21,
        MPSADBW   = 0x660F3A42,
        PBLENDVB  = 0x660F3810,
        PBLENDW   = 0x660F3A0E,
        PEXTRD    = 0x660F3A16,
        PEXTRQ    = 0x660F3A16,
        PINSRB    = 0x660F3A20,
        PINSRD    = 0x660F3A22,
        PINSRQ    = 0x660F3A22,

        MOVNTDQA = 0x660F382A,
        PACKUSDW = 0x660F382B,
        PCMPEQQ = 0x660F3829,
        PEXTRB = 0x660F3A14,
        PHMINPOSUW = 0x660F3841,
        PMAXSB = 0x660F383C,
        PMAXSD = 0x660F383D,
        PMAXUD = 0x660F383F,
        PMAXUW = 0x660F383E,
        PMINSB = 0x660F3838,
        PMINSD = 0x660F3839,
        PMINUD = 0x660F383B,
        PMINUW = 0x660F383A,
        PMOVSXBW = 0x660F3820,
        PMOVSXBD = 0x660F3821,
        PMOVSXBQ = 0x660F3822,
        PMOVSXWD = 0x660F3823,
        PMOVSXWQ = 0x660F3824,
        PMOVSXDQ = 0x660F3825,
        PMOVZXBW = 0x660F3830,
        PMOVZXBD = 0x660F3831,
        PMOVZXBQ = 0x660F3832,
        PMOVZXWD = 0x660F3833,
        PMOVZXWQ = 0x660F3834,
        PMOVZXDQ = 0x660F3835,
        PMULDQ   = 0x660F3828,
        PMULLD   = 0x660F3840,
        PTEST    = 0x660F3817,

        ROUNDPD = 0x660F3A09,
        ROUNDPS = 0x660F3A08,
        ROUNDSD = 0x660F3A0B,
        ROUNDSS = 0x660F3A0A,

        // SSE4.2
        PCMPESTRI  = 0x660F3A61,
        PCMPESTRM  = 0x660F3A60,
        PCMPISTRI  = 0x660F3A63,
        PCMPISTRM  = 0x660F3A62,
        PCMPGTQ    = 0x660F3837,
        //CRC32

        // SSE4a (AMD only)
        // EXTRQ,INSERTQ,MOVNTSD,MOVNTSS

        // POPCNT and LZCNT (have their own CPUID bits)
        POPCNT     = 0xF30FB8,
        // LZCNT
    }
399 
400     /**
401     * Generate two operand instruction with XMM 128 bit operands.
402     *
403     * This is a compiler magic function - it doesn't behave like
404     * regular D functions.
405     *
406     * Parameters:
407     *      opcode = any of the XMM opcodes; it must be a compile time constant
408     *      op1    = first operand
409     *      op2    = second operand
410     * Returns:
411     *      result of opcode
412     */
413     pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2);
414 
415     ///
416     unittest
417     {
418         float4 a;
419         a = cast(float4)__simd(XMM.PXOR, a, a);
420     }
421 
422     /**
423     * Unary SIMD instructions.
424     */
425     pure @safe void16 __simd(XMM opcode, void16 op1);
426     pure @safe void16 __simd(XMM opcode, double d);   ///
427     pure @safe void16 __simd(XMM opcode, float f);    ///
428 
429     ///
430     unittest
431     {
432         float4 a;
433         a = cast(float4)__simd(XMM.LODSS, a);
434     }
435 
436     /****
437     * For instructions:
438     * CMPPD, CMPSS, CMPSD, CMPPS,
439     * PSHUFD, PSHUFHW, PSHUFLW,
440     * BLENDPD, BLENDPS, DPPD, DPPS,
441     * MPSADBW, PBLENDW,
442     * ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS
443     * Parameters:
444     *      opcode = any of the above XMM opcodes; it must be a compile time constant
445     *      op1    = first operand
446     *      op2    = second operand
447     *      imm8   = third operand; must be a compile time constant
448     * Returns:
449     *      result of opcode
450     */
451     pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2, ubyte imm8);
452 
453     ///
454     unittest
455     {
456         float4 a;
457         a = cast(float4)__simd(XMM.CMPPD, a, a, 0x7A);
458     }
459 
460     /***
461     * For instructions with the imm8 version:
462     * PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW,
463     * PSRLDQ, PSLLDQ
464     * Parameters:
465     *      opcode = any of the XMM opcodes; it must be a compile time constant
466     *      op1    = first operand
467     *      imm8   = second operand; must be a compile time constant
468     * Returns:
469     *      result of opcode
470     */
471     pure @safe void16 __simd_ib(XMM opcode, void16 op1, ubyte imm8);
472 
473     ///
474     unittest
475     {
476         float4 a;
477         a = cast(float4) __simd_ib(XMM.PSRLQ, a, 0x7A);
478     }
479 
480     /*****
481     * For "store" operations of the form:
482     *    op1 op= op2
483     * such as MOVLPS.
484     * Returns:
485     *    op2
486     * These cannot be marked as pure, as semantic() doesn't check them.
487     */
488     @safe void16 __simd_sto(XMM opcode, void16 op1, void16 op2);
489     @safe void16 __simd_sto(XMM opcode, double op1, void16 op2); ///
490     @safe void16 __simd_sto(XMM opcode, float op1, void16 op2);  ///
491     @safe void16 __simd_sto(XMM opcode, void16 op1, long op2); ///
492 
493     ///
494     unittest
495     {
496         void16 a;
497         float f = 1;
498         double d = 1;
499 
500         cast(void)__simd_sto(XMM.STOUPS, a, a);
501         cast(void)__simd_sto(XMM.STOUPS, f, a);
502         cast(void)__simd_sto(XMM.STOUPS, d, a);
503     }
504 
505     /* The following use overloading to ensure correct typing.
506     * Compile with inlining on for best performance.
507     */
508 
509     pure @safe short8 pcmpeq()(short8 v1, short8 v2)
510     {
511         return cast(short8)__simd(XMM.PCMPEQW, v1, v2);
512     }
513 
514     pure @safe ushort8 pcmpeq()(ushort8 v1, ushort8 v2)
515     {
516         return cast(ushort8)__simd(XMM.PCMPEQW, v1, v2);
517     }
518 
519     /*********************
520     * Emit prefetch instruction.
521     * Params:
522     *    address = address to be prefetched
523     *    writeFetch = true for write fetch, false for read fetch
524     *    locality = 0..3 (0 meaning least local, 3 meaning most local)
525     * Note:
526     *    The Intel mappings are:
527     *    $(TABLE
528     *    $(THEAD writeFetch, locality, Instruction)
529     *    $(TROW false, 0, prefetchnta)
530     *    $(TROW false, 1, prefetch2)
531     *    $(TROW false, 2, prefetch1)
532     *    $(TROW false, 3, prefetch0)
533     *    $(TROW true, 0, prefetchw)
534     *    $(TROW true, 1, prefetchw)
535     *    $(TROW true, 2, prefetchw)
536     *    $(TROW true, 3, prefetchw)
537     *    )
538     */
539     void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
540     {
541         static if (writeFetch)
542             __prefetch(address, 4);
543         else static if (locality < 4)
544             __prefetch(address, 3 - locality);
545         else
546             static assert(0, "0..3 expected for locality");
547     }
548 
549     private void __prefetch(const(void*) address, ubyte encoding);
550 
551     /*************************************
552     * Load unaligned vector from address.
553     * This is a compiler intrinsic.
554     * Params:
555     *    p = pointer to vector
556     * Returns:
557     *    vector
558     */
559 
560     V loadUnaligned(V)(const V* p)
561         if (is(V == void16) ||
562             is(V == byte16) ||
563             is(V == ubyte16) ||
564             is(V == short8) ||
565             is(V == ushort8) ||
566             is(V == int4) ||
567             is(V == uint4) ||
568             is(V == long2) ||
569             is(V == ulong2) ||
570             is(V == double2) ||
571             is(V == float4))
572     {
573         pragma(inline, true);
574         static if (is(V == double2))
575             return cast(V)__simd(XMM.LODUPD, *cast(const void16*)p);
576         else static if (is(V == float4))
577             return cast(V)__simd(XMM.LODUPS, *cast(const void16*)p);
578         else
579             return cast(V)__simd(XMM.LODDQU, *cast(const void16*)p);
580     }
581 
582     @system
583     unittest
584     {
585         // Memory to load into the vector:
586         // Should have enough data to test all 16-byte alignments, and still
587         // have room for a 16-byte vector
588         ubyte[32] data;
589         foreach (i; 0..data.length)
590         {
591             data[i] = cast(ubyte)i;
592         }
593 
594         // to test all alignments from 1 ~ 16
595         foreach (i; 0..16)
596         {
597             ubyte* d = &data[i];
598 
599             void test(T)()
600             {
601                 // load the data
602                 T v = loadUnaligned(cast(T*)d);
603 
604                 // check that the data was loaded correctly
605                 ubyte* ptrToV = cast(ubyte*)&v;
606                 foreach (j; 0..T.sizeof)
607                 {
608                     assert(ptrToV[j] == d[j]);
609                 }
610             }
611 
612             test!void16();
613             test!byte16();
614             test!ubyte16();
615             test!short8();
616             test!ushort8();
617             test!int4();
618             test!uint4();
619             test!long2();
620             test!ulong2();
621             test!double2();
622             test!float4();
623         }
624     }
625 
626     /*************************************
627     * Store vector to unaligned address.
628     * This is a compiler intrinsic.
629     * Params:
630     *    p = pointer to vector
631     *    value = value to store
632     * Returns:
633     *    value
634     */
635 
636     V storeUnaligned(V)(V* p, V value)
637         if (is(V == void16) ||
638             is(V == byte16) ||
639             is(V == ubyte16) ||
640             is(V == short8) ||
641             is(V == ushort8) ||
642             is(V == int4) ||
643             is(V == uint4) ||
644             is(V == long2) ||
645             is(V == ulong2) ||
646             is(V == double2) ||
647             is(V == float4))
648     {
649         pragma(inline, true);
650         static if (is(V == double2))
651             return cast(V)__simd_sto(XMM.STOUPD, *cast(void16*)p, value);
652         else static if (is(V == float4))
653             return cast(V)__simd_sto(XMM.STOUPS, *cast(void16*)p, value);
654         else
655             return cast(V)__simd_sto(XMM.STODQU, *cast(void16*)p, value);
656     }
657 
658     @system
659     unittest
660     {
661         // Memory to store the vector to:
662         // Should have enough data to test all 16-byte alignments, and still
663         // have room for a 16-byte vector
664         ubyte[32] data;
665 
666         // to test all alignments from 1 ~ 16
667         foreach (i; 0..16)
668         {
669             ubyte* d = &data[i];
670 
671             void test(T)()
672             {
673                 T v;
674 
675                 // populate v` with data
676                 ubyte* ptrToV = cast(ubyte*)&v;
677                 foreach (j; 0..T.sizeof)
678                 {
679                     ptrToV[j] = cast(ubyte)j;
680                 }
681 
682                 // store `v` to location pointed to by `d`
683                 storeUnaligned(cast(T*)d, v);
684 
685                 // check that the the data was stored correctly
686                 foreach (j; 0..T.sizeof)
687                 {
688                     assert(ptrToV[j] == d[j]);
689                 }
690             }
691 
692             test!void16();
693             test!byte16();
694             test!ubyte16();
695             test!short8();
696             test!ushort8();
697             test!int4();
698             test!uint4();
699             test!long2();
700             test!ulong2();
701             test!double2();
702             test!float4();
703         }
704     }
705 }
706