// Written in the D programming language.

/**
 * Builtin SIMD intrinsics
 *
 * Source: $(DRUNTIMESRC core/_simd.d)
 *
 * Copyright: Copyright Digital Mars 2012-2020
 * License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors: $(HTTP digitalmars.com, Walter Bright),
 * Source: $(DRUNTIMESRC core/_simd.d)
 */

module core.simd;

// Attribute labels: every declaration below this point in the module is
// pure, nothrow, @safe and @nogc unless it explicitly overrides these.
pure:
nothrow:
@safe:
@nogc:

/*******************************
 * Create a vector type.
 *
 * Parameters:
 *      T = one of double[2], float[4], void[16], byte[16], ubyte[16],
 *      short[8], ushort[8], int[4], uint[4], long[2], ulong[2].
 *      For 256 bit vectors,
 *      one of double[4], float[8], void[32], byte[32], ubyte[32],
 *      short[16], ushort[16], int[8], uint[8], long[4], ulong[4]
 */

template Vector(T)
{
    /* __vector is compiler magic, hide it behind a template.
     * The compiler will reject T's that don't work.
     */
    alias __vector(T) Vector;
}

/* Handy aliases for the vector types.
 * Each alias is declared only if the target supports that vector size,
 * which is what the static if (is(...)) guards check.
 */

// 64-bit vectors
static if (is(Vector!(void[8])))    alias Vector!(void[8])    void8;        ///
static if (is(Vector!(double[1])))  alias Vector!(double[1])  double1;      ///
static if (is(Vector!(float[2])))   alias Vector!(float[2])   float2;       ///
static if (is(Vector!(byte[8])))    alias Vector!(byte[8])    byte8;        ///
static if (is(Vector!(ubyte[8])))   alias Vector!(ubyte[8])   ubyte8;       ///
static if (is(Vector!(short[4])))   alias Vector!(short[4])   short4;       ///
static if (is(Vector!(ushort[4])))  alias Vector!(ushort[4])  ushort4;      ///
static if (is(Vector!(int[2])))     alias Vector!(int[2])     int2;         ///
static if (is(Vector!(uint[2])))    alias Vector!(uint[2])    uint2;        ///
static if (is(Vector!(long[1])))    alias Vector!(long[1])    long1;        ///
static if (is(Vector!(ulong[1])))   alias Vector!(ulong[1])   ulong1;       ///

// 128-bit vectors
static if (is(Vector!(void[16])))   alias Vector!(void[16])   void16;       ///
static if (is(Vector!(double[2])))  alias Vector!(double[2])  double2;      ///
static if (is(Vector!(float[4])))   alias Vector!(float[4])   float4;       ///
static if (is(Vector!(byte[16])))   alias Vector!(byte[16])   byte16;       ///
static if (is(Vector!(ubyte[16])))  alias Vector!(ubyte[16])  ubyte16;      ///
static if (is(Vector!(short[8])))   alias Vector!(short[8])   short8;       ///
static if (is(Vector!(ushort[8])))  alias Vector!(ushort[8])  ushort8;      ///
static if (is(Vector!(int[4])))     alias Vector!(int[4])     int4;         ///
static if (is(Vector!(uint[4])))    alias Vector!(uint[4])    uint4;        ///
static if (is(Vector!(long[2])))    alias Vector!(long[2])    long2;        ///
static if (is(Vector!(ulong[2])))   alias Vector!(ulong[2])   ulong2;       ///

// 256-bit vectors
static if (is(Vector!(void[32])))   alias Vector!(void[32])   void32;       ///
static if (is(Vector!(double[4])))  alias Vector!(double[4])  double4;      ///
static if (is(Vector!(float[8])))   alias Vector!(float[8])   float8;       ///
static if (is(Vector!(byte[32])))   alias Vector!(byte[32])   byte32;       ///
static if (is(Vector!(ubyte[32])))  alias Vector!(ubyte[32])  ubyte32;      ///
static if (is(Vector!(short[16])))  alias Vector!(short[16])  short16;      ///
static if (is(Vector!(ushort[16]))) alias Vector!(ushort[16]) ushort16;     ///
static if (is(Vector!(int[8])))     alias Vector!(int[8])     int8;         ///
static if (is(Vector!(uint[8])))    alias Vector!(uint[8])    uint8;        ///
static if (is(Vector!(long[4])))    alias Vector!(long[4])    long4;        ///
static if (is(Vector!(ulong[4])))   alias Vector!(ulong[4])   ulong4;       ///

// 512-bit vectors
static if (is(Vector!(void[64])))   alias Vector!(void[64])   void64;       ///
static if (is(Vector!(double[8])))  alias Vector!(double[8])  double8;      ///
static if (is(Vector!(float[16])))  alias Vector!(float[16])  float16;      ///
static if (is(Vector!(byte[64])))   alias Vector!(byte[64])   byte64;       ///
static if (is(Vector!(ubyte[64])))  alias Vector!(ubyte[64])  ubyte64;      ///
static if (is(Vector!(short[32])))  alias Vector!(short[32])  short32;      ///
static if (is(Vector!(ushort[32]))) alias Vector!(ushort[32]) ushort32;     ///
static if (is(Vector!(int[16])))    alias Vector!(int[16])    int16;        ///
static if (is(Vector!(uint[16])))   alias Vector!(uint[16])   uint16;       ///
static if (is(Vector!(long[8])))    alias Vector!(long[8])    long8;        ///
static if (is(Vector!(ulong[8])))   alias Vector!(ulong[8])   ulong8;       ///

version (D_SIMD)
{
    /** XMM opcodes that conform to the following:
     *
     *  opcode xmm1,xmm2/mem
     *
     * and do not have side effects (i.e. do not write to memory).
     */
    enum XMM
    {
        ADDSS = 0xF30F58,
        ADDSD = 0xF20F58,
        ADDPS = 0x000F58,
        ADDPD = 0x660F58,
        PADDB = 0x660FFC,
        PADDW = 0x660FFD,
        PADDD = 0x660FFE,
        PADDQ = 0x660FD4,

        SUBSS = 0xF30F5C,
        SUBSD = 0xF20F5C,
        SUBPS = 0x000F5C,
        SUBPD = 0x660F5C,
        PSUBB = 0x660FF8,
        PSUBW = 0x660FF9,
        PSUBD = 0x660FFA,
        PSUBQ = 0x660FFB,

        MULSS = 0xF30F59,
        MULSD = 0xF20F59,
        MULPS = 0x000F59,
        MULPD = 0x660F59,
        PMULLW = 0x660FD5,

        DIVSS = 0xF30F5E,
        DIVSD = 0xF20F5E,
        DIVPS = 0x000F5E,
        DIVPD = 0x660F5E,

        PAND = 0x660FDB,
        POR = 0x660FEB,

        UCOMISS = 0x000F2E,
        UCOMISD = 0x660F2E,

        XORPS = 0x000F57,
        XORPD = 0x660F57,

        // Use STO and LOD instead of MOV to distinguish the direction
        // (Destination is first operand, Source is second operand)
        STOSS = 0xF30F11,       /// MOVSS xmm1/m32, xmm2
        STOSD = 0xF20F11,       /// MOVSD xmm1/m64, xmm2
        STOAPS = 0x000F29,      /// MOVAPS xmm2/m128, xmm1
        STOAPD = 0x660F29,      /// MOVAPD xmm2/m128, xmm1
        STODQA = 0x660F7F,      /// MOVDQA xmm2/m128, xmm1
        STOD = 0x660F7E,        /// MOVD reg/mem64, xmm 66 0F 7E /r
        STOQ = 0x660FD6,        /// MOVQ xmm2/m64, xmm1

        LODSS = 0xF30F10,       /// MOVSS xmm1, xmm2/m32
        LODSD = 0xF20F10,       /// MOVSD xmm1, xmm2/m64
        LODAPS = 0x000F28,      /// MOVAPS xmm1, xmm2/m128
        LODAPD = 0x660F28,      /// MOVAPD xmm1, xmm2/m128
        LODDQA = 0x660F6F,      /// MOVDQA xmm1, xmm2/m128
        LODD = 0x660F6E,        /// MOVD xmm, reg/mem64 66 0F 6E /r
        LODQ = 0xF30F7E,        /// MOVQ xmm1, xmm2/m64

        LODDQU = 0xF30F6F,      /// MOVDQU xmm1, xmm2/mem128 F3 0F 6F /r
        STODQU = 0xF30F7F,      /// MOVDQU xmm1/mem128, xmm2 F3 0F 7F /r
        MOVDQ2Q = 0xF20FD6,     /// MOVDQ2Q mmx, xmm F2 0F D6 /r
        MOVHLPS = 0x0F12,       /// MOVHLPS xmm1, xmm2 0F 12 /r
        LODHPD = 0x660F16,      /// MOVHPD xmm1, m64
        STOHPD = 0x660F17,      /// MOVHPD mem64, xmm1 66 0F 17 /r
        LODHPS = 0x0F16,        /// MOVHPS xmm1, m64
        STOHPS = 0x0F17,        /// MOVHPS m64, xmm1
        MOVLHPS = 0x0F16,       /// MOVLHPS xmm1, xmm2
        LODLPD = 0x660F12,      /// MOVLPD xmm1, m64
        STOLPD = 0x660F13,      /// MOVLPD m64, xmm1
        LODLPS = 0x0F12,        /// MOVLPS xmm1, m64
        STOLPS = 0x0F13,        /// MOVLPS m64, xmm1
        MOVMSKPD = 0x660F50,    /// MOVMSKPD reg, xmm
        MOVMSKPS = 0x0F50,      /// MOVMSKPS reg, xmm
        MOVNTDQ = 0x660FE7,     /// MOVNTDQ m128, xmm1
        MOVNTI = 0x0FC3,        /// MOVNTI m32, r32
        MOVNTPD = 0x660F2B,     /// MOVNTPD m128, xmm1
        MOVNTPS = 0x0F2B,       /// MOVNTPS m128, xmm1
        MOVNTQ = 0x0FE7,        /// MOVNTQ m64, mm
        MOVQ2DQ = 0xF30FD6,     /// MOVQ2DQ
        LODUPD = 0x660F10,      /// MOVUPD xmm1, xmm2/m128
        STOUPD = 0x660F11,      /// MOVUPD xmm2/m128, xmm1
        LODUPS = 0x0F10,        /// MOVUPS xmm1, xmm2/m128
        STOUPS = 0x0F11,        /// MOVUPS xmm2/m128, xmm1

        PACKSSDW = 0x660F6B,
        PACKSSWB = 0x660F63,
        PACKUSWB = 0x660F67,
        PADDSB = 0x660FEC,
        PADDSW = 0x660FED,
        PADDUSB = 0x660FDC,
        PADDUSW = 0x660FDD,
        PANDN = 0x660FDF,
        PCMPEQB = 0x660F74,
        PCMPEQD = 0x660F76,
        PCMPEQW = 0x660F75,
        PCMPGTB = 0x660F64,
        PCMPGTD = 0x660F66,
        PCMPGTW = 0x660F65,
        PMADDWD = 0x660FF5,
        PSLLW = 0x660FF1,
        PSLLD = 0x660FF2,
        PSLLQ = 0x660FF3,
        PSRAW = 0x660FE1,
        PSRAD = 0x660FE2,
        PSRLW = 0x660FD1,
        PSRLD = 0x660FD2,
        PSRLQ = 0x660FD3,
        PSUBSB = 0x660FE8,
        PSUBSW = 0x660FE9,
        PSUBUSB = 0x660FD8,
        PSUBUSW = 0x660FD9,
        PUNPCKHBW = 0x660F68,
        PUNPCKHDQ = 0x660F6A,
        PUNPCKHWD = 0x660F69,
        PUNPCKLBW = 0x660F60,
        PUNPCKLDQ = 0x660F62,
        PUNPCKLWD = 0x660F61,
        PXOR = 0x660FEF,
        ANDPD = 0x660F54,
        ANDPS = 0x0F54,
        ANDNPD = 0x660F55,
        ANDNPS = 0x0F55,
        CMPPS = 0x0FC2,
        CMPPD = 0x660FC2,
        CMPSD = 0xF20FC2,
        CMPSS = 0xF30FC2,
        COMISD = 0x660F2F,
        COMISS = 0x0F2F,
        CVTDQ2PD = 0xF30FE6,
        CVTDQ2PS = 0x0F5B,
        CVTPD2DQ = 0xF20FE6,
        CVTPD2PI = 0x660F2D,
        CVTPD2PS = 0x660F5A,
        CVTPI2PD = 0x660F2A,
        CVTPI2PS = 0x0F2A,
        CVTPS2DQ = 0x660F5B,
        CVTPS2PD = 0x0F5A,
        CVTPS2PI = 0x0F2D,
        CVTSD2SI = 0xF20F2D,
        CVTSD2SS = 0xF20F5A,
        CVTSI2SD = 0xF20F2A,
        CVTSI2SS = 0xF30F2A,
        CVTSS2SD = 0xF30F5A,
        CVTSS2SI = 0xF30F2D,
        CVTTPD2PI = 0x660F2C,
        CVTTPD2DQ = 0x660FE6,
        CVTTPS2DQ = 0xF30F5B,
        CVTTPS2PI = 0x0F2C,
        CVTTSD2SI = 0xF20F2C,
        CVTTSS2SI = 0xF30F2C,
        MASKMOVDQU = 0x660FF7,
        MASKMOVQ = 0x0FF7,
        MAXPD = 0x660F5F,
        MAXPS = 0x0F5F,
        MAXSD = 0xF20F5F,
        MAXSS = 0xF30F5F,
        MINPD = 0x660F5D,
        MINPS = 0x0F5D,
        MINSD = 0xF20F5D,
        MINSS = 0xF30F5D,
        ORPD = 0x660F56,
        ORPS = 0x0F56,
        PAVGB = 0x660FE0,
        PAVGW = 0x660FE3,
        PMAXSW = 0x660FEE,
        //PINSRW = 0x660FC4,
        PMAXUB = 0x660FDE,
        PMINSW = 0x660FEA,
        PMINUB = 0x660FDA,
        //PMOVMSKB = 0x660FD7,
        PMULHUW = 0x660FE4,
        PMULHW = 0x660FE5,
        PMULUDQ = 0x660FF4,
        PSADBW = 0x660FF6,
        PUNPCKHQDQ = 0x660F6D,
        PUNPCKLQDQ = 0x660F6C,
        RCPPS = 0x0F53,
        RCPSS = 0xF30F53,
        RSQRTPS = 0x0F52,
        RSQRTSS = 0xF30F52,
        SQRTPD = 0x660F51,
        SHUFPD = 0x660FC6,
        SHUFPS = 0x0FC6,
        SQRTPS = 0x0F51,
        SQRTSD = 0xF20F51,
        SQRTSS = 0xF30F51,
        UNPCKHPD = 0x660F15,
        UNPCKHPS = 0x0F15,
        UNPCKLPD = 0x660F14,
        UNPCKLPS = 0x0F14,

        PSHUFD = 0x660F70,
        PSHUFHW = 0xF30F70,
        PSHUFLW = 0xF20F70,
        PSHUFW = 0x0F70,
        // The extra high byte encodes the fixed ModRM reg field (/7 and /3)
        // for the immediate-shift forms of these instructions.
        PSLLDQ = 0x07660F73,
        PSRLDQ = 0x03660F73,

        //PREFETCH = 0x0F18,

        // SSE3 Pentium 4 (Prescott)

        ADDSUBPD = 0x660FD0,
        ADDSUBPS = 0xF20FD0,
        HADDPD = 0x660F7C,
        HADDPS = 0xF20F7C,
        HSUBPD = 0x660F7D,
        HSUBPS = 0xF20F7D,
        MOVDDUP = 0xF20F12,
        MOVSHDUP = 0xF30F16,
        MOVSLDUP = 0xF30F12,
        LDDQU = 0xF20FF0,
        MONITOR = 0x0F01C8,
        MWAIT = 0x0F01C9,

        // SSSE3
        PALIGNR = 0x660F3A0F,
        PHADDD = 0x660F3802,
        PHADDW = 0x660F3801,
        PHADDSW = 0x660F3803,
        PABSB = 0x660F381C,
        PABSD = 0x660F381E,
        PABSW = 0x660F381D,
        PSIGNB = 0x660F3808,
        PSIGND = 0x660F380A,
        PSIGNW = 0x660F3809,
        PSHUFB = 0x660F3800,
        PMADDUBSW = 0x660F3804,
        PMULHRSW = 0x660F380B,
        PHSUBD = 0x660F3806,
        PHSUBW = 0x660F3805,
        PHSUBSW = 0x660F3807,

        // SSE4.1

        BLENDPD = 0x660F3A0D,
        BLENDPS = 0x660F3A0C,
        BLENDVPD = 0x660F3815,
        BLENDVPS = 0x660F3814,
        DPPD = 0x660F3A41,
        DPPS = 0x660F3A40,
        EXTRACTPS = 0x660F3A17,
        INSERTPS = 0x660F3A21,
        MPSADBW = 0x660F3A42,
        PBLENDVB = 0x660F3810,
        PBLENDW = 0x660F3A0E,
        PEXTRD = 0x660F3A16,
        PEXTRQ = 0x660F3A16,
        PINSRB = 0x660F3A20,
        PINSRD = 0x660F3A22,
        PINSRQ = 0x660F3A22,

        MOVNTDQA = 0x660F382A,
        PACKUSDW = 0x660F382B,
        PCMPEQQ = 0x660F3829,
        PEXTRB = 0x660F3A14,
        PHMINPOSUW = 0x660F3841,
        PMAXSB = 0x660F383C,
        PMAXSD = 0x660F383D,
        PMAXUD = 0x660F383F,
        PMAXUW = 0x660F383E,
        PMINSB = 0x660F3838,
        PMINSD = 0x660F3839,
        PMINUD = 0x660F383B,
        PMINUW = 0x660F383A,
        PMOVSXBW = 0x660F3820,
        PMOVSXBD = 0x660F3821,
        PMOVSXBQ = 0x660F3822,
        PMOVSXWD = 0x660F3823,
        PMOVSXWQ = 0x660F3824,
        PMOVSXDQ = 0x660F3825,
        PMOVZXBW = 0x660F3830,
        PMOVZXBD = 0x660F3831,
        PMOVZXBQ = 0x660F3832,
        PMOVZXWD = 0x660F3833,
        PMOVZXWQ = 0x660F3834,
        PMOVZXDQ = 0x660F3835,
        PMULDQ = 0x660F3828,
        PMULLD = 0x660F3840,
        PTEST = 0x660F3817,

        ROUNDPD = 0x660F3A09,
        ROUNDPS = 0x660F3A08,
        ROUNDSD = 0x660F3A0B,
        ROUNDSS = 0x660F3A0A,

        // SSE4.2
        PCMPESTRI = 0x660F3A61,
        PCMPESTRM = 0x660F3A60,
        PCMPISTRI = 0x660F3A63,
        PCMPISTRM = 0x660F3A62,
        PCMPGTQ = 0x660F3837,
        //CRC32

        // SSE4a (AMD only)
        // EXTRQ,INSERTQ,MOVNTSD,MOVNTSS

        // POPCNT and LZCNT (have their own CPUID bits)
        POPCNT = 0xF30FB8,
        // LZCNT
    }

    /**
     * Generate two operand instruction with XMM 128 bit operands.
     *
     * This is a compiler magic function - it doesn't behave like
     * regular D functions.
     *
     * Parameters:
     *      opcode = any of the XMM opcodes; it must be a compile time constant
     *      op1 = first operand
     *      op2 = second operand
     * Returns:
     *      result of opcode
     */
    pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2);

    ///
    unittest
    {
        float4 a;
        a = cast(float4)__simd(XMM.PXOR, a, a);
    }

    /**
     * Unary SIMD instructions.
     */
    pure @safe void16 __simd(XMM opcode, void16 op1);
    pure @safe void16 __simd(XMM opcode, double d);   ///
    pure @safe void16 __simd(XMM opcode, float f);    ///

    ///
    unittest
    {
        float4 a;
        a = cast(float4)__simd(XMM.LODSS, a);
    }

    /****
     * For instructions:
     * CMPPD, CMPSS, CMPSD, CMPPS,
     * PSHUFD, PSHUFHW, PSHUFLW,
     * BLENDPD, BLENDPS, DPPD, DPPS,
     * MPSADBW, PBLENDW,
     * ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS
     * Parameters:
     *      opcode = any of the above XMM opcodes; it must be a compile time constant
     *      op1 = first operand
     *      op2 = second operand
     *      imm8 = third operand; must be a compile time constant
     * Returns:
     *      result of opcode
     */
    pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2, ubyte imm8);

    ///
    unittest
    {
        float4 a;
        a = cast(float4)__simd(XMM.CMPPD, a, a, 0x7A);
    }

    /***
     * For instructions with the imm8 version:
     * PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW,
     * PSRLDQ, PSLLDQ
     * Parameters:
     *      opcode = any of the XMM opcodes; it must be a compile time constant
     *      op1 = first operand
     *      imm8 = second operand; must be a compile time constant
     * Returns:
     *      result of opcode
     */
    pure @safe void16 __simd_ib(XMM opcode, void16 op1, ubyte imm8);

    ///
    unittest
    {
        float4 a;
        a = cast(float4) __simd_ib(XMM.PSRLQ, a, 0x7A);
    }

    /*****
     * For "store" operations of the form:
     *    op1 op= op2
     * such as MOVLPS.
     * Returns:
     *    op2
     * These cannot be marked as pure, as semantic() doesn't check them.
     */
    @safe void16 __simd_sto(XMM opcode, void16 op1, void16 op2);
    @safe void16 __simd_sto(XMM opcode, double op1, void16 op2);  ///
    @safe void16 __simd_sto(XMM opcode, float op1, void16 op2);   ///
    @safe void16 __simd_sto(XMM opcode, void16 op1, long op2);    ///

    ///
    unittest
    {
        void16 a;
        float f = 1;
        double d = 1;

        cast(void)__simd_sto(XMM.STOUPS, a, a);
        cast(void)__simd_sto(XMM.STOUPS, f, a);
        cast(void)__simd_sto(XMM.STOUPS, d, a);
    }

    /* The following use overloading to ensure correct typing.
     * Compile with inlining on for best performance.
     */

    pure @safe short8 pcmpeq()(short8 v1, short8 v2)
    {
        return cast(short8)__simd(XMM.PCMPEQW, v1, v2);
    }

    pure @safe ushort8 pcmpeq()(ushort8 v1, ushort8 v2)
    {
        return cast(ushort8)__simd(XMM.PCMPEQW, v1, v2);
    }

    /*********************
     * Emit prefetch instruction.
     * Params:
     *    address = address to be prefetched
     *    writeFetch = true for write fetch, false for read fetch
     *    locality = 0..3 (0 meaning least local, 3 meaning most local)
     * Note:
     *    The Intel mappings are:
     *    $(TABLE
     *    $(THEAD writeFetch, locality, Instruction)
     *    $(TROW false, 0, prefetchnta)
     *    $(TROW false, 1, prefetch2)
     *    $(TROW false, 2, prefetch1)
     *    $(TROW false, 3, prefetch0)
     *    $(TROW true, 0, prefetchw)
     *    $(TROW true, 1, prefetchw)
     *    $(TROW true, 2, prefetchw)
     *    $(TROW true, 3, prefetchw)
     *    )
     */
    void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
    {
        // Write fetches always map to prefetchw (encoding 4); locality is
        // only meaningful (and validated) for read fetches.
        static if (writeFetch)
            __prefetch(address, 4);
        else static if (locality < 4)
            __prefetch(address, 3 - locality);
        else
            static assert(0, "0..3 expected for locality");
    }

    // Compiler intrinsic backing prefetch(); encoding selects the
    // prefetch instruction variant as documented above.
    private void __prefetch(const(void*) address, ubyte encoding);

    /*************************************
     * Load unaligned vector from address.
     * This is a compiler intrinsic.
     * Params:
     *    p = pointer to vector
     * Returns:
     *    vector
     */

    V loadUnaligned(V)(const V* p)
        if (is(V == void16) ||
            is(V == byte16) ||
            is(V == ubyte16) ||
            is(V == short8) ||
            is(V == ushort8) ||
            is(V == int4) ||
            is(V == uint4) ||
            is(V == long2) ||
            is(V == ulong2) ||
            is(V == double2) ||
            is(V == float4))
    {
        pragma(inline, true);
        // Pick the unaligned-load instruction that matches the element type.
        static if (is(V == double2))
            return cast(V)__simd(XMM.LODUPD, *cast(const void16*)p);
        else static if (is(V == float4))
            return cast(V)__simd(XMM.LODUPS, *cast(const void16*)p);
        else
            return cast(V)__simd(XMM.LODDQU, *cast(const void16*)p);
    }

    @system
    unittest
    {
        // Memory to load into the vector:
        // Should have enough data to test all 16-byte alignments, and still
        // have room for a 16-byte vector
        ubyte[32] data;
        foreach (i; 0..data.length)
        {
            data[i] = cast(ubyte)i;
        }

        // to test all alignments from 1 ~ 16
        foreach (i; 0..16)
        {
            ubyte* d = &data[i];

            void test(T)()
            {
                // load the data
                T v = loadUnaligned(cast(T*)d);

                // check that the data was loaded correctly
                ubyte* ptrToV = cast(ubyte*)&v;
                foreach (j; 0..T.sizeof)
                {
                    assert(ptrToV[j] == d[j]);
                }
            }

            test!void16();
            test!byte16();
            test!ubyte16();
            test!short8();
            test!ushort8();
            test!int4();
            test!uint4();
            test!long2();
            test!ulong2();
            test!double2();
            test!float4();
        }
    }

    /*************************************
     * Store vector to unaligned address.
     * This is a compiler intrinsic.
     * Params:
     *    p = pointer to vector
     *    value = value to store
     * Returns:
     *    value
     */

    V storeUnaligned(V)(V* p, V value)
        if (is(V == void16) ||
            is(V == byte16) ||
            is(V == ubyte16) ||
            is(V == short8) ||
            is(V == ushort8) ||
            is(V == int4) ||
            is(V == uint4) ||
            is(V == long2) ||
            is(V == ulong2) ||
            is(V == double2) ||
            is(V == float4))
    {
        pragma(inline, true);
        // Pick the unaligned-store instruction that matches the element type.
        static if (is(V == double2))
            return cast(V)__simd_sto(XMM.STOUPD, *cast(void16*)p, value);
        else static if (is(V == float4))
            return cast(V)__simd_sto(XMM.STOUPS, *cast(void16*)p, value);
        else
            return cast(V)__simd_sto(XMM.STODQU, *cast(void16*)p, value);
    }

    @system
    unittest
    {
        // Memory to store the vector to:
        // Should have enough data to test all 16-byte alignments, and still
        // have room for a 16-byte vector
        ubyte[32] data;

        // to test all alignments from 1 ~ 16
        foreach (i; 0..16)
        {
            ubyte* d = &data[i];

            void test(T)()
            {
                T v;

                // populate `v` with data
                ubyte* ptrToV = cast(ubyte*)&v;
                foreach (j; 0..T.sizeof)
                {
                    ptrToV[j] = cast(ubyte)j;
                }

                // store `v` to location pointed to by `d`
                storeUnaligned(cast(T*)d, v);

                // check that the data was stored correctly
                foreach (j; 0..T.sizeof)
                {
                    assert(ptrToV[j] == d[j]);
                }
            }

            test!void16();
            test!byte16();
            test!ubyte16();
            test!short8();
            test!ushort8();
            test!int4();
            test!uint4();
            test!long2();
            test!ulong2();
            test!double2();
            test!float4();
        }
    }
}
