; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X64
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X86

define <8 x half> @broadcastph128(ptr %x) {
; X64-LABEL: broadcastph128:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: broadcastph128:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpbroadcastw (%eax), %xmm0
; X86-NEXT: retl
  %l1 = load half, ptr %x, align 2
  %vec = insertelement <8 x half> undef, half %l1, i32 0
  %res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %res
}

define <16 x half> @broadcastph256(ptr %x) {
; X64-LABEL: broadcastph256:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw (%rdi), %ymm0
; X64-NEXT: retq
;
; X86-LABEL: broadcastph256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpbroadcastw (%eax), %ymm0
; X86-NEXT: retl
  %l1 = load half, ptr %x, align 2
  %vec = insertelement <16 x half> undef, half %l1, i32 0
  %res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer
  ret <16 x half> %res
}

define <32 x half> @broadcastph512(ptr %x) {
; X64-LABEL: broadcastph512:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw (%rdi), %zmm0
; X64-NEXT: retq
;
; X86-LABEL: broadcastph512:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpbroadcastw (%eax), %zmm0
; X86-NEXT: retl
  %l1 = load half, ptr %x, align 2
  %vec = insertelement <32 x half> undef, half %l1, i32 0
  %res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer
  ret <32 x half> %res
}

define <8 x half> @broadcastph128_scalar(half %x) {
; X64-LABEL: broadcastph128_scalar:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: broadcastph128_scalar:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
  %vec = insertelement <8 x half> undef, half %x, i32 0
  %res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %res
}

define <16 x half> @broadcastph256_scalar(half %x) {
; X64-LABEL: broadcastph256_scalar:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw %xmm0, %ymm0
; X64-NEXT: retq
;
; X86-LABEL: broadcastph256_scalar:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0
; X86-NEXT: retl
  %vec = insertelement <16 x half> undef, half %x, i32 0
  %res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer
  ret <16 x half> %res
}

define <32 x half> @broadcastph512_scalar(half %x) {
; X64-LABEL: broadcastph512_scalar:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw %xmm0, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: broadcastph512_scalar:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0
; X86-NEXT: retl
  %vec = insertelement <32 x half> undef, half %x, i32 0
  %res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer
  ret <32 x half> %res
}

define <8 x half> @broadcastph128_reg(<8 x half> %x) {
; CHECK-LABEL: broadcastph128_reg:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x half> %x, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %res
}

define <16 x half> @broadcastph256_reg(<16 x half> %x) {
; CHECK-LABEL: broadcastph256_reg:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x half> %x, <16 x half> undef, <16 x i32> zeroinitializer
  ret <16 x half> %res
}

define <32 x half> @broadcastph512_reg(<32 x half> %x) {
; CHECK-LABEL: broadcastph512_reg:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <32 x half> %x, <32 x half> undef, <32 x i32> zeroinitializer
  ret <32 x half> %res
}

define i16 @test1(half %x) {
; X64-LABEL: test1:
; X64: # %bb.0:
; X64-NEXT: vmovw %xmm0, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
; X86-LABEL: test1:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
  %res = bitcast half %x to i16
  ret i16 %res
}

define <8 x i16> @test2(i16 %x) {
; X64-LABEL: test2:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test2:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
  %res = insertelement <8 x i16>undef, i16 %x, i32 0
  ret <8 x i16>%res
}

define <8 x i16> @test4(ptr %x) {
; X64-LABEL: test4:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test4:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpbroadcastw (%eax), %xmm0
; X86-NEXT: retl
  %y = load i16, ptr %x
  %res = insertelement <8 x i16>undef, i16 %y, i32 0
  ret <8 x i16>%res
}

define void @test5(half %x, ptr %y) {
; X64-LABEL: test5:
; X64: # %bb.0:
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: test5:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
  store half %x, ptr %y, align 2
  ret void
}

define half @test7(ptr %x) {
; X64-LABEL: test7:
; X64: # %bb.0:
; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: retq
;
; X86-LABEL: test7:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: retl
  %y = load i16, ptr %x
  %res = bitcast i16 %y to half
  ret half %res
}

define <8 x i16> @test10(ptr %x) {
; X64-LABEL: test10:
; X64: # %bb.0:
; X64-NEXT: vmovw (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test10:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovw (%eax), %xmm0
; X86-NEXT: retl
  %y = load i16, ptr %x, align 2
  %res = insertelement <8 x i16>zeroinitializer, i16 %y, i32 0
  ret <8 x i16>%res
}

define <16 x i16> @test10b(ptr %x) {
; X64-LABEL: test10b:
; X64: # %bb.0:
; X64-NEXT: vmovw (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test10b:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovw (%eax), %xmm0
; X86-NEXT: retl
  %y = load i16, ptr %x, align 2
  %res = insertelement <16 x i16>zeroinitializer, i16 %y, i32 0
  ret <16 x i16>%res
}

define <32 x i16> @test10c(ptr %x) {
; X64-LABEL: test10c:
; X64: # %bb.0:
; X64-NEXT: vmovw (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test10c:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovw (%eax), %xmm0
; X86-NEXT: retl
  %y = load i16, ptr %x, align 2
  %res = insertelement <32 x i16>zeroinitializer, i16 %y, i32 0
  ret <32 x i16>%res
}

define <8 x half> @test11(ptr %x) {
; X64-LABEL: test11:
; X64: # %bb.0:
; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: retq
;
; X86-LABEL: test11:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: retl
  %y = load half, ptr %x, align 2
  %res = insertelement <8 x half>zeroinitializer, half %y, i32 0
  ret <8 x half>%res
}

define <16 x half> @test11b(ptr %x) {
; X64-LABEL: test11b:
; X64: # %bb.0:
; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: retq
;
; X86-LABEL: test11b:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: retl
  %y = load half, ptr %x, align 2
  %res = insertelement <16 x half>zeroinitializer, half %y, i32 0
  ret <16 x half>%res
}

define <32 x half> @test11c(ptr %x) {
; X64-LABEL: test11c:
; X64: # %bb.0:
; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: retq
;
; X86-LABEL: test11c:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: retl
  %y = load half, ptr %x, align 2
  %res = insertelement <32 x half>zeroinitializer, half %y, i32 0
  ret <32 x half>%res
}

define <8 x half> @test14(half %x) {
; X64-LABEL: test14:
; X64: # %bb.0:
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test14:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: retl
  %res = insertelement <8 x half>zeroinitializer, half %x, i32 0
  ret <8 x half>%res
}

define <16 x half> @test14b(half %x) {
; X64-LABEL: test14b:
; X64: # %bb.0:
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test14b:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: retl
  %res = insertelement <16 x half>zeroinitializer, half %x, i32 0
  ret <16 x half>%res
}

define <32 x half> @test14c(half %x) {
; X64-LABEL: test14c:
; X64: # %bb.0:
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test14c:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: retl
  %res = insertelement <32 x half>zeroinitializer, half %x, i32 0
  ret <32 x half>%res
}

define <8 x i16> @test15(i16 %x) {
; X64-LABEL: test15:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test15:
; X86: # %bb.0:
; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
  %res = insertelement <8 x i16>zeroinitializer, i16 %x, i32 0
  ret <8 x i16>%res
}

define <16 x i16> @test16(i16 %x) {
; X64-LABEL: test16:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test16:
; X86: # %bb.0:
; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
  %res = insertelement <16 x i16>zeroinitializer, i16 %x, i32 0
  ret <16 x i16>%res
}

define <32 x i16> @test17(i16 %x) {
; X64-LABEL: test17:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test17:
; X86: # %bb.0:
; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
  %res = insertelement <32 x i16>zeroinitializer, i16 %x, i32 0
  ret <32 x i16>%res
}

define <8 x i16> @test18(i16 %x) {
; X64-LABEL: test18:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test18:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
  %res = insertelement <8 x i16> undef, i16 %x, i32 0
  ret <8 x i16>%res
}

define <16 x i16> @test19(i16 %x) {
; X64-LABEL: test19:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test19:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0
; X86-NEXT: retl
  %res = insertelement <16 x i16> undef, i16 %x, i32 0
  ret <16 x i16>%res
}

define <32 x i16> @test20(i16 %x) {
; X64-LABEL: test20:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test20:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0
; X86-NEXT: retl
  %res = insertelement <32 x i16> undef, i16 %x, i32 0
  ret <32 x i16>%res
}

@g8f16 = external global <8 x half>
@g8f16u = external global <8 x half>, align 8
@g16f16 = external global <16 x half>
@g16f16u = external global <16 x half>, align 8
@g32f16 = external global <32 x half>
@g32f16u = external global <32 x half>, align 8

define <32 x half> @load32f16(ptr %a) {
; X64-LABEL: load32f16:
; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %zmm0
; X64-NEXT: retq
;
; X86-LABEL: load32f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovaps (%eax), %zmm0
; X86-NEXT: retl
  %res = load <32 x half>, ptr %a
  ret <32 x half> %res
}

define <32 x half> @load32f16mask(ptr %a, <32 x half> %b, i32 %c) {
; X64-LABEL: load32f16mask:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: load32f16mask:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT: retl
  %msk = bitcast i32 %c to <32 x i1>
  %res0 = load <32 x half>, ptr %a
  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b
  ret <32 x half> %res
}

define <32 x half> @load32f16maskz(ptr %a, i32 %c) {
; X64-LABEL: load32f16maskz:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: load32f16maskz:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
  %msk = bitcast i32 %c to <32 x i1>
  %res0 = load <32 x half>, ptr %a
  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer
  ret <32 x half> %res
}

define <32 x half> @loadu32f16(ptr %a) {
; X64-LABEL: loadu32f16:
; X64: # %bb.0:
; X64-NEXT: vmovups (%rdi), %zmm0
; X64-NEXT: retq
;
; X86-LABEL: loadu32f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovups (%eax), %zmm0
; X86-NEXT: retl
  %res = load <32 x half>, ptr %a, align 8
  ret <32 x half> %res
}

define <32 x half> @loadu32f16mask(ptr %a, <32 x half> %b, i32 %c) {
; X64-LABEL: loadu32f16mask:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: loadu32f16mask:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT: retl
  %msk = bitcast i32 %c to <32 x i1>
  %res0 = load <32 x half>, ptr %a, align 8
  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b
  ret <32 x half> %res
}

define <32 x half> @loadu32f16maskz(ptr %a, i32 %c) {
; X64-LABEL: loadu32f16maskz:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: loadu32f16maskz:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
  %msk = bitcast i32 %c to <32 x i1>
  %res0 = load <32 x half>, ptr %a, align 8
  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer
  ret <32 x half> %res
}

define void @store32f16(<32 x half> %a) {
; X64-LABEL: store32f16:
; X64: # %bb.0:
; X64-NEXT: movq g32f16@GOTPCREL(%rip), %rax
; X64-NEXT: vmovaps %zmm0, (%rax)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: store32f16:
; X86: # %bb.0:
; X86-NEXT: vmovaps %zmm0, g32f16
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  store <32 x half> %a, ptr @g32f16
  ret void
}

define void @storeu32f16(<32 x half> %a) {
; X64-LABEL: storeu32f16:
; X64: # %bb.0:
; X64-NEXT: movq g32f16u@GOTPCREL(%rip), %rax
; X64-NEXT: vmovups %zmm0, (%rax)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: storeu32f16:
; X86: # %bb.0:
; X86-NEXT: vmovups %zmm0, g32f16u
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  store <32 x half> %a, ptr @g32f16u, align 8
  ret void
}

declare void @llvm.masked.store.v32f16.p0(<32 x half>, ptr, i32, <32 x i1>)
declare <32 x half> @llvm.masked.load.v32f16.p0(ptr, i32, <32 x i1>, <32 x half>)

define void @storeu32f16mask(<32 x i1> %mask, ptr %addr, <32 x half> %val) {
; X64-LABEL: storeu32f16mask:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %ymm0, %ymm0
; X64-NEXT: vpmovb2m %ymm0, %k1
; X64-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: storeu32f16mask:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %ymm0, %ymm0
; X86-NEXT: vpmovb2m %ymm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 %zmm1, (%eax) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  call void @llvm.masked.store.v32f16.p0(<32 x half> %val, ptr %addr, i32 4, <32 x i1>%mask)
  ret void
}

define <32 x half> @maskloadu32f16(ptr %addr, <32 x half> %val, <32 x i1> %mask) {
; X64-LABEL: maskloadu32f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %ymm1, %ymm1
; X64-NEXT: vpmovb2m %ymm1, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: maskloadu32f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %ymm1, %ymm1
; X86-NEXT: vpmovb2m %ymm1, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT: retl
  %res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> %val)
  ret <32 x half> %res
}

define <32 x half> @maskuloadu32f16(ptr %addr, <32 x i1> %mask) {
; X64-LABEL: maskuloadu32f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %ymm0, %ymm0
; X64-NEXT: vpmovb2m %ymm0, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: maskuloadu32f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %ymm0, %ymm0
; X86-NEXT: vpmovb2m %ymm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
  %res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> undef)
  ret <32 x half> %res
}

define <32 x half> @maskzloadu32f16(ptr %addr, <32 x i1> %mask) {
; X64-LABEL: maskzloadu32f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %ymm0, %ymm0
; X64-NEXT: vpmovb2m %ymm0, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: maskzloadu32f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %ymm0, %ymm0
; X86-NEXT: vpmovb2m %ymm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
  %res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> zeroinitializer)
  ret <32 x half> %res
}

define <32 x half> @movrr32f16(<32 x half> %a, <32 x half> %b) {
; CHECK-LABEL: movrr32f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  ret <32 x half> %b
}

define <32 x half> @movrrk32f16(<32 x half> %a, <32 x half> %b, i32 %msk) {
; X64-LABEL: movrrk32f16:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: movrrk32f16:
; X86: # %bb.0:
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
  %mask = bitcast i32 %msk to <32 x i1>
  %res = select <32 x i1> %mask, <32 x half> %a, <32 x half> %b
  ret <32 x half> %res
}

define <32 x half> @movrrkz32f16(<32 x half> %a, i32 %msk) {
; X64-LABEL: movrrkz32f16:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: movrrkz32f16:
; X86: # %bb.0:
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
  %mask = bitcast i32 %msk to <32 x i1>
  %res = select <32 x i1> %mask, <32 x half> %a, <32 x half> zeroinitializer
  ret <32 x half> %res
}

define <16 x half> @load16f16(ptr %a) {
; X64-LABEL: load16f16:
; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %ymm0
; X64-NEXT: retq
;
; X86-LABEL: load16f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovaps (%eax), %ymm0
; X86-NEXT: retl
  %res = load <16 x half>, ptr %a
  ret <16 x half> %res
}

define <16 x half> @load16f16mask(ptr %a, <16 x half> %b, i16 %c) {
; X64-LABEL: load16f16mask:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: load16f16mask:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1}
; X86-NEXT: retl
  %msk = bitcast i16 %c to <16 x i1>
  %res0 = load <16 x half>, ptr %a
  %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b
  ret <16 x half> %res
}

define <16 x half> @load16f16maskz(ptr %a, i16 %c) {
; X64-LABEL: load16f16maskz:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: load16f16maskz:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
  %msk = bitcast i16 %c to <16 x i1>
  %res0 = load <16 x half>, ptr %a
  %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer
  ret <16 x half> %res
}

define <16 x half> @loadu16f16(ptr %a) {
; X64-LABEL: loadu16f16:
; X64: # %bb.0:
; X64-NEXT: vmovups (%rdi), %ymm0
; X64-NEXT: retq
;
; X86-LABEL: loadu16f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovups (%eax), %ymm0
; X86-NEXT: retl
  %res = load <16 x half>, ptr %a, align 8
  ret <16 x half> %res
}

define <16 x half> @loadu16f16mask(ptr %a, <16 x half> %b, i16 %c) {
; X64-LABEL: loadu16f16mask:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: loadu16f16mask:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1}
; X86-NEXT: retl
  %msk = bitcast i16 %c to <16 x i1>
  %res0 = load <16 x half>, ptr %a, align 8
  %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b
  ret <16 x half> %res
}

define <16 x half> @loadu16f16maskz(ptr %a, i16 %c) {
; X64-LABEL: loadu16f16maskz:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: loadu16f16maskz:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
  %msk = bitcast i16 %c to <16 x i1>
  %res0 = load <16 x half>, ptr %a, align 8
  %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer
  ret <16 x half> %res
}

define void @store16f16(<16 x half> %a) {
; X64-LABEL: store16f16:
; X64: # %bb.0:
; X64-NEXT: movq g16f16@GOTPCREL(%rip), %rax
; X64-NEXT: vmovaps %ymm0, (%rax)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: store16f16:
; X86: # %bb.0:
; X86-NEXT: vmovaps %ymm0, g16f16
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  store <16 x half> %a, ptr @g16f16
  ret void
}

define void @storeu16f16(<16 x half> %a) {
; X64-LABEL: storeu16f16:
; X64: # %bb.0:
; X64-NEXT: movq g16f16u@GOTPCREL(%rip), %rax
; X64-NEXT: vmovups %ymm0, (%rax)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: storeu16f16:
; X86: # %bb.0:
; X86-NEXT: vmovups %ymm0, g16f16u
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  store <16 x half> %a, ptr @g16f16u, align 8
  ret void
}

declare void @llvm.masked.store.v16f16.p0(<16 x half>, ptr, i32, <16 x i1>)
declare <16 x half> @llvm.masked.load.v16f16.p0(ptr, i32, <16 x i1>, <16 x half>)

define void @storeu16f16mask(<16 x i1> %mask, ptr %addr, <16 x half> %val) {
; X64-LABEL: storeu16f16mask:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %xmm0, %xmm0
; X64-NEXT: vpmovb2m %xmm0, %k1
; X64-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: storeu16f16mask:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %xmm0, %xmm0
; X86-NEXT: vpmovb2m %xmm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 %ymm1, (%eax) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  call void @llvm.masked.store.v16f16.p0(<16 x half> %val, ptr %addr, i32 4, <16 x i1>%mask)
  ret void
}

define <16 x half> @maskloadu16f16(ptr %addr, <16 x half> %val, <16 x i1> %mask) {
; X64-LABEL: maskloadu16f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %xmm1, %xmm1
; X64-NEXT: vpmovb2m %xmm1, %k1
; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: maskloadu16f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %xmm1, %xmm1
; X86-NEXT: vpmovb2m %xmm1, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1}
; X86-NEXT: retl
  %res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> %val)
  ret <16 x half> %res
}

define <16 x half> @maskuloadu16f16(ptr %addr, <16 x i1> %mask) {
; X64-LABEL: maskuloadu16f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %xmm0, %xmm0
; X64-NEXT: vpmovb2m %xmm0, %k1
; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: maskuloadu16f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %xmm0, %xmm0
; X86-NEXT: vpmovb2m %xmm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
  %res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> undef)
  ret <16 x half> %res
}

define <16 x half> @maskzloadu16f16(ptr %addr, <16 x i1> %mask) {
; X64-LABEL: maskzloadu16f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %xmm0, %xmm0
; X64-NEXT: vpmovb2m %xmm0, %k1
; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: maskzloadu16f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %xmm0, %xmm0
; X86-NEXT: vpmovb2m %xmm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
  %res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> zeroinitializer)
  ret <16 x half> %res
}

define <16 x half> @movrr16f16(<16 x half> %a, <16 x half> %b) {
; CHECK-LABEL: movrr16f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  ret <16 x half> %b
}

define <16 x half> @movrrk16f16(<16 x half> %a, <16 x half> %b, i16 %msk) {
; X64-LABEL: movrrk16f16:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: movrrk16f16:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
  %mask = bitcast i16 %msk to <16 x i1>
  %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> %b
  ret <16 x half> %res
}

define <16 x half> @movrrkz16f16(<16 x half> %a, i16 %msk) {
; X64-LABEL: movrrkz16f16:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: movrrkz16f16:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
  %mask = bitcast i16 %msk to <16 x i1>
  %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> zeroinitializer
  ret <16 x half> %res
}

define <8 x half> @load8f16(ptr %a) {
; X64-LABEL: load8f16:
; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: load8f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovaps (%eax), %xmm0
; X86-NEXT: retl
  %res = load <8 x half>, ptr %a
  ret <8 x half> %res
}

define <8 x half> @load8f16mask(ptr %a, <8 x half> %b, i8 %c) {
; X64-LABEL: load8f16mask:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: load8f16mask:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1}
; X86-NEXT: retl
  %msk = bitcast i8 %c to <8 x i1>
  %res0 = load <8 x half>, ptr %a
  %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b
  ret <8 x half> %res
}

define <8 x half> @load8f16maskz(ptr %a, i8 %c) {
; X64-LABEL: load8f16maskz:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: load8f16maskz:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
  %msk = bitcast i8 %c to <8 x i1>
  %res0 = load <8 x half>, ptr %a
  %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer
  ret <8 x half> %res
}

define <8 x half> @loadu8f16(ptr %a) {
; X64-LABEL: loadu8f16:
; X64: # %bb.0:
; X64-NEXT: vmovups (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: loadu8f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovups (%eax), %xmm0
; X86-NEXT: retl
  %res = load <8 x half>, ptr %a, align 8
  ret <8 x half> %res
}

define <8 x half> @loadu8f16mask(ptr %a, <8 x half> %b, i8 %c) {
; X64-LABEL: loadu8f16mask:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: loadu8f16mask:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1}
; X86-NEXT: retl
  %msk = bitcast i8 %c to <8 x i1>
  %res0 = load <8 x half>, ptr %a, align 8
  %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b
  ret <8 x half> %res
}

define <8 x half> @loadu8f16maskz(ptr %a, i8 %c) {
; X64-LABEL: loadu8f16maskz:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: loadu8f16maskz:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
  %msk = bitcast i8 %c to <8 x i1>
  %res0 = load <8 x half>, ptr %a, align 8
  %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer
  ret <8 x half> %res
}

define void @store8f16(<8 x half> %a) {
; X64-LABEL: store8f16:
; X64: # %bb.0:
; X64-NEXT: movq g8f16@GOTPCREL(%rip), %rax
; X64-NEXT: vmovaps %xmm0, (%rax)
; X64-NEXT: retq
;
; X86-LABEL: store8f16:
; X86: # %bb.0:
; X86-NEXT: vmovaps %xmm0, g8f16
; X86-NEXT: retl
  store <8 x half> %a, ptr @g8f16
  ret void
}

define void @storeu8f16(<8 x half> %a) {
; X64-LABEL: storeu8f16:
; X64: # %bb.0:
; X64-NEXT: movq g8f16u@GOTPCREL(%rip), %rax
; X64-NEXT: vmovups %xmm0, (%rax)
; X64-NEXT: retq
;
; X86-LABEL: storeu8f16:
; X86: # %bb.0:
; X86-NEXT: vmovups %xmm0, g8f16u
; X86-NEXT: retl
  store <8 x half> %a, ptr @g8f16u, align 8
  ret void
}

declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32, <8 x i1>)
declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32, <8 x i1>, <8 x half>)

define void @storeu8f16mask(<8 x i1> %mask, ptr %addr, <8 x half> %val) {
; X64-LABEL: storeu8f16mask:
; X64: # %bb.0:
; X64-NEXT: vpsllw $15, %xmm0, %xmm0
; X64-NEXT: vpmovw2m %xmm0, %k1
; X64-NEXT: vmovdqu16 %xmm1, (%rdi) {%k1}
; X64-NEXT: retq
;
; X86-LABEL: storeu8f16mask:
; X86: # %bb.0:
; X86-NEXT: vpsllw $15, %xmm0, %xmm0
; X86-NEXT: vpmovw2m %xmm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 %xmm1, (%eax) {%k1}
; X86-NEXT: retl
  call void @llvm.masked.store.v8f16.p0(<8 x half> %val, ptr %addr, i32 4, <8 x i1>%mask)
  ret void
}

define <8 x half> @maskloadu8f16(ptr %addr, <8 x half> %val, <8 x i1> %mask) {
; X64-LABEL: maskloadu8f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $15, %xmm1, %xmm1
; X64-NEXT: vpmovw2m %xmm1, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: maskloadu8f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $15, %xmm1, %xmm1
; X86-NEXT: vpmovw2m %xmm1, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1}
; X86-NEXT: retl
  %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> %val)
  ret <8 x half> %res
}

define <8 x half> @maskuloadu8f16(ptr %addr, <8 x i1> %mask) {
; X64-LABEL: maskuloadu8f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $15, %xmm0, %xmm0
; X64-NEXT: vpmovw2m %xmm0, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: maskuloadu8f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $15, %xmm0, %xmm0
; X86-NEXT: vpmovw2m %xmm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
  %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> undef)
  ret <8 x half> %res
}

define <8 x half> @maskzloadu8f16(ptr %addr, <8 x i1> %mask) {
; X64-LABEL: maskzloadu8f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $15, %xmm0, %xmm0
; X64-NEXT: vpmovw2m %xmm0, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: maskzloadu8f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $15, %xmm0, %xmm0
; X86-NEXT: vpmovw2m %xmm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
  %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> zeroinitializer)
  ret <8 x half> %res
}

define <8 x half> @movrr8f16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: movrr8f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  ret <8 x half> %b
}

define <8 x half> @movrrk8f16(<8 x half> %a, <8 x half> %b, i8 %msk) {
; X64-LABEL: movrrk8f16:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: movrrk8f16:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
  %mask = bitcast i8 %msk to <8 x i1>
  %res = select <8 x i1> %mask, <8 x half> %a, <8 x half> %b
  ret <8 x half> %res
}

define <8 x half> @movrrkz8f16(<8 x half> %a, i8 %msk) {
; X64-LABEL: movrrkz8f16:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: movrrkz8f16:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
  %mask = bitcast i8 %msk to <8 x i1>
  %res = select <8 x i1> %mask, <8 x half> %a, <8 x half> zeroinitializer
  ret <8 x half> %res
}

define <8 x half> @movsh(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: movsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
; CHECK-NEXT: vmovsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vaddph %xmm0, %xmm2, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res1 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5>
  %res2 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %res = fadd <8 x half> %res1, %res2
  ret <8 x half> %res
}

define i16 @test_movw(half %x) {
; X64-LABEL: test_movw:
; X64: # %bb.0:
; X64-NEXT: vmovw %xmm0, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
; X86-LABEL: test_movw:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
  %res = bitcast half %x to i16
  ret i16 %res
}

define half @test_movw2(i16 %x) {
; X64-LABEL: test_movw2:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_movw2:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: retl
  %res = bitcast i16 %x to half
  ret half %res
}

; sext avoids having a truncate in front of the bitcast input due to calling
; convention or i16 op promotion.
define half @test_movw3(i8 %x) {
; X64-LABEL: test_movw3:
; X64: # %bb.0:
; X64-NEXT: movsbl %dil, %eax
; X64-NEXT: vmovw %eax, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_movw3:
; X86: # %bb.0:
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovw %eax, %xmm0
; X86-NEXT: retl
  %z = sext i8 %x to i16
  %a = bitcast i16 %z to half
  ret half %a
}

define half @extract_f16_0(<8 x half> %x) {
; CHECK-LABEL: extract_f16_0:
; CHECK: # %bb.0:
; CHECK-NEXT: ret{{[l|q]}}
  %res = extractelement <8 x half> %x, i32 0
  ret half %res
}

define half @extract_f16_1(<8 x half> %x) {
; CHECK-LABEL: extract_f16_1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrld $16, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = extractelement <8 x half> %x, i32 1
  ret half %res
}

define half @extract_f16_2(<8 x half> %x) {
; CHECK-LABEL: extract_f16_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT: ret{{[l|q]}}
  %res = extractelement <8 x half> %x, i32 2
  ret half %res
}

define half @extract_f16_3(<8 x half> %x) {
; CHECK-LABEL: extract_f16_3:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlq $48, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = extractelement <8 x half> %x, i32 3
  ret half %res
}

define half @extract_f16_4(<8 x half> %x) {
; CHECK-LABEL: extract_f16_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: ret{{[l|q]}}
  %res = extractelement <8 x half> %x, i32 4
  ret half %res
}

define half @extract_f16_5(<8 x half> %x) {
; CHECK-LABEL: extract_f16_5:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
  %res = extractelement <8 x half> %x, i32 5
  ret half %res
}

define half @extract_f16_6(<8 x half> %x) {
; CHECK-LABEL: extract_f16_6:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: ret{{[l|q]}}
  %res = extractelement <8 x half> %x, i32 6
  ret half %res
}

define half @extract_f16_7(<8 x half> %x) {
; CHECK-LABEL: extract_f16_7:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
  %res = extractelement <8 x half> %x, i32 7
  ret half %res
}

define half @extract_f16_8(<32 x half> %x, i64 %idx) nounwind {
; X64-LABEL: extract_f16_8:
; X64: # %bb.0:
; X64-NEXT: pushq %rbp
; X64-NEXT: movq %rsp, %rbp
; X64-NEXT: andq $-64, %rsp
; X64-NEXT: subq $128, %rsp
; X64-NEXT: andl $31, %edi
; X64-NEXT: vmovaps %zmm0, (%rsp)
; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: movq %rbp, %rsp
; X64-NEXT: popq %rbp
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: extract_f16_8:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $128, %esp
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: andl $31, %eax
; X86-NEXT: vmovaps %zmm0, (%esp)
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %res = extractelement <32 x half> %x, i64 %idx
  ret half %res
}

define half @extract_f16_9(<64 x half> %x, i64 %idx) nounwind {
; X64-LABEL: extract_f16_9:
; X64: # %bb.0:
; X64-NEXT: pushq %rbp
; X64-NEXT: movq %rsp, %rbp
; X64-NEXT: andq $-64, %rsp
; X64-NEXT: subq $192, %rsp
; X64-NEXT: andl $63, %edi
; X64-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp)
; X64-NEXT: vmovaps %zmm0, (%rsp)
; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: movq %rbp, %rsp
; X64-NEXT: popq %rbp
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: extract_f16_9:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $192, %esp
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: andl $63, %eax
; X86-NEXT: vmovaps %zmm1, {{[0-9]+}}(%esp)
; X86-NEXT: vmovaps %zmm0, (%esp)
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %res = extractelement <64 x half> %x, i64 %idx
  ret half %res
}

define i16 @extract_i16_0(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovw %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
  %res = extractelement <8 x i16> %x, i32 0
  ret i16 %res
}

define i16 @extract_i16_1(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $1, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
  %res = extractelement <8 x i16> %x, i32 1
  ret i16 %res
}

define i16 @extract_i16_2(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $2, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
  %res = extractelement <8 x i16> %x, i32 2
  ret i16 %res
}

define i16 @extract_i16_3(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_3:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $3, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
  %res = extractelement <8 x i16> %x, i32 3
  ret i16 %res
}

define i16 @extract_i16_4(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $4, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
  %res = extractelement <8 x i16> %x, i32 4
  ret i16 %res
}

define i16 @extract_i16_5(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_5:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $5, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
  %res = extractelement <8 x i16> %x, i32 5
  ret i16 %res
}

define i16 @extract_i16_6(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_6:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $6, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
  %res = extractelement <8 x i16> %x, i32 6
  ret i16 %res
}

define i16 @extract_i16_7(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_7:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $7, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
  %res = extractelement <8 x i16> %x, i32 7
  ret i16 %res
}

define void @extract_store_f16_0(<8 x half> %x, ptr %y) {
; X64-LABEL: extract_store_f16_0:
; X64: # %bb.0:
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_0:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
  %res = extractelement <8 x half> %x, i32 0
  store half %res, ptr %y
  ret void
}

define void @extract_store_f16_1(<8 x half> %x, ptr %y) {
; X64-LABEL: extract_store_f16_1:
; X64: # %bb.0:
; X64-NEXT: vpsrld $16, %xmm0, %xmm0
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_1:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpsrld $16, %xmm0, %xmm0
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
  %res = extractelement <8 x half> %x, i32 1
  store half %res, ptr %y
  ret void
}

define void @extract_store_f16_2(<8 x half> %x, ptr %y) {
; X64-LABEL: extract_store_f16_2:
; X64: # %bb.0:
; X64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_2:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
  %res = extractelement <8 x half> %x, i32 2
  store half %res, ptr %y
  ret void
}

define void @extract_store_f16_3(<8 x half> %x, ptr %y) {
; X64-LABEL: extract_store_f16_3:
; X64: # %bb.0:
; X64-NEXT: vpsrlq $48, %xmm0, %xmm0
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_3:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpsrlq $48, %xmm0, %xmm0
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
  %res = extractelement <8 x half> %x, i32 3
  store half %res, ptr %y
  ret void
}

define void @extract_store_f16_4(<8 x half> %x, ptr %y) {
; X64-LABEL: extract_store_f16_4:
; X64: # %bb.0:
; X64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_4:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
  %res = extractelement <8 x half> %x, i32 4
  store half %res, ptr %y
  ret void
}

define void @extract_store_f16_5(<8 x half> %x, ptr %y) {
; X64-LABEL: extract_store_f16_5:
; X64: # %bb.0:
; X64-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_5:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
  %res = extractelement <8 x half> %x, i32 5
  store half %res, ptr %y
  ret void
}

define void @extract_store_f16_6(<8 x half> %x, ptr %y) {
; X64-LABEL: extract_store_f16_6:
; X64: # %bb.0:
; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_6:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
  %res = extractelement <8 x half> %x, i32 6
  store half %res, ptr %y
  ret void
}

define void @extract_store_f16_7(<8 x half> %x, ptr %y) {
; X64-LABEL: extract_store_f16_7:
; X64: # %bb.0:
; X64-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_7:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
  %res = extractelement <8 x half> %x, i32 7
  store half %res, ptr %y
  ret void
}

define void @extract_store_i16_0(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_0:
; X64: # %bb.0:
; X64-NEXT: vpextrw $0, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_0:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $0, %xmm0, (%eax)
; X86-NEXT: retl
  %res = extractelement <8 x i16> %x, i32 0
  store i16 %res, ptr %y
  ret void
}

define void @extract_store_i16_1(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_1:
; X64: # %bb.0:
; X64-NEXT: vpextrw $1, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_1:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $1, %xmm0, (%eax)
; X86-NEXT: retl
  %res = extractelement <8 x i16> %x, i32 1
  store i16 %res, ptr %y
  ret void
}

define void @extract_store_i16_2(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_2:
; X64: # %bb.0:
; X64-NEXT: vpextrw $2, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_2:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $2, %xmm0, (%eax)
; X86-NEXT: retl
  %res = extractelement <8 x i16> %x, i32 2
  store i16 %res, ptr %y
  ret void
}

define void @extract_store_i16_3(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_3:
; X64: # %bb.0:
; X64-NEXT: vpextrw $3, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_3:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $3, %xmm0, (%eax)
; X86-NEXT: retl
  %res = extractelement <8 x i16> %x, i32 3
  store i16 %res, ptr %y
  ret void
}

define void @extract_store_i16_4(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_4:
; X64: # %bb.0:
; X64-NEXT: vpextrw $4, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_4:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $4, %xmm0, (%eax)
; X86-NEXT: retl
  %res = extractelement <8 x i16> %x, i32 4
  store i16 %res, ptr %y
  ret void
}

define void @extract_store_i16_5(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_5:
; X64: # %bb.0:
; X64-NEXT: vpextrw $5, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_5:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $5, %xmm0, (%eax)
; X86-NEXT: retl
  %res = extractelement <8 x i16> %x, i32 5
  store i16 %res, ptr %y
  ret void
}

define void @extract_store_i16_6(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_6:
; X64: # %bb.0:
; X64-NEXT: vpextrw $6, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_6:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $6, %xmm0, (%eax)
; X86-NEXT: retl
  %res = extractelement <8 x i16> %x, i32 6
  store i16 %res, ptr %y
  ret void
}

define void @extract_store_i16_7(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_7:
; X64: # %bb.0:
; X64-NEXT: vpextrw $7, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_7:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $7, %xmm0, (%eax)
; X86-NEXT: retl
  %res = extractelement <8 x i16> %x, i32 7
  store i16 %res, ptr %y
  ret void
}

define i32 @extract_zext_i16_0(<8 x i16> %x) {
; CHECK-LABEL: extract_zext_i16_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $0, %xmm0, %eax
; CHECK-NEXT: ret{{[l|q]}}
  %res = extractelement <8 x i16> %x, i32 0
  %res2 = zext i16 %res to i32
  ret i32 %res2
}

define i32 @extract_zext_i16_1(<8 x i16> %x) {
; CHECK-LABEL: extract_zext_i16_1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $1, %xmm0, %eax
; CHECK-NEXT: ret{{[l|q]}}
  %res = extractelement <8 x i16> %x, i32 1
  %res2 = zext i16 %res to i32
  ret i32 %res2
}

define <8 x half> @build_vector_xxxxuuuu(half %a0, half %a1, half %a2, half %a3) {
; X64-LABEL: build_vector_xxxxuuuu:
; X64: # %bb.0:
; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
; X64-NEXT: retq
;
; X86-LABEL: build_vector_xxxxuuuu:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X86-NEXT: retl
  %a = insertelement <8 x half> undef, half %a0, i32 0
  %b = insertelement <8 x half> %a, half %a1, i32 1
  %c = insertelement <8 x half> %b, half %a2, i32 2
  %d = insertelement <8 x half> %c, half %a3, i32 3
  ret <8 x half> %d
}

define <8 x half> @build_vector_uuuuxxxx(half %a0, half %a1, half %a2, half %a3) {
; X64-LABEL: build_vector_uuuuxxxx:
; X64: # %bb.0:
; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT: vpbroadcastq %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: build_vector_uuuuxxxx:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: vpbroadcastq %xmm0, %xmm0
; X86-NEXT: retl
  %a = insertelement <8 x half> undef, half %a0, i32 4
  %b = insertelement <8 x half> %a, half %a1, i32 5
  %c = insertelement <8 x half> %b, half %a2, i32 6
  %d = insertelement <8 x half> %c, half %a3, i32 7
  ret <8 x half> %d
}

define <8 x half> @build_vector_xxxxxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) {
; X64-LABEL: build_vector_xxxxxxxx:
; X64: # %bb.0:
; X64-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; X64-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; X64-NEXT: retq
;
; X86-LABEL: build_vector_xxxxxxxx:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vmovsh {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-NEXT: retl
  %a = insertelement <8 x half> undef, half %a0, i32 0
  %b = insertelement <8 x half> %a, half %a1, i32 1
  %c = insertelement <8 x half> %b, half %a2, i32 2
insertelement <8 x half> %b, half %a2, i32 2 1875 %d = insertelement <8 x half> %c, half %a3, i32 3 1876 %e = insertelement <8 x half> %d, half %a4, i32 4 1877 %f = insertelement <8 x half> %e, half %a5, i32 5 1878 %g = insertelement <8 x half> %f, half %a6, i32 6 1879 %h = insertelement <8 x half> %g, half %a7, i32 7 1880 ret <8 x half> %h 1881} 1882 1883define <16 x half> @build_vector_xxxxuuuuuuuuxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) { 1884; X64-LABEL: build_vector_xxxxuuuuuuuuxxxx: 1885; X64: # %bb.0: 1886; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 1887; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1888; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero 1889; X64-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] 1890; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] 1891; X64-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 1892; X64-NEXT: vpbroadcastq %xmm1, %xmm1 1893; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1894; X64-NEXT: retq 1895; 1896; X86-LABEL: build_vector_xxxxuuuuuuuuxxxx: 1897; X86: # %bb.0: 1898; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero 1899; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero 1900; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1901; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero 1902; X86-NEXT: vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero 1903; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 1904; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1905; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero 1906; X86-NEXT: vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero 1907; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 1908; X86-NEXT: vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero 1909; X86-NEXT: vmovsh {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero 1910; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 1911; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero 1912; X86-NEXT: vpbroadcastq %xmm0, %xmm0 1913; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1914; X86-NEXT: retl 1915 %a = insertelement <16 x half> undef, half %a0, i32 0 1916 %b = insertelement <16 x half> %a, half %a1, i32 1 1917 %c = insertelement <16 x half> %b, half %a2, i32 2 1918 %d = insertelement <16 x half> %c, half %a3, i32 3 1919 %e = insertelement <16 x half> %d, half %a4, i32 12 1920 %f = insertelement <16 x half> %e, half %a5, i32 13 1921 %g = insertelement <16 x half> %f, half %a6, i32 14 1922 %h = insertelement <16 x half> %g, half %a7, i32 15 1923 ret <16 x half> %h 1924} 1925 1926define <8 x half> @regression1(<8 x half> %a, <8 x half> %b) { 1927; CHECK-LABEL: regression1: 1928; CHECK: # %bb.0: 1929; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11] 1930; CHECK-NEXT: ret{{[l|q]}} 1931 %res = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5> 1932 ret <8 x half> %res 1933} 1934 1935define <4 x 
float> @regression2(ptr addrspace(1) %0, <4 x i32> %1, <4 x i32> %2, <4 x float> %3, ptr %4) { 1936; X64-LABEL: regression2: 1937; X64: # %bb.0: 1938; X64-NEXT: vmovw (%rsi), %xmm0 1939; X64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1940; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 1941; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] 1942; X64-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 1943; X64-NEXT: retq 1944; 1945; X86-LABEL: regression2: 1946; X86: # %bb.0: 1947; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1948; X86-NEXT: vmovw (%eax), %xmm0 1949; X86-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1950; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 1951; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] 1952; X86-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 1953; X86-NEXT: retl 1954 %6 = load i8, ptr %4, align 1 1955 %7 = getelementptr i8, ptr %4, i64 1 1956 %8 = addrspacecast ptr %7 to ptr addrspace(4) 1957 %9 = load i8, ptr addrspace(4) %8, align 1 1958 %10 = insertelement <2 x i8> poison, i8 %6, i32 0 1959 %11 = insertelement <2 x i8> %10, i8 %9, i32 1 1960 %12 = uitofp <2 x i8> %11 to <2 x float> 1961 %13 = shufflevector <2 x float> %12, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 1962 %14 = shufflevector <4 x float> %13, <4 x float> <float poison, float poison, float 0.000000e+00, float 2.550000e+02>, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 1963 %15 = fmul contract <4 x float> %14, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000> 1964 ret <4 x float> %15 1965} 1966 1967; Make sure load/stores of v4f16 are handled well on 32-bit targets where 1968; default widening legalization can't use i64. 
1969define void @load_store_v4f16(ptr %x, ptr %y, ptr %z) { 1970; X64-LABEL: load_store_v4f16: 1971; X64: # %bb.0: 1972; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 1973; X64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 1974; X64-NEXT: vaddph %xmm1, %xmm0, %xmm0 1975; X64-NEXT: vmovlps %xmm0, (%rdx) 1976; X64-NEXT: retq 1977; 1978; X86-LABEL: load_store_v4f16: 1979; X86: # %bb.0: 1980; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1981; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 1982; X86-NEXT: movl {{[0-9]+}}(%esp), %edx 1983; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 1984; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 1985; X86-NEXT: vaddph %xmm1, %xmm0, %xmm0 1986; X86-NEXT: vmovlps %xmm0, (%eax) 1987; X86-NEXT: retl 1988 %a = load <4 x half>, ptr %x 1989 %b = load <4 x half>, ptr %y 1990 %c = fadd <4 x half> %a, %b 1991 store <4 x half> %c, ptr %z 1992 ret void 1993} 1994 1995define <8 x half> @test21(half %a, half %b, half %c) nounwind { 1996; X64-LABEL: test21: 1997; X64: # %bb.0: 1998; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 1999; X64-NEXT: vmovsh %xmm2, %xmm3, %xmm2 2000; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2001; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 2002; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 2003; X64-NEXT: vpbroadcastw %xmm1, %xmm1 2004; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] 2005; X64-NEXT: retq 2006; 2007; X86-LABEL: test21: 2008; X86: # %bb.0: 2009; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero 2010; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero 2011; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2012; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero 2013; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2014; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 2015; X86-NEXT: vpbroadcastw %xmm1, %xmm1 2016; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] 2017; X86-NEXT: retl 2018 %1 = insertelement <8 x half> <half poison, half poison, half poison, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000>, half %a, i32 0 2019 %2 = insertelement <8 x half> %1, half %b, i32 1 2020 %3 = insertelement <8 x half> %2, half %c, i32 2 2021 ret <8 x half> %3 2022} 2023 2024define <16 x i16> @test22(ptr %mem) nounwind { 2025; X64-LABEL: test22: 2026; X64: # %bb.0: 2027; X64-NEXT: movzwl 0, %eax 2028; X64-NEXT: andw (%rdi), %ax 2029; X64-NEXT: vmovw %eax, %xmm0 2030; X64-NEXT: retq 2031; 2032; X86-LABEL: test22: 2033; X86: # %bb.0: 2034; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2035; X86-NEXT: movzwl 0, %ecx 2036; X86-NEXT: andw (%eax), %cx 2037; X86-NEXT: vmovw %ecx, %xmm0 2038; X86-NEXT: retl 2039 %1 = load i16, ptr null, align 2 2040 %2 = load i16, ptr %mem, align 2 2041 %3 = and i16 %1, %2 2042 %4 = insertelement <16 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %3, i32 0 2043 ret <16 x i16> %4 2044} 2045 2046define void @pr52560(i8 %0, <2 x i16> %1, ptr %c) nounwind { 2047; X64-LABEL: pr52560: 2048; X64: # %bb.0: # %entry 2049; X64-NEXT: movsbl %dil, %eax 2050; X64-NEXT: vmovw %eax, %xmm1 2051; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 2052; X64-NEXT: vpcmpgtw %xmm2, %xmm1, %k1 2053; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} 2054; X64-NEXT: vmovw %xmm0, %eax 2055; X64-NEXT: testw %ax, %ax 2056; X64-NEXT: je .LBB123_2 2057; X64-NEXT: # %bb.1: # 
%for.body.preheader 2058; X64-NEXT: movb $0, (%rsi) 2059; X64-NEXT: .LBB123_2: # %for.end 2060; X64-NEXT: retq 2061; 2062; X86-LABEL: pr52560: 2063; X86: # %bb.0: # %entry 2064; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax 2065; X86-NEXT: vmovw %eax, %xmm1 2066; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 2067; X86-NEXT: vpcmpgtw %xmm2, %xmm1, %k1 2068; X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} 2069; X86-NEXT: vmovw %xmm0, %eax 2070; X86-NEXT: testw %ax, %ax 2071; X86-NEXT: je .LBB123_2 2072; X86-NEXT: # %bb.1: # %for.body.preheader 2073; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2074; X86-NEXT: movb $0, (%eax) 2075; X86-NEXT: .LBB123_2: # %for.end 2076; X86-NEXT: retl 2077entry: 2078 %conv = sext i8 %0 to i16 2079 %2 = insertelement <2 x i16> <i16 poison, i16 0>, i16 %conv, i32 0 2080 %3 = icmp sgt <2 x i16> %2, zeroinitializer 2081 %4 = select <2 x i1> %3, <2 x i16> %1, <2 x i16> <i16 0, i16 poison> 2082 %5 = extractelement <2 x i16> %4, i32 0 2083 %tobool.not14 = icmp eq i16 %5, 0 2084 br i1 %tobool.not14, label %for.end, label %for.body.preheader 2085 2086for.body.preheader: ; preds = %entry 2087 store i8 0, ptr %c, align 1 2088 br label %for.end 2089 2090for.end: ; preds = %for.body.preheader, %entry 2091 ret void 2092} 2093 2094define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width"="256" "prefer-vector-width"="256" nounwind { 2095; X64-LABEL: pr52561: 2096; X64: # %bb.0: 2097; X64-NEXT: vpaddd %ymm3, %ymm1, %ymm1 2098; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 2099; X64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112] 2100; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 2101; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm1 2102; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2103; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 2104; X64-NEXT: vmovsh %xmm0, %xmm2, %xmm0 2105; X64-NEXT: retq 2106; 2107; X86-LABEL: pr52561: 2108; X86: # %bb.0: 2109; X86-NEXT: pushl %ebp 2110; X86-NEXT: movl %esp, %ebp 2111; X86-NEXT: andl $-32, %esp 2112; X86-NEXT: subl $32, %esp 2113; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 2114; X86-NEXT: vpaddd 8(%ebp), %ymm1, %ymm1 2115; X86-NEXT: vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112] 2116; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 2117; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm1 2118; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 2119; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 2120; X86-NEXT: vmovsh %xmm0, %xmm2, %xmm0 2121; X86-NEXT: movl %ebp, %esp 2122; X86-NEXT: popl %ebp 2123; X86-NEXT: retl 2124 %1 = add <16 x i32> %a, <i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112> 2125 %2 = add <16 x i32> %1, %b 2126 %3 = and <16 x i32> %2, <i32 65535, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 65535> 2127 ret <16 x i32> %3 2128} 2129 2130define <8 x i16> @pr59628_xmm(i16 %arg) { 2131; X64-LABEL: pr59628_xmm: 2132; X64: # %bb.0: 2133; X64-NEXT: vmovw %edi, %xmm0 2134; X64-NEXT: vpbroadcastw %edi, %xmm1 2135; X64-NEXT: vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1 2136; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} 2137; X64-NEXT: retq 2138; 2139; X86-LABEL: pr59628_xmm: 2140; X86: # %bb.0: 2141; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2142; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0 2143; X86-NEXT: vpbroadcastw %eax, %xmm1 2144; X86-NEXT: vmovsh %xmm1, %xmm0, %xmm0 2145; X86-NEXT: vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %k1 2146; 
X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} 2147; X86-NEXT: retl 2148 %I1 = insertelement <8 x i16> zeroinitializer, i16 %arg, i16 0 2149 %I2 = insertelement <8 x i16> %I1, i16 0, i16 %arg 2150 ret <8 x i16> %I2 2151} 2152