; Regression test for the X86 `false-deps-perm` subtarget feature.
; The RUN lines compile the same IR twice with llc for -mcpu=sapphirerapids:
;   ENABLE  check prefix: -mattr=+false-deps-perm
;   DISABLE check prefix: -mattr=-false-deps-perm
; In every ENABLE expectation a vxorps/vpxor that zeroes the destination
; register appears directly before the vpermq/vpermd/vpermpd instruction; the
; DISABLE expectations omit that zeroing instruction.
; Each function begins with an inline-asm "nop" whose constraint string
; clobbers most xmm registers.
; NOTE(review): presumably this limits the registers left to the allocator so
; the permute destination is predictable -- confirm before changing the lists.
; Function-name suffixes, as used in the IR bodies:
;   _ri / _mi   : shufflevector with a constant mask (register / loaded source)
;   _rr         : permvar intrinsic on register operands
;   _rm         : permvar intrinsic whose data operand is a full-vector load
;   _broadcast  : data operand is a scalar load splatted with
;                 insertelement + shufflevector
;   _maskz      : mask.permvar intrinsic with a zeroinitializer pass-through
;                 and a mask loaded from memory
; The ENABLE/DISABLE lines are autogenerated (see the NOTE below); regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE 3; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE 4 5define <4 x i64> @permq_ri_256(<4 x i64> %a0) { 6; ENABLE-LABEL: permq_ri_256: 7; ENABLE: # %bb.0: 8; ENABLE-NEXT: #APP 9; ENABLE-NEXT: nop 10; ENABLE-NEXT: #NO_APP 11; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 12; ENABLE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,2,1,0] 13; ENABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0 14; ENABLE-NEXT: retq 15; 16; DISABLE-LABEL: permq_ri_256: 17; DISABLE: # %bb.0: 18; DISABLE-NEXT: #APP 19; DISABLE-NEXT: nop 20; DISABLE-NEXT: #NO_APP 21; DISABLE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,2,1,0] 22; DISABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0 23; DISABLE-NEXT: retq 24 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 25 %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0> 26 %res = add <4 x i64> %2, %a0 27 ret <4 x i64> %res 28} 29 30define <4 x i64> @permq_rr_256(<4 x i64> %a0, <4 x i64> %idx) { 31; ENABLE-LABEL: permq_rr_256: 32; ENABLE: # %bb.0: 33; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 34; ENABLE-NEXT: #APP 35; ENABLE-NEXT: nop 36; ENABLE-NEXT: #NO_APP 37; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 38; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 39; ENABLE-NEXT: vpermq %ymm0, %ymm2, %ymm1 40; ENABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0 41; ENABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 42; ENABLE-NEXT: retq 43; 44; 
DISABLE-LABEL: permq_rr_256: 45; DISABLE: # %bb.0: 46; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 47; DISABLE-NEXT: #APP 48; DISABLE-NEXT: nop 49; DISABLE-NEXT: #NO_APP 50; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 51; DISABLE-NEXT: vpermq %ymm0, %ymm2, %ymm1 52; DISABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0 53; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 54; DISABLE-NEXT: retq 55 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 56 %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx) 57 %t = add <4 x i64> %a0, %idx 58 %res = add <4 x i64> %t, %2 59 ret <4 x i64> %res 60} 61 62define <4 x i64> @permq_rm_256(ptr %p0, <4 x i64> %idx) { 63; ENABLE-LABEL: permq_rm_256: 64; ENABLE: # %bb.0: 65; ENABLE-NEXT: #APP 66; ENABLE-NEXT: nop 67; ENABLE-NEXT: #NO_APP 68; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 69; ENABLE-NEXT: vpermq (%rdi), %ymm0, %ymm1 70; ENABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 71; ENABLE-NEXT: retq 72; 73; DISABLE-LABEL: permq_rm_256: 74; DISABLE: # %bb.0: 75; DISABLE-NEXT: #APP 76; DISABLE-NEXT: nop 77; DISABLE-NEXT: #NO_APP 78; DISABLE-NEXT: vpermq (%rdi), %ymm0, %ymm1 79; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 80; DISABLE-NEXT: retq 81 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 82 %a0 = load <4 x i64>, ptr %p0, align 64 83 %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx) 84 %res = add <4 x i64> 
%idx, %2 85 ret <4 x i64> %res 86} 87 88define <4 x i64> @permq_mi_256(ptr %p0) { 89; ENABLE-LABEL: permq_mi_256: 90; ENABLE: # %bb.0: 91; ENABLE-NEXT: #APP 92; ENABLE-NEXT: nop 93; ENABLE-NEXT: #NO_APP 94; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 95; ENABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,2,0] 96; ENABLE-NEXT: retq 97; 98; DISABLE-LABEL: permq_mi_256: 99; DISABLE: # %bb.0: 100; DISABLE-NEXT: #APP 101; DISABLE-NEXT: nop 102; DISABLE-NEXT: #NO_APP 103; DISABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,2,0] 104; DISABLE-NEXT: retq 105 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 106 %a0 = load <4 x i64>, ptr %p0, align 64 107 %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0> 108 ret <4 x i64> %2 109} 110 111define <4 x i64> @permq_broadcast_256(ptr %p0, <4 x i64> %idx) { 112; ENABLE-LABEL: permq_broadcast_256: 113; ENABLE: # %bb.0: 114; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 115; ENABLE-NEXT: #APP 116; ENABLE-NEXT: nop 117; ENABLE-NEXT: #NO_APP 118; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 119; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 120; ENABLE-NEXT: vpermq (%rdi){1to4}, %ymm1, %ymm0 121; ENABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 122; ENABLE-NEXT: retq 123; 124; DISABLE-LABEL: permq_broadcast_256: 125; DISABLE: # %bb.0: 126; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 127; DISABLE-NEXT: #APP 128; DISABLE-NEXT: nop 129; DISABLE-NEXT: #NO_APP 130; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 131; DISABLE-NEXT: vpermq (%rdi){1to4}, %ymm1, %ymm0 132; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 133; DISABLE-NEXT: retq 134 %1 = tail call <2 x 
i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 135 %v0 = load i64, ptr %p0, align 4 136 %t0 = insertelement <4 x i64> undef, i64 %v0, i64 0 137 %a0 = shufflevector <4 x i64> %t0, <4 x i64> undef, <4 x i32> zeroinitializer 138 %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx) 139 %res = add <4 x i64> %2, %idx 140 ret <4 x i64> %res 141} 142 143define <4 x i64> @permq_maskz_256(<4 x i64> %a0, <4 x i64> %idx, ptr %mask) { 144; ENABLE-LABEL: permq_maskz_256: 145; ENABLE: # %bb.0: 146; ENABLE-NEXT: #APP 147; ENABLE-NEXT: nop 148; ENABLE-NEXT: #NO_APP 149; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 150; ENABLE-NEXT: vpermq %ymm0, %ymm1, %ymm2 151; ENABLE-NEXT: kmovb (%rdi), %k1 152; ENABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 153; ENABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0 {%k1} 154; ENABLE-NEXT: retq 155; 156; DISABLE-LABEL: permq_maskz_256: 157; DISABLE: # %bb.0: 158; DISABLE-NEXT: #APP 159; DISABLE-NEXT: nop 160; DISABLE-NEXT: #NO_APP 161; DISABLE-NEXT: vpermq %ymm0, %ymm1, %ymm2 162; DISABLE-NEXT: kmovb (%rdi), %k1 163; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 164; DISABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0 {%k1} 165; DISABLE-NEXT: retq 166 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 167 %2 = load i8, ptr %mask 168 %3 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx, <4 x i64> zeroinitializer, i8 %2) 169 %t = add <4 x i64> %a0, %idx 170 %res = add <4 x i64> %3, %t 171 ret <4 x i64> 
%res 172} 173 174declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) 175declare <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) 176 177define <8 x i64> @permq_rr_512(<8 x i64> %a0, <8 x i64> %idx) { 178; ENABLE-LABEL: permq_rr_512: 179; ENABLE: # %bb.0: 180; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 181; ENABLE-NEXT: #APP 182; ENABLE-NEXT: nop 183; ENABLE-NEXT: #NO_APP 184; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 185; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 186; ENABLE-NEXT: vpermq %zmm0, %zmm2, %zmm1 187; ENABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0 188; ENABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 189; ENABLE-NEXT: retq 190; 191; DISABLE-LABEL: permq_rr_512: 192; DISABLE: # %bb.0: 193; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 194; DISABLE-NEXT: #APP 195; DISABLE-NEXT: nop 196; DISABLE-NEXT: #NO_APP 197; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 198; DISABLE-NEXT: vpermq %zmm0, %zmm2, %zmm1 199; DISABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0 200; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 201; DISABLE-NEXT: retq 202 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 203 %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx) 204 %t = add <8 x i64> %a0, %idx 205 %res = add <8 x i64> %t, %2 206 ret <8 x i64> %res 207} 208 209define <8 x i64> @permq_rm_512(ptr %p0, <8 x i64> %idx) { 210; ENABLE-LABEL: permq_rm_512: 211; ENABLE: # %bb.0: 212; ENABLE-NEXT: #APP 213; ENABLE-NEXT: nop 214; ENABLE-NEXT: #NO_APP 215; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 216; ENABLE-NEXT: vpermq (%rdi), %zmm0, %zmm1 217; ENABLE-NEXT: vpaddq 
%zmm1, %zmm0, %zmm0 218; ENABLE-NEXT: retq 219; 220; DISABLE-LABEL: permq_rm_512: 221; DISABLE: # %bb.0: 222; DISABLE-NEXT: #APP 223; DISABLE-NEXT: nop 224; DISABLE-NEXT: #NO_APP 225; DISABLE-NEXT: vpermq (%rdi), %zmm0, %zmm1 226; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 227; DISABLE-NEXT: retq 228 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 229 %a0 = load <8 x i64>, ptr %p0, align 64 230 %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx) 231 %res = add <8 x i64> %idx, %2 232 ret <8 x i64> %res 233} 234 235define <8 x i64> @permq_broadcast_512(ptr %p0, <8 x i64> %idx) { 236; ENABLE-LABEL: permq_broadcast_512: 237; ENABLE: # %bb.0: 238; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 239; ENABLE-NEXT: #APP 240; ENABLE-NEXT: nop 241; ENABLE-NEXT: #NO_APP 242; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 243; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 244; ENABLE-NEXT: vpermq (%rdi){1to8}, %zmm1, %zmm0 245; ENABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 246; ENABLE-NEXT: retq 247; 248; DISABLE-LABEL: permq_broadcast_512: 249; DISABLE: # %bb.0: 250; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 251; DISABLE-NEXT: #APP 252; DISABLE-NEXT: nop 253; DISABLE-NEXT: #NO_APP 254; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 255; DISABLE-NEXT: vpermq (%rdi){1to8}, %zmm1, %zmm0 256; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 257; DISABLE-NEXT: retq 258 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 259 %v0 = load i64, ptr %p0, align 4 260 %t0 = insertelement <8 x i64> undef, i64 %v0, i64 0 261 %a0 = shufflevector <8 x i64> %t0, <8 x i64> undef, <8 x i32> zeroinitializer 262 %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx) 263 %res = add <8 x i64> %2, %idx 264 ret <8 x i64> %res 265} 266 267define <8 x i64> @permq_maskz_512(<8 x i64> %a0, <8 x i64> %idx, ptr %mask) { 268; ENABLE-LABEL: permq_maskz_512: 269; ENABLE: # %bb.0: 270; ENABLE-NEXT: #APP 271; ENABLE-NEXT: nop 272; ENABLE-NEXT: #NO_APP 273; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2 274; ENABLE-NEXT: vpermq %zmm0, %zmm1, %zmm2 275; ENABLE-NEXT: kmovb (%rdi), %k1 276; ENABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 277; ENABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0 {%k1} 278; ENABLE-NEXT: retq 279; 280; DISABLE-LABEL: permq_maskz_512: 281; DISABLE: # %bb.0: 282; DISABLE-NEXT: #APP 283; DISABLE-NEXT: nop 284; DISABLE-NEXT: #NO_APP 285; DISABLE-NEXT: vpermq %zmm0, %zmm1, %zmm2 286; DISABLE-NEXT: kmovb (%rdi), %k1 287; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 288; DISABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0 {%k1} 289; DISABLE-NEXT: retq 290 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 291 %2 = load i8, ptr %mask 292 %3 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx, <8 x i64> zeroinitializer, i8 %2) 293 %t = add <8 x i64> %a0, %idx 294 %res = add <8 x i64> %3, %t 295 ret <8 x i64> %res 296} 297 298declare <8 x 
i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) 299declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) 300 301define <8 x i32> @permd_rr_256(<8 x i32> %a0, <8 x i32> %idx) { 302; ENABLE-LABEL: permd_rr_256: 303; ENABLE: # %bb.0: 304; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 305; ENABLE-NEXT: #APP 306; ENABLE-NEXT: nop 307; ENABLE-NEXT: #NO_APP 308; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 309; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 310; ENABLE-NEXT: vpermd %ymm0, %ymm2, %ymm1 311; ENABLE-NEXT: vpaddd %ymm2, %ymm0, %ymm0 312; ENABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 313; ENABLE-NEXT: retq 314; 315; DISABLE-LABEL: permd_rr_256: 316; DISABLE: # %bb.0: 317; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 318; DISABLE-NEXT: #APP 319; DISABLE-NEXT: nop 320; DISABLE-NEXT: #NO_APP 321; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 322; DISABLE-NEXT: vpermd %ymm0, %ymm2, %ymm1 323; DISABLE-NEXT: vpaddd %ymm2, %ymm0, %ymm0 324; DISABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 325; DISABLE-NEXT: retq 326 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 327 %2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> undef, i8 -1) 328 %t = add <8 x i32> %a0, %idx 329 %res = add <8 x i32> %t, %2 330 ret <8 x i32> %res 331} 332 333define <8 x i32> @permd_rm_256(ptr %p0, <8 x i32> %idx) { 334; ENABLE-LABEL: permd_rm_256: 335; ENABLE: # %bb.0: 336; ENABLE-NEXT: #APP 337; ENABLE-NEXT: nop 338; ENABLE-NEXT: #NO_APP 339; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 340; ENABLE-NEXT: vpermd (%rdi), %ymm0, %ymm1 341; ENABLE-NEXT: vpaddd 
%ymm1, %ymm0, %ymm0 342; ENABLE-NEXT: retq 343; 344; DISABLE-LABEL: permd_rm_256: 345; DISABLE: # %bb.0: 346; DISABLE-NEXT: #APP 347; DISABLE-NEXT: nop 348; DISABLE-NEXT: #NO_APP 349; DISABLE-NEXT: vpermd (%rdi), %ymm0, %ymm1 350; DISABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 351; DISABLE-NEXT: retq 352 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 353 %a0 = load <8 x i32>, ptr %p0, align 64 354 %2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> undef, i8 -1) 355 %res = add <8 x i32> %idx, %2 356 ret <8 x i32> %res 357} 358 359define <8 x i32> @permd_broadcast_256(ptr %p0, <8 x i32> %idx) { 360; ENABLE-LABEL: permd_broadcast_256: 361; ENABLE: # %bb.0: 362; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 363; ENABLE-NEXT: #APP 364; ENABLE-NEXT: nop 365; ENABLE-NEXT: #NO_APP 366; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 367; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 368; ENABLE-NEXT: vpermd (%rdi){1to8}, %ymm1, %ymm0 369; ENABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 370; ENABLE-NEXT: retq 371; 372; DISABLE-LABEL: permd_broadcast_256: 373; DISABLE: # %bb.0: 374; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 375; DISABLE-NEXT: #APP 376; DISABLE-NEXT: nop 377; DISABLE-NEXT: #NO_APP 378; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 379; DISABLE-NEXT: vpermd (%rdi){1to8}, %ymm1, %ymm0 380; DISABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 381; DISABLE-NEXT: retq 382 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 383 %v0 = load i32, ptr %p0, align 4 384 %t0 = insertelement <8 x i32> undef, i32 %v0, i32 0 385 %a0 = shufflevector <8 x i32> %t0, <8 x i32> undef, <8 x i32> zeroinitializer 386 %2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> zeroinitializer, i8 -1) 387 %res = add <8 x i32> %2, %idx 388 ret <8 x i32> %res 389} 390 391define <8 x i32> @permd_maskz_256(<8 x i32> %a0, <8 x i32> %idx, ptr %mask) { 392; ENABLE-LABEL: permd_maskz_256: 393; ENABLE: # %bb.0: 394; ENABLE-NEXT: #APP 395; ENABLE-NEXT: nop 396; ENABLE-NEXT: #NO_APP 397; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 398; ENABLE-NEXT: vpermd %ymm0, %ymm1, %ymm2 399; ENABLE-NEXT: kmovb (%rdi), %k1 400; ENABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 401; ENABLE-NEXT: vpaddd %ymm2, %ymm0, %ymm0 {%k1} 402; ENABLE-NEXT: retq 403; 404; DISABLE-LABEL: permd_maskz_256: 405; DISABLE: # %bb.0: 406; DISABLE-NEXT: #APP 407; DISABLE-NEXT: nop 408; DISABLE-NEXT: #NO_APP 409; DISABLE-NEXT: vpermd %ymm0, %ymm1, %ymm2 410; DISABLE-NEXT: kmovb (%rdi), %k1 411; DISABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 412; DISABLE-NEXT: vpaddd %ymm2, %ymm0, %ymm0 {%k1} 413; DISABLE-NEXT: retq 414 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 415 %2 = load i8, ptr %mask 416 %3 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> zeroinitializer, i8 %2) 417 %t = add <8 x i32> %a0, %idx 418 %res = add <8 x i32> %3, %t 419 ret 
<8 x i32> %res 420} 421 422declare <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) 423 424define <16 x i32> @permd_rr_512(<16 x i32> %a0, <16 x i32> %idx) { 425; ENABLE-LABEL: permd_rr_512: 426; ENABLE: # %bb.0: 427; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 428; ENABLE-NEXT: #APP 429; ENABLE-NEXT: nop 430; ENABLE-NEXT: #NO_APP 431; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 432; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 433; ENABLE-NEXT: vpermd %zmm0, %zmm2, %zmm1 434; ENABLE-NEXT: vpaddd %zmm2, %zmm0, %zmm0 435; ENABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0 436; ENABLE-NEXT: retq 437; 438; DISABLE-LABEL: permd_rr_512: 439; DISABLE: # %bb.0: 440; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 441; DISABLE-NEXT: #APP 442; DISABLE-NEXT: nop 443; DISABLE-NEXT: #NO_APP 444; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 445; DISABLE-NEXT: vpermd %zmm0, %zmm2, %zmm1 446; DISABLE-NEXT: vpaddd %zmm2, %zmm0, %zmm0 447; DISABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0 448; DISABLE-NEXT: retq 449 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 450 %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> undef, i16 -1) 451 %t = add <16 x i32> %a0, %idx 452 %res = add <16 x i32> %t, %2 453 ret <16 x i32> %res 454} 455 456define <16 x i32> @permd_rm_512(ptr %p0, <16 x i32> %idx) { 457; ENABLE-LABEL: permd_rm_512: 458; ENABLE: # %bb.0: 459; ENABLE-NEXT: #APP 460; ENABLE-NEXT: nop 461; ENABLE-NEXT: #NO_APP 462; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 463; ENABLE-NEXT: vpermd (%rdi), %zmm0, %zmm1 464; ENABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0 
465; ENABLE-NEXT: retq 466; 467; DISABLE-LABEL: permd_rm_512: 468; DISABLE: # %bb.0: 469; DISABLE-NEXT: #APP 470; DISABLE-NEXT: nop 471; DISABLE-NEXT: #NO_APP 472; DISABLE-NEXT: vpermd (%rdi), %zmm0, %zmm1 473; DISABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0 474; DISABLE-NEXT: retq 475 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 476 %a0 = load <16 x i32>, ptr %p0, align 64 477 %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> undef, i16 -1) 478 %res = add <16 x i32> %idx, %2 479 ret <16 x i32> %res 480} 481 482define <16 x i32> @permd_broadcast_512(ptr %p0, <16 x i32> %idx) { 483; ENABLE-LABEL: permd_broadcast_512: 484; ENABLE: # %bb.0: 485; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 486; ENABLE-NEXT: #APP 487; ENABLE-NEXT: nop 488; ENABLE-NEXT: #NO_APP 489; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 490; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 491; ENABLE-NEXT: vpermd (%rdi){1to16}, %zmm1, %zmm0 492; ENABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0 493; ENABLE-NEXT: retq 494; 495; DISABLE-LABEL: permd_broadcast_512: 496; DISABLE: # %bb.0: 497; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 498; DISABLE-NEXT: #APP 499; DISABLE-NEXT: nop 500; DISABLE-NEXT: #NO_APP 501; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 502; DISABLE-NEXT: vpermd (%rdi){1to16}, %zmm1, %zmm0 503; DISABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0 504; DISABLE-NEXT: retq 505 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 506 %v0 = load i32, ptr %p0, align 4 507 %t0 = insertelement <16 x i32> undef, i32 %v0, i32 0 508 %a0 = shufflevector <16 x i32> %t0, <16 x i32> undef, <16 x i32> zeroinitializer 509 %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> undef, i16 -1) 510 %res = add <16 x i32> %2, %idx 511 ret <16 x i32> %res 512} 513 514define <16 x i32> @permd_maskz_512(<16 x i32> %a0, <16 x i32> %idx, ptr %mask) { 515; ENABLE-LABEL: permd_maskz_512: 516; ENABLE: # %bb.0: 517; ENABLE-NEXT: #APP 518; ENABLE-NEXT: nop 519; ENABLE-NEXT: #NO_APP 520; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2 521; ENABLE-NEXT: vpermd %zmm0, %zmm1, %zmm2 522; ENABLE-NEXT: kmovw (%rdi), %k1 523; ENABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0 524; ENABLE-NEXT: vpaddd %zmm2, %zmm0, %zmm0 {%k1} 525; ENABLE-NEXT: retq 526; 527; DISABLE-LABEL: permd_maskz_512: 528; DISABLE: # %bb.0: 529; DISABLE-NEXT: #APP 530; DISABLE-NEXT: nop 531; DISABLE-NEXT: #NO_APP 532; DISABLE-NEXT: vpermd %zmm0, %zmm1, %zmm2 533; DISABLE-NEXT: kmovw (%rdi), %k1 534; DISABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0 535; DISABLE-NEXT: vpaddd %zmm2, %zmm0, %zmm0 {%k1} 536; DISABLE-NEXT: retq 537 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 538 %2 = load i16, ptr %mask 539 %3 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> zeroinitializer, i16 %2) 540 %t = add <16 x i32> %a0, %idx 541 %res = add <16 x i32> %3, 
%t 542 ret <16 x i32> %res 543} 544 545declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) 546 547define <4 x double> @permpd_ri_256(<4 x double> %a0) { 548; ENABLE-LABEL: permpd_ri_256: 549; ENABLE: # %bb.0: 550; ENABLE-NEXT: #APP 551; ENABLE-NEXT: nop 552; ENABLE-NEXT: #NO_APP 553; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 554; ENABLE-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,2,1,0] 555; ENABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 556; ENABLE-NEXT: retq 557; 558; DISABLE-LABEL: permpd_ri_256: 559; DISABLE: # %bb.0: 560; DISABLE-NEXT: #APP 561; DISABLE-NEXT: nop 562; DISABLE-NEXT: #NO_APP 563; DISABLE-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,2,1,0] 564; DISABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 565; DISABLE-NEXT: retq 566 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 567 %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0> 568 %res = fadd <4 x double> %2, %a0 569 ret <4 x double> %res 570} 571 572define <4 x double> @permpd_rr_256(<4 x double> %a0, <4 x i64> %idx) { 573; ENABLE-LABEL: permpd_rr_256: 574; ENABLE: # %bb.0: 575; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 576; ENABLE-NEXT: #APP 577; ENABLE-NEXT: nop 578; ENABLE-NEXT: #NO_APP 579; ENABLE-NEXT: vmovapd %ymm0, %ymm2 580; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 581; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 582; ENABLE-NEXT: vpermpd %ymm2, %ymm0, %ymm1 583; ENABLE-NEXT: vcvtqq2pd %ymm0, %ymm0 584; ENABLE-NEXT: vaddpd %ymm0, %ymm2, %ymm0 585; ENABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 586; ENABLE-NEXT: retq 587; 588; DISABLE-LABEL: permpd_rr_256: 589; DISABLE: # %bb.0: 590; DISABLE-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 591; DISABLE-NEXT: #APP 592; DISABLE-NEXT: nop 593; DISABLE-NEXT: #NO_APP 594; DISABLE-NEXT: vmovapd %ymm0, %ymm2 595; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 596; DISABLE-NEXT: vpermpd %ymm2, %ymm0, %ymm1 597; DISABLE-NEXT: vcvtqq2pd %ymm0, %ymm0 598; DISABLE-NEXT: vaddpd %ymm0, %ymm2, %ymm0 599; DISABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 600; DISABLE-NEXT: retq 601 %1 = tail call <4 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 602 %2 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %1, <4 x i64> %idx) 603 %a1 = sitofp <4 x i64> %idx to <4 x double> 604 %t = fadd <4 x double> %1, %a1 605 %res = fadd <4 x double> %2, %t 606 ret <4 x double> %res 607} 608 609define <4 x double> @permpd_rm_256(ptr %p0, <4 x i64> %idx) { 610; ENABLE-LABEL: permpd_rm_256: 611; ENABLE: # %bb.0: 612; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 613; ENABLE-NEXT: #APP 614; ENABLE-NEXT: nop 615; ENABLE-NEXT: #NO_APP 616; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 617; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 618; ENABLE-NEXT: vpermpd (%rdi), %ymm1, %ymm0 619; ENABLE-NEXT: vcvtqq2pd %ymm1, %ymm1 620; ENABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 621; ENABLE-NEXT: retq 622; 623; DISABLE-LABEL: permpd_rm_256: 624; DISABLE: # %bb.0: 625; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 626; DISABLE-NEXT: #APP 627; DISABLE-NEXT: nop 628; DISABLE-NEXT: #NO_APP 629; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 630; DISABLE-NEXT: vpermpd (%rdi), %ymm1, %ymm0 631; DISABLE-NEXT: vcvtqq2pd %ymm1, %ymm1 632; DISABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 633; 
DISABLE-NEXT: retq 634 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 635 %a0 = load <4 x double>, ptr %p0, align 64 636 %2 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> %idx) 637 %a1 = sitofp <4 x i64> %idx to <4 x double> 638 %res = fadd <4 x double> %2, %a1 639 ret <4 x double> %res 640} 641 642define <4 x double> @permpd_mi_256(ptr %p0) { 643; ENABLE-LABEL: permpd_mi_256: 644; ENABLE: # %bb.0: 645; ENABLE-NEXT: #APP 646; ENABLE-NEXT: nop 647; ENABLE-NEXT: #NO_APP 648; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 649; ENABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,2,0] 650; ENABLE-NEXT: retq 651; 652; DISABLE-LABEL: permpd_mi_256: 653; DISABLE: # %bb.0: 654; DISABLE-NEXT: #APP 655; DISABLE-NEXT: nop 656; DISABLE-NEXT: #NO_APP 657; DISABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,2,0] 658; DISABLE-NEXT: retq 659 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 660 %a0 = load <4 x double>, ptr %p0, align 64 661 %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0> 662 ret <4 x double> %2 663} 664 665define <4 x double> @permpd_broadcast_256(ptr %p0, <4 x i64> %idx) { 666; ENABLE-LABEL: permpd_broadcast_256: 667; ENABLE: # %bb.0: 668; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 669; ENABLE-NEXT: #APP 670; ENABLE-NEXT: nop 671; ENABLE-NEXT: #NO_APP 672; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte 
Reload 673; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 674; ENABLE-NEXT: vpermpd (%rdi){1to4}, %ymm1, %ymm0 675; ENABLE-NEXT: vcvtqq2pd %ymm1, %ymm1 676; ENABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 677; ENABLE-NEXT: retq 678; 679; DISABLE-LABEL: permpd_broadcast_256: 680; DISABLE: # %bb.0: 681; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 682; DISABLE-NEXT: #APP 683; DISABLE-NEXT: nop 684; DISABLE-NEXT: #NO_APP 685; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 686; DISABLE-NEXT: vpermpd (%rdi){1to4}, %ymm1, %ymm0 687; DISABLE-NEXT: vcvtqq2pd %ymm1, %ymm1 688; DISABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 689; DISABLE-NEXT: retq 690 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 691 %v0 = load double, ptr %p0, align 4 692 %t0 = insertelement <4 x double> undef, double %v0, i64 0 693 %a0 = shufflevector <4 x double> %t0, <4 x double> undef, <4 x i32> zeroinitializer 694 %2 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> %idx) 695 %a1 = sitofp <4 x i64> %idx to <4 x double> 696 %res = fadd <4 x double> %2, %a1 697 ret <4 x double> %res 698} 699 700define <4 x double> @permpd_maskz_256(<4 x double> %a0, <4 x i64> %idx, ptr %mask) { 701; ENABLE-LABEL: permpd_maskz_256: 702; ENABLE: # %bb.0: 703; ENABLE-NEXT: #APP 704; ENABLE-NEXT: nop 705; ENABLE-NEXT: #NO_APP 706; ENABLE-NEXT: kmovb (%rdi), %k1 707; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 708; ENABLE-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} {z} 709; ENABLE-NEXT: vcvtqq2pd %ymm1, %ymm1 710; ENABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 711; ENABLE-NEXT: vaddpd %ymm0, %ymm2, %ymm0 712; ENABLE-NEXT: retq 713; 714; DISABLE-LABEL: permpd_maskz_256: 715; DISABLE: # %bb.0: 716; 
DISABLE-NEXT: #APP 717; DISABLE-NEXT: nop 718; DISABLE-NEXT: #NO_APP 719; DISABLE-NEXT: kmovb (%rdi), %k1 720; DISABLE-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} {z} 721; DISABLE-NEXT: vcvtqq2pd %ymm1, %ymm1 722; DISABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 723; DISABLE-NEXT: vaddpd %ymm0, %ymm2, %ymm0 724; DISABLE-NEXT: retq 725 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 726 %2 = load i8, ptr %mask 727 %3 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %a0, <4 x i64> %idx, <4 x double> zeroinitializer, i8 %2) 728 %a1 = sitofp <4 x i64> %idx to <4 x double> 729 %t = fadd <4 x double> %a0, %a1 730 %res = fadd <4 x double> %3, %t 731 ret <4 x double> %res 732} 733 734declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>) 735declare <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double>, <4 x i64>, <4 x double>, i8) 736 737define <8 x double> @permpd_rr_512(<8 x double> %a0, <8 x i64> %idx) { 738; ENABLE-LABEL: permpd_rr_512: 739; ENABLE: # %bb.0: 740; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 741; ENABLE-NEXT: #APP 742; ENABLE-NEXT: nop 743; ENABLE-NEXT: #NO_APP 744; ENABLE-NEXT: vmovapd %zmm0, %zmm2 745; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 746; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 747; ENABLE-NEXT: vpermpd %zmm2, %zmm0, %zmm1 748; ENABLE-NEXT: vcvtqq2pd %zmm0, %zmm0 749; ENABLE-NEXT: vaddpd %zmm0, %zmm2, %zmm0 750; ENABLE-NEXT: vaddpd %zmm0, %zmm1, %zmm0 751; ENABLE-NEXT: retq 752; 753; DISABLE-LABEL: permpd_rr_512: 754; DISABLE: # %bb.0: 755; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 756; DISABLE-NEXT: #APP 757; DISABLE-NEXT: nop 758; DISABLE-NEXT: #NO_APP 759; 
DISABLE-NEXT: vmovapd %zmm0, %zmm2 760; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 761; DISABLE-NEXT: vpermpd %zmm2, %zmm0, %zmm1 762; DISABLE-NEXT: vcvtqq2pd %zmm0, %zmm0 763; DISABLE-NEXT: vaddpd %zmm0, %zmm2, %zmm0 764; DISABLE-NEXT: vaddpd %zmm0, %zmm1, %zmm0 765; DISABLE-NEXT: retq 766 %1 = tail call <8 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 767 %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %1, <8 x i64> %idx) 768 %a1 = sitofp <8 x i64> %idx to <8 x double> 769 %t = fadd <8 x double> %1, %a1 770 %res = fadd <8 x double> %2, %t 771 ret <8 x double> %res 772} 773 774define <8 x double> @permpd_rm_512(ptr %p0, <8 x i64> %idx) { 775; ENABLE-LABEL: permpd_rm_512: 776; ENABLE: # %bb.0: 777; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 778; ENABLE-NEXT: #APP 779; ENABLE-NEXT: nop 780; ENABLE-NEXT: #NO_APP 781; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 782; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 783; ENABLE-NEXT: vpermpd (%rdi), %zmm1, %zmm0 784; ENABLE-NEXT: vcvtqq2pd %zmm1, %zmm1 785; ENABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0 786; ENABLE-NEXT: retq 787; 788; DISABLE-LABEL: permpd_rm_512: 789; DISABLE: # %bb.0: 790; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 791; DISABLE-NEXT: #APP 792; DISABLE-NEXT: nop 793; DISABLE-NEXT: #NO_APP 794; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 795; DISABLE-NEXT: vpermpd (%rdi), %zmm1, %zmm0 796; DISABLE-NEXT: vcvtqq2pd %zmm1, %zmm1 797; DISABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0 798; DISABLE-NEXT: retq 799 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 800 %a0 = load <8 x double>, ptr %p0, align 64 801 %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> %idx) 802 %a1 = sitofp <8 x i64> %idx to <8 x double> 803 %res = fadd <8 x double> %2, %a1 804 ret <8 x double> %res 805} 806 807define <8 x double> @permpd_broadcast_512(ptr %p0, <8 x i64> %idx) { 808; ENABLE-LABEL: permpd_broadcast_512: 809; ENABLE: # %bb.0: 810; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 811; ENABLE-NEXT: #APP 812; ENABLE-NEXT: nop 813; ENABLE-NEXT: #NO_APP 814; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 815; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 816; ENABLE-NEXT: vpermpd (%rdi){1to8}, %zmm1, %zmm0 817; ENABLE-NEXT: vcvtqq2pd %zmm1, %zmm1 818; ENABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0 819; ENABLE-NEXT: retq 820; 821; DISABLE-LABEL: permpd_broadcast_512: 822; DISABLE: # %bb.0: 823; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 824; DISABLE-NEXT: #APP 825; DISABLE-NEXT: nop 826; DISABLE-NEXT: #NO_APP 827; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 828; DISABLE-NEXT: vpermpd (%rdi){1to8}, %zmm1, %zmm0 829; DISABLE-NEXT: vcvtqq2pd %zmm1, %zmm1 830; DISABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0 831; DISABLE-NEXT: retq 832 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 833 %v0 = load double, ptr %p0, align 4 834 %t0 = insertelement <8 x double> undef, double 
%v0, i64 0 835 %a0 = shufflevector <8 x double> %t0, <8 x double> undef, <8 x i32> zeroinitializer 836 %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> %idx) 837 %a1 = sitofp <8 x i64> %idx to <8 x double> 838 %res = fadd <8 x double> %2, %a1 839 ret <8 x double> %res 840} 841 842define <8 x double> @permpd_maskz_512(<8 x double> %a0, <8 x i64> %idx, ptr %mask) { 843; ENABLE-LABEL: permpd_maskz_512: 844; ENABLE: # %bb.0: 845; ENABLE-NEXT: #APP 846; ENABLE-NEXT: nop 847; ENABLE-NEXT: #NO_APP 848; ENABLE-NEXT: kmovb (%rdi), %k1 849; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2 850; ENABLE-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1} {z} 851; ENABLE-NEXT: vcvtqq2pd %zmm1, %zmm1 852; ENABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0 853; ENABLE-NEXT: vaddpd %zmm0, %zmm2, %zmm0 854; ENABLE-NEXT: retq 855; 856; DISABLE-LABEL: permpd_maskz_512: 857; DISABLE: # %bb.0: 858; DISABLE-NEXT: #APP 859; DISABLE-NEXT: nop 860; DISABLE-NEXT: #NO_APP 861; DISABLE-NEXT: kmovb (%rdi), %k1 862; DISABLE-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1} {z} 863; DISABLE-NEXT: vcvtqq2pd %zmm1, %zmm1 864; DISABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0 865; DISABLE-NEXT: vaddpd %zmm0, %zmm2, %zmm0 866; DISABLE-NEXT: retq 867 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 868 %2 = load i8, ptr %mask 869 %3 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %a0, <8 x i64> %idx, <8 x double> zeroinitializer, i8 %2) 870 %a1 = sitofp <8 x i64> %idx to <8 x double> 871 %t = fadd <8 x double> %a0, %a1 872 %res = fadd <8 x double> %3, %t 873 ret <8 x double> %res 874} 875 876declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) 877declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x 
double>, <8 x i64>, <8 x double>, i8) 878 879 880define <8 x float> @permps_rr_256(<8 x float> %a0, <8 x i32> %idx) { 881; ENABLE-LABEL: permps_rr_256: 882; ENABLE: # %bb.0: 883; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 884; ENABLE-NEXT: #APP 885; ENABLE-NEXT: nop 886; ENABLE-NEXT: #NO_APP 887; ENABLE-NEXT: vmovaps %ymm0, %ymm2 888; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 889; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 890; ENABLE-NEXT: vpermps %ymm2, %ymm0, %ymm1 891; ENABLE-NEXT: vcvtdq2ps %ymm0, %ymm0 892; ENABLE-NEXT: vaddps %ymm0, %ymm2, %ymm0 893; ENABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0 894; ENABLE-NEXT: retq 895; 896; DISABLE-LABEL: permps_rr_256: 897; DISABLE: # %bb.0: 898; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 899; DISABLE-NEXT: #APP 900; DISABLE-NEXT: nop 901; DISABLE-NEXT: #NO_APP 902; DISABLE-NEXT: vmovaps %ymm0, %ymm2 903; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 904; DISABLE-NEXT: vpermps %ymm2, %ymm0, %ymm1 905; DISABLE-NEXT: vcvtdq2ps %ymm0, %ymm0 906; DISABLE-NEXT: vaddps %ymm0, %ymm2, %ymm0 907; DISABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0 908; DISABLE-NEXT: retq 909 %1 = tail call <8 x float> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 910 %2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %1, <8 x i32> %idx, <8 x float> zeroinitializer, i8 -1) 911 %a1 = sitofp <8 x i32> %idx to <8 x float> 912 %t = fadd <8 x float> %1, %a1 913 %res = fadd <8 x float> %2, %t 914 ret <8 x float> %res 915} 916 917define <8 x float> @permps_rm_256(ptr %p0, <8 x i32> %idx) { 918; ENABLE-LABEL: permps_rm_256: 919; ENABLE: # %bb.0: 920; ENABLE-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 921; ENABLE-NEXT: #APP 922; ENABLE-NEXT: nop 923; ENABLE-NEXT: #NO_APP 924; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 925; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 926; ENABLE-NEXT: vpermps (%rdi), %ymm1, %ymm0 927; ENABLE-NEXT: vcvtdq2ps %ymm1, %ymm1 928; ENABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0 929; ENABLE-NEXT: retq 930; 931; DISABLE-LABEL: permps_rm_256: 932; DISABLE: # %bb.0: 933; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 934; DISABLE-NEXT: #APP 935; DISABLE-NEXT: nop 936; DISABLE-NEXT: #NO_APP 937; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 938; DISABLE-NEXT: vpermps (%rdi), %ymm1, %ymm0 939; DISABLE-NEXT: vcvtdq2ps %ymm1, %ymm1 940; DISABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0 941; DISABLE-NEXT: retq 942 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 943 %a0 = load <8 x float>, ptr %p0, align 64 944 %2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> %idx, <8 x float> zeroinitializer, i8 -1) 945 %a1 = sitofp <8 x i32> %idx to <8 x float> 946 %res = fadd <8 x float> %2, %a1 947 ret <8 x float> %res 948} 949 950define <8 x float> @permps_broadcast_256(ptr %p0, <8 x i32> %idx) { 951; ENABLE-LABEL: permps_broadcast_256: 952; ENABLE: # %bb.0: 953; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 954; ENABLE-NEXT: #APP 955; ENABLE-NEXT: nop 956; ENABLE-NEXT: #NO_APP 957; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 958; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 959; ENABLE-NEXT: vpermps (%rdi){1to8}, %ymm1, %ymm0 960; ENABLE-NEXT: vcvtdq2ps %ymm1, %ymm1 961; ENABLE-NEXT: vaddps %ymm1, %ymm0, 
%ymm0 962; ENABLE-NEXT: retq 963; 964; DISABLE-LABEL: permps_broadcast_256: 965; DISABLE: # %bb.0: 966; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 967; DISABLE-NEXT: #APP 968; DISABLE-NEXT: nop 969; DISABLE-NEXT: #NO_APP 970; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 971; DISABLE-NEXT: vpermps (%rdi){1to8}, %ymm1, %ymm0 972; DISABLE-NEXT: vcvtdq2ps %ymm1, %ymm1 973; DISABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0 974; DISABLE-NEXT: retq 975 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 976 %v0 = load float, ptr %p0, align 4 977 %t0 = insertelement <8 x float> undef, float %v0, i32 0 978 %a0 = shufflevector <8 x float> %t0, <8 x float> undef, <8 x i32> zeroinitializer 979 %2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> %idx, <8 x float> zeroinitializer, i8 -1) 980 %a1 = sitofp <8 x i32> %idx to <8 x float> 981 %res = fadd <8 x float> %2, %a1 982 ret <8 x float> %res 983} 984 985define <8 x float> @permps_maskz_256(<8 x float> %a0, <8 x i32> %idx, ptr %mask) { 986; ENABLE-LABEL: permps_maskz_256: 987; ENABLE: # %bb.0: 988; ENABLE-NEXT: #APP 989; ENABLE-NEXT: nop 990; ENABLE-NEXT: #NO_APP 991; ENABLE-NEXT: kmovb (%rdi), %k1 992; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 993; ENABLE-NEXT: vpermps %ymm0, %ymm1, %ymm2 {%k1} {z} 994; ENABLE-NEXT: vcvtdq2ps %ymm1, %ymm1 995; ENABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0 996; ENABLE-NEXT: vaddps %ymm0, %ymm2, %ymm0 997; ENABLE-NEXT: retq 998; 999; DISABLE-LABEL: permps_maskz_256: 1000; DISABLE: # %bb.0: 1001; DISABLE-NEXT: #APP 1002; DISABLE-NEXT: nop 1003; DISABLE-NEXT: #NO_APP 1004; DISABLE-NEXT: kmovb (%rdi), %k1 1005; DISABLE-NEXT: vpermps %ymm0, %ymm1, %ymm2 
{%k1} {z} 1006; DISABLE-NEXT: vcvtdq2ps %ymm1, %ymm1 1007; DISABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0 1008; DISABLE-NEXT: vaddps %ymm0, %ymm2, %ymm0 1009; DISABLE-NEXT: retq 1010 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1011 %2 = load i8, ptr %mask 1012 %3 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> %idx, <8 x float> zeroinitializer, i8 %2) 1013 %a1 = sitofp <8 x i32> %idx to <8 x float> 1014 %t = fadd <8 x float> %a0, %a1 1015 %res = fadd <8 x float> %3, %t 1016 ret <8 x float> %res 1017} 1018 1019declare <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float>, <8 x i32>, <8 x float>, i8) 1020 1021define <16 x float> @permps_rr_512(<16 x float> %a0, <16 x i32> %idx) { 1022; ENABLE-LABEL: permps_rr_512: 1023; ENABLE: # %bb.0: 1024; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1025; ENABLE-NEXT: #APP 1026; ENABLE-NEXT: nop 1027; ENABLE-NEXT: #NO_APP 1028; ENABLE-NEXT: vmovaps %zmm0, %zmm2 1029; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 1030; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 1031; ENABLE-NEXT: vpermps %zmm2, %zmm0, %zmm1 1032; ENABLE-NEXT: vcvtdq2ps %zmm0, %zmm0 1033; ENABLE-NEXT: vaddps %zmm0, %zmm2, %zmm0 1034; ENABLE-NEXT: vaddps %zmm0, %zmm1, %zmm0 1035; ENABLE-NEXT: retq 1036; 1037; DISABLE-LABEL: permps_rr_512: 1038; DISABLE: # %bb.0: 1039; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1040; DISABLE-NEXT: #APP 1041; DISABLE-NEXT: nop 1042; DISABLE-NEXT: #NO_APP 1043; DISABLE-NEXT: vmovaps %zmm0, %zmm2 1044; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 1045; DISABLE-NEXT: vpermps %zmm2, %zmm0, %zmm1 1046; DISABLE-NEXT: vcvtdq2ps %zmm0, %zmm0 
1047; DISABLE-NEXT: vaddps %zmm0, %zmm2, %zmm0 1048; DISABLE-NEXT: vaddps %zmm0, %zmm1, %zmm0 1049; DISABLE-NEXT: retq 1050 %1 = tail call <16 x float> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1051 %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %1, <16 x i32> %idx) 1052 %a1 = sitofp <16 x i32> %idx to <16 x float> 1053 %t = fadd <16 x float> %1, %a1 1054 %res = fadd <16 x float> %2, %t 1055 ret <16 x float> %res 1056} 1057 1058define <16 x float> @permps_rm_512(ptr %p0, <16 x i32> %idx) { 1059; ENABLE-LABEL: permps_rm_512: 1060; ENABLE: # %bb.0: 1061; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1062; ENABLE-NEXT: #APP 1063; ENABLE-NEXT: nop 1064; ENABLE-NEXT: #NO_APP 1065; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 1066; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 1067; ENABLE-NEXT: vpermps (%rdi), %zmm1, %zmm0 1068; ENABLE-NEXT: vcvtdq2ps %zmm1, %zmm1 1069; ENABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0 1070; ENABLE-NEXT: retq 1071; 1072; DISABLE-LABEL: permps_rm_512: 1073; DISABLE: # %bb.0: 1074; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1075; DISABLE-NEXT: #APP 1076; DISABLE-NEXT: nop 1077; DISABLE-NEXT: #NO_APP 1078; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 1079; DISABLE-NEXT: vpermps (%rdi), %zmm1, %zmm0 1080; DISABLE-NEXT: vcvtdq2ps %zmm1, %zmm1 1081; DISABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0 1082; DISABLE-NEXT: retq 1083 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1084 %a0 = load <16 x float>, ptr %p0, align 64 1085 %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> %idx) 1086 %a1 = sitofp <16 x i32> %idx to <16 x float> 1087 %res = fadd <16 x float> %2, %a1 1088 ret <16 x float> %res 1089} 1090 1091define <16 x float> @permps_broadcast_512(ptr %p0, <16 x i32> %idx) { 1092; ENABLE-LABEL: permps_broadcast_512: 1093; ENABLE: # %bb.0: 1094; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1095; ENABLE-NEXT: #APP 1096; ENABLE-NEXT: nop 1097; ENABLE-NEXT: #NO_APP 1098; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 1099; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 1100; ENABLE-NEXT: vpermps (%rdi){1to16}, %zmm1, %zmm0 1101; ENABLE-NEXT: vcvtdq2ps %zmm1, %zmm1 1102; ENABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0 1103; ENABLE-NEXT: retq 1104; 1105; DISABLE-LABEL: permps_broadcast_512: 1106; DISABLE: # %bb.0: 1107; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1108; DISABLE-NEXT: #APP 1109; DISABLE-NEXT: nop 1110; DISABLE-NEXT: #NO_APP 1111; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 1112; DISABLE-NEXT: vpermps (%rdi){1to16}, %zmm1, %zmm0 1113; DISABLE-NEXT: vcvtdq2ps %zmm1, %zmm1 1114; DISABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0 1115; DISABLE-NEXT: retq 1116 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1117 %v0 = load float, ptr %p0, align 4 1118 %t0 = 
insertelement <16 x float> undef, float %v0, i32 0 1119 %a0 = shufflevector <16 x float> %t0, <16 x float> undef, <16 x i32> zeroinitializer 1120 %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> %idx) 1121 %a1 = sitofp <16 x i32> %idx to <16 x float> 1122 %res = fadd <16 x float> %2, %a1 1123 ret <16 x float> %res 1124} 1125 1126define <16 x float> @permps_maskz_512(<16 x float> %a0, <16 x i32> %idx, ptr %mask) { 1127; ENABLE-LABEL: permps_maskz_512: 1128; ENABLE: # %bb.0: 1129; ENABLE-NEXT: #APP 1130; ENABLE-NEXT: nop 1131; ENABLE-NEXT: #NO_APP 1132; ENABLE-NEXT: kmovw (%rdi), %k1 1133; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2 1134; ENABLE-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1} {z} 1135; ENABLE-NEXT: vcvtdq2ps %zmm1, %zmm1 1136; ENABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0 1137; ENABLE-NEXT: vaddps %zmm0, %zmm2, %zmm0 1138; ENABLE-NEXT: retq 1139; 1140; DISABLE-LABEL: permps_maskz_512: 1141; DISABLE: # %bb.0: 1142; DISABLE-NEXT: #APP 1143; DISABLE-NEXT: nop 1144; DISABLE-NEXT: #NO_APP 1145; DISABLE-NEXT: kmovw (%rdi), %k1 1146; DISABLE-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1} {z} 1147; DISABLE-NEXT: vcvtdq2ps %zmm1, %zmm1 1148; DISABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0 1149; DISABLE-NEXT: vaddps %zmm0, %zmm2, %zmm0 1150; DISABLE-NEXT: retq 1151 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1152 %2 = load i16, ptr %mask 1153 %3 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %a0, <16 x i32> %idx, <16 x float> zeroinitializer, i16 %2) 1154 %a1 = sitofp <16 x i32> %idx to <16 x float> 1155 %t = fadd <16 x float> %a0, %a1 1156 %res = fadd <16 x float> %3, %t 1157 ret <16 x float> %res 1158} 1159 1160declare <16 x float> 
@llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) 1161declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16) 1162