1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86 3; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64 4 5; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c 6 7define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) { 8; CHECK-LABEL: test_mm256_abs_epi8: 9; CHECK: # %bb.0: 10; CHECK-NEXT: vpabsb %ymm0, %ymm0 11; CHECK-NEXT: ret{{[l|q]}} 12 %arg = bitcast <4 x i64> %a0 to <32 x i8> 13 %abs = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %arg, i1 false) 14 %res = bitcast <32 x i8> %abs to <4 x i64> 15 ret <4 x i64> %res 16} 17declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) nounwind readnone 18 19define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) { 20; CHECK-LABEL: test_mm256_abs_epi16: 21; CHECK: # %bb.0: 22; CHECK-NEXT: vpabsw %ymm0, %ymm0 23; CHECK-NEXT: ret{{[l|q]}} 24 %arg = bitcast <4 x i64> %a0 to <16 x i16> 25 %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %arg, i1 false) 26 %res = bitcast <16 x i16> %abs to <4 x i64> 27 ret <4 x i64> %res 28} 29declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) nounwind readnone 30 31define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) { 32; CHECK-LABEL: test_mm256_abs_epi32: 33; CHECK: # %bb.0: 34; CHECK-NEXT: vpabsd %ymm0, %ymm0 35; CHECK-NEXT: ret{{[l|q]}} 36 %arg = bitcast <4 x i64> %a0 to <8 x i32> 37 %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %arg, i1 false) 38 %res = bitcast <8 x i32> %abs to <4 x i64> 39 ret <4 x i64> %res 40} 41declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) nounwind readnone 42 43define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind { 44; CHECK-LABEL: test_mm256_add_epi8: 45; CHECK: # %bb.0: 46; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 47; CHECK-NEXT: ret{{[l|q]}} 48 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 49 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 50 %res = add <32 x i8> %arg0, %arg1 51 %bc = bitcast <32 x i8> %res to <4 x i64> 52 ret <4 x i64> %bc 53} 54 55define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind { 56; CHECK-LABEL: test_mm256_add_epi16: 57; CHECK: # %bb.0: 58; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 59; CHECK-NEXT: ret{{[l|q]}} 60 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 61 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 62 %res = add <16 x i16> %arg0, %arg1 63 %bc = bitcast <16 x i16> %res to <4 x i64> 64 ret <4 x i64> %bc 65} 66 67define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 68; CHECK-LABEL: test_mm256_add_epi32: 69; CHECK: # %bb.0: 70; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 71; CHECK-NEXT: ret{{[l|q]}} 72 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 73 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 74 %res = add <8 x i32> %arg0, %arg1 75 %bc = bitcast <8 x i32> %res to <4 x i64> 76 ret <4 x i64> %bc 77} 78 79define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind { 80; CHECK-LABEL: test_mm256_add_epi64: 81; CHECK: # %bb.0: 82; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 83; CHECK-NEXT: ret{{[l|q]}} 84 %res = add <4 x i64> %a0, %a1 85 ret <4 x i64> %res 86} 87 88define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) { 89; CHECK-LABEL: test_mm256_adds_epi8: 90; CHECK: # %bb.0: 91; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 92; CHECK-NEXT: ret{{[l|q]}} 93 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 94 %arg1 = bitcast <4 x i64> 
%a1 to <32 x i8> 95 %res = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1) 96 %bc = bitcast <32 x i8> %res to <4 x i64> 97 ret <4 x i64> %bc 98} 99declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone 100 101define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) { 102; CHECK-LABEL: test_mm256_adds_epi16: 103; CHECK: # %bb.0: 104; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 105; CHECK-NEXT: ret{{[l|q]}} 106 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 107 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 108 %res = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1) 109 %bc = bitcast <16 x i16> %res to <4 x i64> 110 ret <4 x i64> %bc 111} 112declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone 113 114define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) { 115; CHECK-LABEL: test_mm256_adds_epu8: 116; CHECK: # %bb.0: 117; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 118; CHECK-NEXT: ret{{[l|q]}} 119 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 120 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 121 %res = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1) 122 %bc = bitcast <32 x i8> %res to <4 x i64> 123 ret <4 x i64> %bc 124} 125declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>) 126 127define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) { 128; CHECK-LABEL: test_mm256_adds_epu16: 129; CHECK: # %bb.0: 130; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 131; CHECK-NEXT: ret{{[l|q]}} 132 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 133 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 134 %res = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1) 135 %bc = bitcast <16 x i16> %res to <4 x i64> 136 ret <4 x i64> %bc 137} 138declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>) 139 140define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) { 141; CHECK-LABEL: test_mm256_alignr_epi8: 142; CHECK: # %bb.0: 143; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17] 144; CHECK-NEXT: ret{{[l|q]}} 145 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 146 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 147 %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49> 148 %res = bitcast <32 x i8> %shuf to <4 x i64> 149 ret <4 x i64> %res 150} 151 152define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) { 153; CHECK-LABEL: test2_mm256_alignr_epi8: 154; CHECK: # %bb.0: 155; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16] 156; CHECK-NEXT: ret{{[l|q]}} 157 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 158 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 159 %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48> 160 %res = bitcast <32 x i8> %shuf to <4 x i64> 161 ret <4 x i64> %res 162} 163 164define <4 x i64> @test_mm256_and_si256(<4 x 
i64> %a0, <4 x i64> %a1) nounwind { 165; CHECK-LABEL: test_mm256_and_si256: 166; CHECK: # %bb.0: 167; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 168; CHECK-NEXT: ret{{[l|q]}} 169 %res = and <4 x i64> %a0, %a1 170 ret <4 x i64> %res 171} 172 173define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { 174; CHECK-LABEL: test_mm256_andnot_si256: 175; CHECK: # %bb.0: 176; CHECK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 177; CHECK-NEXT: vpxor %ymm2, %ymm0, %ymm0 178; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 179; CHECK-NEXT: ret{{[l|q]}} 180 %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1> 181 %res = and <4 x i64> %not, %a1 182 ret <4 x i64> %res 183} 184 185define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind { 186; CHECK-LABEL: test_mm256_avg_epu8: 187; CHECK: # %bb.0: 188; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm0 189; CHECK-NEXT: ret{{[l|q]}} 190 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 191 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 192 %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1) 193 %bc = bitcast <32 x i8> %res to <4 x i64> 194 ret <4 x i64> %bc 195} 196declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone 197 198define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind { 199; CHECK-LABEL: test_mm256_avg_epu16: 200; CHECK: # %bb.0: 201; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm0 202; CHECK-NEXT: ret{{[l|q]}} 203 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 204 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 205 %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1) 206 %bc = bitcast <16 x i16> %res to <4 x i64> 207 ret <4 x i64> %bc 208} 209declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone 210 211define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) { 212; CHECK-LABEL: test_mm256_blend_epi16: 213; CHECK: # %bb.0: 214; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] 215; CHECK-NEXT: ret{{[l|q]}} 216 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 217 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 218 %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 219 %res = bitcast <16 x i16> %shuf to <4 x i64> 220 ret <4 x i64> %res 221} 222 223define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) { 224; CHECK-LABEL: test_mm_blend_epi32: 225; CHECK: # %bb.0: 226; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 227; CHECK-NEXT: ret{{[l|q]}} 228 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 229 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 230 %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 231 %res = bitcast <4 x i32> %shuf to <2 x i64> 232 ret <2 x i64> %res 233} 234 235define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) { 236; CHECK-LABEL: test_mm256_blend_epi32: 237; CHECK: # %bb.0: 238; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7] 239; CHECK-NEXT: ret{{[l|q]}} 240 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 241 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 242 %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7> 243 %res = bitcast <8 x i32> %shuf to <4 x i64> 244 ret <4 x i64> %res 245} 246 247define <4 x i64> 
@test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) { 248; CHECK-LABEL: test_mm256_blendv_epi8: 249; CHECK: # %bb.0: 250; CHECK-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 251; CHECK-NEXT: ret{{[l|q]}} 252 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 253 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 254 %arg2 = bitcast <4 x i64> %a2 to <32 x i8> 255 %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2) 256 %res = bitcast <32 x i8> %call to <4 x i64> 257 ret <4 x i64> %res 258} 259declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone 260 261define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) { 262; CHECK-LABEL: test_mm_broadcastb_epi8: 263; CHECK: # %bb.0: 264; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 265; CHECK-NEXT: ret{{[l|q]}} 266 %arg0 = bitcast <2 x i64> %a0 to <16 x i8> 267 %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer 268 %res = bitcast <16 x i8> %shuf to <2 x i64> 269 ret <2 x i64> %res 270} 271 272define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) { 273; CHECK-LABEL: test_mm256_broadcastb_epi8: 274; CHECK: # %bb.0: 275; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 276; CHECK-NEXT: ret{{[l|q]}} 277 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 278 %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer 279 %res = bitcast <32 x i8> %shuf to <4 x i64> 280 ret <4 x i64> %res 281} 282 283define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) { 284; CHECK-LABEL: test_mm_broadcastd_epi32: 285; CHECK: # %bb.0: 286; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 287; CHECK-NEXT: ret{{[l|q]}} 288 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 289 %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer 290 %res = bitcast <4 x i32> %shuf to <2 x i64> 291 ret <2 x i64> %res 292} 293 294define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) { 295; CHECK-LABEL: test_mm256_broadcastd_epi32: 296; CHECK: # %bb.0: 297; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 298; CHECK-NEXT: ret{{[l|q]}} 299 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 300 %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer 301 %res = bitcast <8 x i32> %shuf to <4 x i64> 302 ret <4 x i64> %res 303} 304 305define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) { 306; CHECK-LABEL: test_mm_broadcastq_epi64: 307; CHECK: # %bb.0: 308; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 309; CHECK-NEXT: ret{{[l|q]}} 310 %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer 311 ret <2 x i64> %res 312} 313 314define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) { 315; CHECK-LABEL: test_mm256_broadcastq_epi64: 316; CHECK: # %bb.0: 317; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 318; CHECK-NEXT: ret{{[l|q]}} 319 %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer 320 ret <4 x i64> %res 321} 322 323define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) { 324; CHECK-LABEL: test_mm_broadcastsd_pd: 325; CHECK: # %bb.0: 326; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 327; CHECK-NEXT: ret{{[l|q]}} 328 %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer 329 ret <2 x double> %res 330} 331 332define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) { 333; CHECK-LABEL: test_mm256_broadcastsd_pd: 334; CHECK: # %bb.0: 335; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 336; CHECK-NEXT: ret{{[l|q]}} 337 %res = shufflevector <4 x double> %a0, <4 x double> 
undef, <4 x i32> zeroinitializer 338 ret <4 x double> %res 339} 340 341define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) { 342; CHECK-LABEL: test_mm256_broadcastsi128_si256: 343; CHECK: # %bb.0: 344; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 345; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 346; CHECK-NEXT: ret{{[l|q]}} 347 %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 348 ret <4 x i64> %res 349} 350 351define <4 x i64> @test_mm256_broadcastsi128_si256_mem(ptr %p0) { 352; X86-LABEL: test_mm256_broadcastsi128_si256_mem: 353; X86: # %bb.0: 354; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 355; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 356; X86-NEXT: retl 357; 358; X64-LABEL: test_mm256_broadcastsi128_si256_mem: 359; X64: # %bb.0: 360; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 361; X64-NEXT: retq 362 %a0 = load <2 x i64>, ptr %p0 363 %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 364 ret <4 x i64> %res 365} 366 367define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) { 368; CHECK-LABEL: test_mm_broadcastss_ps: 369; CHECK: # %bb.0: 370; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 371; CHECK-NEXT: ret{{[l|q]}} 372 %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer 373 ret <4 x float> %res 374} 375 376define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) { 377; CHECK-LABEL: test_mm256_broadcastss_ps: 378; CHECK: # %bb.0: 379; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 380; CHECK-NEXT: ret{{[l|q]}} 381 %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer 382 ret <8 x float> %res 383} 384 385define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) { 386; CHECK-LABEL: test_mm_broadcastw_epi16: 387; CHECK: # %bb.0: 388; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 389; CHECK-NEXT: ret{{[l|q]}} 390 %arg0 = bitcast <2 x i64> %a0 to <8 x i16> 391 %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer 392 %res = bitcast <8 x i16> %shuf to <2 x i64> 393 ret <2 x i64> %res 394} 395 396define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) { 397; CHECK-LABEL: test_mm256_broadcastw_epi16: 398; CHECK: # %bb.0: 399; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 400; CHECK-NEXT: ret{{[l|q]}} 401 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 402 %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer 403 %res = bitcast <16 x i16> %shuf to <4 x i64> 404 ret <4 x i64> %res 405} 406 407define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) { 408; CHECK-LABEL: test_mm256_bslli_epi128: 409; CHECK: # %bb.0: 410; CHECK-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] 411; CHECK-NEXT: ret{{[l|q]}} 412 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 413 %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60> 414 %res = bitcast <32 x i8> %shuf to <4 x i64> 415 ret <4 x i64> %res 416} 417 418define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) { 419; CHECK-LABEL: test_mm256_bsrli_epi128: 420; CHECK: # %bb.0: 421; CHECK-NEXT: vpsrldq {{.*#+}} ymm0 = 
ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero 422; CHECK-NEXT: ret{{[l|q]}} 423 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 424 %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50> 425 %res = bitcast <32 x i8> %shuf to <4 x i64> 426 ret <4 x i64> %res 427} 428 429define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind { 430; CHECK-LABEL: test_mm256_cmpeq_epi8: 431; CHECK: # %bb.0: 432; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 433; CHECK-NEXT: ret{{[l|q]}} 434 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 435 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 436 %cmp = icmp eq <32 x i8> %arg0, %arg1 437 %res = sext <32 x i1> %cmp to <32 x i8> 438 %bc = bitcast <32 x i8> %res to <4 x i64> 439 ret <4 x i64> %bc 440} 441 442define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind { 443; CHECK-LABEL: test_mm256_cmpeq_epi16: 444; CHECK: # %bb.0: 445; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 446; CHECK-NEXT: ret{{[l|q]}} 447 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 448 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 449 %cmp = icmp eq <16 x i16> %arg0, %arg1 450 %res = sext <16 x i1> %cmp to <16 x i16> 451 %bc = bitcast <16 x i16> %res to <4 x i64> 452 ret <4 x i64> %bc 453} 454 455define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 456; CHECK-LABEL: test_mm256_cmpeq_epi32: 457; CHECK: # %bb.0: 458; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 459; CHECK-NEXT: ret{{[l|q]}} 460 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 461 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 462 %cmp = icmp eq <8 x i32> %arg0, %arg1 463 %res = sext <8 x i1> %cmp to <8 x i32> 464 %bc = bitcast <8 x i32> %res to <4 x i64> 465 ret <4 x i64> %bc 466} 467 468define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind { 469; CHECK-LABEL: test_mm256_cmpeq_epi64: 470; CHECK: # %bb.0: 471; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 472; CHECK-NEXT: ret{{[l|q]}} 473 %cmp = icmp eq <4 x i64> %a0, %a1 474 %res = sext <4 x i1> %cmp to <4 x i64> 475 ret <4 x i64> %res 476} 477 478define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind { 479; CHECK-LABEL: test_mm256_cmpgt_epi8: 480; CHECK: # %bb.0: 481; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 482; CHECK-NEXT: ret{{[l|q]}} 483 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 484 %arg1 = bitcast <4 x i64> %a1 to <32 x i8> 485 %cmp = icmp sgt <32 x i8> %arg0, %arg1 486 %res = sext <32 x i1> %cmp to <32 x i8> 487 %bc = bitcast <32 x i8> %res to <4 x i64> 488 ret <4 x i64> %bc 489} 490 491define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind { 492; CHECK-LABEL: test_mm256_cmpgt_epi16: 493; CHECK: # %bb.0: 494; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 495; CHECK-NEXT: ret{{[l|q]}} 496 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 497 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 498 %cmp = icmp sgt <16 x i16> %arg0, %arg1 499 %res = sext <16 x i1> %cmp to <16 x i16> 500 %bc = bitcast <16 x i16> %res to <4 x i64> 501 ret <4 x i64> %bc 502} 503 504define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 505; CHECK-LABEL: test_mm256_cmpgt_epi32: 506; CHECK: # %bb.0: 507; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 508; 
CHECK-NEXT: ret{{[l|q]}} 509 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 510 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 511 %cmp = icmp sgt <8 x i32> %arg0, %arg1 512 %res = sext <8 x i1> %cmp to <8 x i32> 513 %bc = bitcast <8 x i32> %res to <4 x i64> 514 ret <4 x i64> %bc 515} 516 517define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind { 518; CHECK-LABEL: test_mm256_cmpgt_epi64: 519; CHECK: # %bb.0: 520; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 521; CHECK-NEXT: ret{{[l|q]}} 522 %cmp = icmp sgt <4 x i64> %a0, %a1 523 %res = sext <4 x i1> %cmp to <4 x i64> 524 ret <4 x i64> %res 525} 526 527define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) { 528; CHECK-LABEL: test_mm256_cvtepi8_epi16: 529; CHECK: # %bb.0: 530; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0 531; CHECK-NEXT: ret{{[l|q]}} 532 %arg0 = bitcast <2 x i64> %a0 to <16 x i8> 533 %ext = sext <16 x i8> %arg0 to <16 x i16> 534 %res = bitcast <16 x i16> %ext to <4 x i64> 535 ret <4 x i64> %res 536} 537 538define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) { 539; CHECK-LABEL: test_mm256_cvtepi8_epi32: 540; CHECK: # %bb.0: 541; CHECK-NEXT: vpmovsxbd %xmm0, %ymm0 542; CHECK-NEXT: ret{{[l|q]}} 543 %arg0 = bitcast <2 x i64> %a0 to <16 x i8> 544 %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 545 %ext = sext <8 x i8> %shuf to <8 x i32> 546 %res = bitcast <8 x i32> %ext to <4 x i64> 547 ret <4 x i64> %res 548} 549 550define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) { 551; CHECK-LABEL: test_mm256_cvtepi8_epi64: 552; CHECK: # %bb.0: 553; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0 554; CHECK-NEXT: ret{{[l|q]}} 555 %arg0 = bitcast <2 x i64> %a0 to <16 x i8> 556 %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 557 %ext = sext <4 x i8> %shuf to <4 x i64> 558 ret <4 x i64> %ext 559} 560 561define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) { 562; CHECK-LABEL: test_mm256_cvtepi16_epi32: 563; CHECK: # %bb.0: 564; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0 565; CHECK-NEXT: ret{{[l|q]}} 566 %arg0 = bitcast <2 x i64> %a0 to <8 x i16> 567 %ext = sext <8 x i16> %arg0 to <8 x i32> 568 %res = bitcast <8 x i32> %ext to <4 x i64> 569 ret <4 x i64> %res 570} 571 572define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) { 573; CHECK-LABEL: test_mm256_cvtepi16_epi64: 574; CHECK: # %bb.0: 575; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0 576; CHECK-NEXT: ret{{[l|q]}} 577 %arg0 = bitcast <2 x i64> %a0 to <8 x i16> 578 %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 579 %ext = sext <4 x i16> %shuf to <4 x i64> 580 ret <4 x i64> %ext 581} 582 583define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) { 584; CHECK-LABEL: test_mm256_cvtepi32_epi64: 585; CHECK: # %bb.0: 586; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0 587; CHECK-NEXT: ret{{[l|q]}} 588 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 589 %ext = sext <4 x i32> %arg0 to <4 x i64> 590 ret <4 x i64> %ext 591} 592 593define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) { 594; CHECK-LABEL: test_mm256_cvtepu8_epi16: 595; CHECK: # %bb.0: 596; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 597; CHECK-NEXT: ret{{[l|q]}} 598 %arg0 = bitcast <2 x i64> %a0 to <16 x i8> 599 %ext = zext <16 x i8> %arg0 to <16 x i16> 600 %res = bitcast <16 
x i16> %ext to <4 x i64> 601 ret <4 x i64> %res 602} 603 604define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) { 605; CHECK-LABEL: test_mm256_cvtepu8_epi32: 606; CHECK: # %bb.0: 607; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 608; CHECK-NEXT: ret{{[l|q]}} 609 %arg0 = bitcast <2 x i64> %a0 to <16 x i8> 610 %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 611 %ext = zext <8 x i8> %shuf to <8 x i32> 612 %res = bitcast <8 x i32> %ext to <4 x i64> 613 ret <4 x i64> %res 614} 615 616define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) { 617; CHECK-LABEL: test_mm256_cvtepu8_epi64: 618; CHECK: # %bb.0: 619; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero 620; CHECK-NEXT: ret{{[l|q]}} 621 %arg0 = bitcast <2 x i64> %a0 to <16 x i8> 622 %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 623 %ext = zext <4 x i8> %shuf to <4 x i64> 624 ret <4 x i64> %ext 625} 626 627define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) { 628; CHECK-LABEL: test_mm256_cvtepu16_epi32: 629; CHECK: # %bb.0: 630; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 631; CHECK-NEXT: ret{{[l|q]}} 632 %arg0 = bitcast <2 x i64> %a0 to <8 x i16> 633 %ext = zext <8 x i16> %arg0 to <8 x i32> 634 %res = bitcast <8 x i32> %ext to <4 x i64> 635 ret <4 x i64> %res 636} 637 638define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) { 639; CHECK-LABEL: test_mm256_cvtepu16_epi64: 640; CHECK: # %bb.0: 641; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 642; CHECK-NEXT: ret{{[l|q]}} 643 %arg0 = bitcast <2 x i64> %a0 to <8 x i16> 644 %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 645 %ext = zext <4 x i16> %shuf to <4 x i64> 646 ret <4 x i64> %ext 647} 648 649define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) { 650; CHECK-LABEL: test_mm256_cvtepu32_epi64: 651; CHECK: # %bb.0: 652; CHECK-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 653; CHECK-NEXT: ret{{[l|q]}} 654 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 655 %ext = zext <4 x i32> %arg0 to <4 x i64> 656 ret <4 x i64> %ext 657} 658 659define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind { 660; CHECK-LABEL: test_mm256_extracti128_si256: 661; CHECK: # %bb.0: 662; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 663; CHECK-NEXT: vzeroupper 664; CHECK-NEXT: ret{{[l|q]}} 665 %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3> 666 ret <2 x i64> %res 667} 668 669define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) { 670; CHECK-LABEL: test_mm256_hadd_epi16: 671; CHECK: # %bb.0: 672; CHECK-NEXT: vphaddw %ymm1, %ymm0, %ymm0 673; CHECK-NEXT: ret{{[l|q]}} 674 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 675 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 676 %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1) 677 %bc = bitcast <16 x i16> %res to <4 x i64> 678 ret <4 x i64> %bc 679} 680declare <16 
x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone 681 682define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) { 683; CHECK-LABEL: test_mm256_hadd_epi32: 684; CHECK: # %bb.0: 685; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0 686; CHECK-NEXT: ret{{[l|q]}} 687 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 688 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 689 %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1) 690 %bc = bitcast <8 x i32> %res to <4 x i64> 691 ret <4 x i64> %bc 692} 693declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone 694 695define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) { 696; CHECK-LABEL: test_mm256_hadds_epi16: 697; CHECK: # %bb.0: 698; CHECK-NEXT: vphaddsw %ymm1, %ymm0, %ymm0 699; CHECK-NEXT: ret{{[l|q]}} 700 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 701 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 702 %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1) 703 %bc = bitcast <16 x i16> %res to <4 x i64> 704 ret <4 x i64> %bc 705} 706declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone 707 708define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) { 709; CHECK-LABEL: test_mm256_hsub_epi16: 710; CHECK: # %bb.0: 711; CHECK-NEXT: vphsubw %ymm1, %ymm0, %ymm0 712; CHECK-NEXT: ret{{[l|q]}} 713 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 714 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 715 %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1) 716 %bc = bitcast <16 x i16> %res to <4 x i64> 717 ret <4 x i64> %bc 718} 719declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone 720 721define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) { 722; CHECK-LABEL: test_mm256_hsub_epi32: 723; CHECK: # %bb.0: 724; CHECK-NEXT: vphsubd %ymm1, %ymm0, %ymm0 725; CHECK-NEXT: ret{{[l|q]}} 726 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 727 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 728 %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1) 729 %bc = bitcast <8 x i32> %res to <4 x i64> 730 ret <4 x i64> %bc 731} 732declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone 733 734define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) { 735; CHECK-LABEL: test_mm256_hsubs_epi16: 736; CHECK: # %bb.0: 737; CHECK-NEXT: vphsubsw %ymm1, %ymm0, %ymm0 738; CHECK-NEXT: ret{{[l|q]}} 739 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 740 %arg1 = bitcast <4 x i64> %a1 to <16 x i16> 741 %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1) 742 %bc = bitcast <16 x i16> %res to <4 x i64> 743 ret <4 x i64> %bc 744} 745declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone 746 747define <2 x i64> @test_mm_i32gather_epi32(ptr%a0, <2 x i64> %a1) { 748; X86-LABEL: test_mm_i32gather_epi32: 749; X86: # %bb.0: 750; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 751; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 752; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 753; X86-NEXT: vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1 754; X86-NEXT: vmovdqa %xmm1, %xmm0 755; X86-NEXT: retl 756; 757; X64-LABEL: test_mm_i32gather_epi32: 758; X64: # %bb.0: 759; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 760; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 761; X64-NEXT: vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1 762; X64-NEXT: vmovdqa %xmm1, %xmm0 763; X64-NEXT: retq 764 %arg0 = bitcast ptr%a0 to ptr 765 %arg1 = bitcast <2 x 
i64> %a1 to <4 x i32> 766 %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32> 767 %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, ptr %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2) 768 %bc = bitcast <4 x i32> %call to <2 x i64> 769 ret <2 x i64> %bc 770} 771declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, ptr, <4 x i32>, <4 x i32>, i8) nounwind readonly 772 773define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, ptr%a1, <2 x i64> %a2, <2 x i64> %a3) { 774; X86-LABEL: test_mm_mask_i32gather_epi32: 775; X86: # %bb.0: 776; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 777; X86-NEXT: vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0 778; X86-NEXT: retl 779; 780; X64-LABEL: test_mm_mask_i32gather_epi32: 781; X64: # %bb.0: 782; X64-NEXT: vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 783; X64-NEXT: retq 784 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 785 %arg1 = bitcast ptr%a1 to ptr 786 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 787 %arg3 = bitcast <2 x i64> %a3 to <4 x i32> 788 %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, ptr %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2) 789 %bc = bitcast <4 x i32> %call to <2 x i64> 790 ret <2 x i64> %bc 791} 792 793define <4 x i64> @test_mm256_i32gather_epi32(ptr%a0, <4 x i64> %a1) { 794; X86-LABEL: test_mm256_i32gather_epi32: 795; X86: # %bb.0: 796; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 797; X86-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 798; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 799; X86-NEXT: vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1 800; X86-NEXT: vmovdqa %ymm1, %ymm0 801; X86-NEXT: retl 802; 803; X64-LABEL: test_mm256_i32gather_epi32: 804; X64: # %bb.0: 805; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 806; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 807; X64-NEXT: vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1 808; X64-NEXT: vmovdqa %ymm1, %ymm0 809; X64-NEXT: retq 810 %arg0 = bitcast ptr%a0 to ptr 811 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 812 %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32> 813 %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, ptr %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2) 814 %bc = bitcast <8 x i32> %call to <4 x i64> 815 ret <4 x i64> %bc 816} 817declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, ptr, <8 x i32>, <8 x i32>, i8) nounwind readonly 818 819define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, ptr%a1, <4 x i64> %a2, <4 x i64> %a3) { 820; X86-LABEL: test_mm256_mask_i32gather_epi32: 821; X86: # %bb.0: 822; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 823; X86-NEXT: vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0 824; X86-NEXT: retl 825; 826; X64-LABEL: test_mm256_mask_i32gather_epi32: 827; X64: # %bb.0: 828; X64-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 829; X64-NEXT: retq 830 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 831 %arg1 = bitcast ptr%a1 to ptr 832 %arg2 = bitcast <4 x i64> %a2 to <8 x i32> 833 %arg3 = bitcast <4 x i64> %a3 to <8 x i32> 834 %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, ptr %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2) 835 %bc = bitcast <8 x i32> %call to <4 x i64> 836 ret <4 x i64> %bc 837} 838 839define <2 x i64> @test_mm_i32gather_epi64(ptr%a0, <2 x i64> %a1) { 840; X86-LABEL: test_mm_i32gather_epi64: 841; X86: # %bb.0: 842; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 843; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 844; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 845; X86-NEXT: vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1 846; X86-NEXT: vmovdqa %xmm1, %xmm0 847; X86-NEXT: retl 848; 849; X64-LABEL: test_mm_i32gather_epi64: 850; X64: # 
%bb.0: 851; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 852; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 853; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1 854; X64-NEXT: vmovdqa %xmm1, %xmm0 855; X64-NEXT: retq 856 %arg0 = bitcast ptr%a0 to ptr 857 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 858 %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, ptr %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2) 859 ret <2 x i64> %res 860} 861declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, ptr, <4 x i32>, <2 x i64>, i8) nounwind readonly 862 863define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, ptr%a1, <2 x i64> %a2, <2 x i64> %a3) { 864; X86-LABEL: test_mm_mask_i32gather_epi64: 865; X86: # %bb.0: 866; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 867; X86-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0 868; X86-NEXT: retl 869; 870; X64-LABEL: test_mm_mask_i32gather_epi64: 871; X64: # %bb.0: 872; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 873; X64-NEXT: retq 874 %arg1 = bitcast ptr%a1 to ptr 875 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 876 %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, ptr %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2) 877 ret <2 x i64> %res 878} 879 880define <4 x i64> @test_mm256_i32gather_epi64(ptr%a0, <2 x i64> %a1) { 881; X86-LABEL: test_mm256_i32gather_epi64: 882; X86: # %bb.0: 883; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 884; X86-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 885; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 886; X86-NEXT: vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1 887; X86-NEXT: vmovdqa %ymm1, %ymm0 888; X86-NEXT: retl 889; 890; X64-LABEL: test_mm256_i32gather_epi64: 891; X64: # %bb.0: 892; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 893; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 894; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1 895; X64-NEXT: vmovdqa %ymm1, %ymm0 896; X64-NEXT: retq 897 %arg0 = bitcast ptr%a0 to ptr 898 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 899 %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, ptr %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2) 900 ret <4 x i64> %res 901} 902declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, ptr, <4 x i32>, <4 x i64>, i8) nounwind readonly 903 904define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, ptr%a1, <2 x i64> %a2, <4 x i64> %a3) { 905; X86-LABEL: test_mm256_mask_i32gather_epi64: 906; X86: # %bb.0: 907; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 908; X86-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0 909; X86-NEXT: retl 910; 911; X64-LABEL: test_mm256_mask_i32gather_epi64: 912; X64: # %bb.0: 913; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 914; X64-NEXT: retq 915 %arg1 = bitcast ptr%a1 to ptr 916 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 917 %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, ptr %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2) 918 ret <4 x i64> %res 919} 920 921define <2 x double> @test_mm_i32gather_pd(ptr%a0, <2 x i64> %a1) { 922; X86-LABEL: test_mm_i32gather_pd: 923; X86: # %bb.0: 924; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 925; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 926; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 927; X86-NEXT: vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1 928; X86-NEXT: vmovapd %xmm1, %xmm0 929; X86-NEXT: retl 930; 931; X64-LABEL: test_mm_i32gather_pd: 932; X64: # %bb.0: 933; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 934; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 935; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1 936; X64-NEXT: vmovapd %xmm1, %xmm0 937; X64-NEXT: retq 938 %arg0 = bitcast 
ptr%a0 to ptr 939 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 940 %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer 941 %sext = sext <2 x i1> %cmp to <2 x i64> 942 %mask = bitcast <2 x i64> %sext to <2 x double> 943 %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, ptr %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2) 944 ret <2 x double> %res 945} 946declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, ptr, <4 x i32>, <2 x double>, i8) nounwind readonly 947 948define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, ptr%a1, <2 x i64> %a2, <2 x double> %a3) { 949; X86-LABEL: test_mm_mask_i32gather_pd: 950; X86: # %bb.0: 951; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 952; X86-NEXT: vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0 953; X86-NEXT: retl 954; 955; X64-LABEL: test_mm_mask_i32gather_pd: 956; X64: # %bb.0: 957; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 958; X64-NEXT: retq 959 %arg1 = bitcast ptr%a1 to ptr 960 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 961 %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, ptr %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2) 962 ret <2 x double> %res 963} 964 965define <4 x double> @test_mm256_i32gather_pd(ptr%a0, <2 x i64> %a1) { 966; X86-LABEL: test_mm256_i32gather_pd: 967; X86: # %bb.0: 968; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 969; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 970; X86-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2 971; X86-NEXT: vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1 972; X86-NEXT: vmovapd %ymm1, %ymm0 973; X86-NEXT: retl 974; 975; X64-LABEL: test_mm256_i32gather_pd: 976; X64: # %bb.0: 977; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 978; X64-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2 979; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1 980; X64-NEXT: vmovapd %ymm1, %ymm0 981; X64-NEXT: retq 982 %arg0 = bitcast ptr%a0 to ptr 983 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 984 %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0) 985 %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, ptr %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2) 986 ret <4 x double> %res 987} 988declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, ptr, <4 x i32>, <4 x double>, i8) nounwind readonly 989 990define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, ptr%a1, <2 x i64> %a2, <4 x double> %a3) { 991; X86-LABEL: test_mm256_mask_i32gather_pd: 992; X86: # %bb.0: 993; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 994; X86-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0 995; X86-NEXT: retl 996; 997; X64-LABEL: test_mm256_mask_i32gather_pd: 998; X64: # %bb.0: 999; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0 1000; X64-NEXT: retq 1001 %arg1 = bitcast ptr%a1 to ptr 1002 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 1003 %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, ptr %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2) 1004 ret <4 x double> %res 1005} 1006 1007define <4 x float> @test_mm_i32gather_ps(ptr%a0, <2 x i64> %a1) { 1008; X86-LABEL: test_mm_i32gather_ps: 1009; X86: # %bb.0: 1010; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1011; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1012; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 1013; X86-NEXT: vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1 1014; X86-NEXT: vmovaps %xmm1, %xmm0 1015; X86-NEXT: retl 1016; 1017; X64-LABEL: test_mm_i32gather_ps: 1018; X64: # %bb.0: 1019; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1020; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 1021; X64-NEXT: 
vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1 1022; X64-NEXT: vmovaps %xmm1, %xmm0 1023; X64-NEXT: retq 1024 %arg0 = bitcast ptr%a0 to ptr 1025 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 1026 %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer 1027 %sext = sext <4 x i1> %cmp to <4 x i32> 1028 %mask = bitcast <4 x i32> %sext to <4 x float> 1029 %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, ptr %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2) 1030 ret <4 x float> %call 1031} 1032declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, ptr, <4 x i32>, <4 x float>, i8) nounwind readonly 1033 1034define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, ptr%a1, <2 x i64> %a2, <4 x float> %a3) { 1035; X86-LABEL: test_mm_mask_i32gather_ps: 1036; X86: # %bb.0: 1037; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1038; X86-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0 1039; X86-NEXT: retl 1040; 1041; X64-LABEL: test_mm_mask_i32gather_ps: 1042; X64: # %bb.0: 1043; X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 1044; X64-NEXT: retq 1045 %arg1 = bitcast ptr%a1 to ptr 1046 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 1047 %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, ptr %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2) 1048 ret <4 x float> %call 1049} 1050 1051define <8 x float> @test_mm256_i32gather_ps(ptr%a0, <4 x i64> %a1) { 1052; X86-LABEL: test_mm256_i32gather_ps: 1053; X86: # %bb.0: 1054; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1055; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 1056; X86-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2 1057; X86-NEXT: vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1 1058; X86-NEXT: vmovaps %ymm1, %ymm0 1059; X86-NEXT: retl 1060; 1061; X64-LABEL: test_mm256_i32gather_ps: 1062; X64: # %bb.0: 1063; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 1064; X64-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2 1065; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1 1066; X64-NEXT: vmovaps %ymm1, %ymm0 1067; X64-NEXT: retq 1068 %arg0 = bitcast ptr%a0 to ptr 1069 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 1070 %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0) 1071 %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, ptr %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2) 1072 ret <8 x float> %call 1073} 1074declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, ptr, <8 x i32>, <8 x float>, i8) nounwind readonly 1075 1076define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, ptr%a1, <4 x i64> %a2, <8 x float> %a3) { 1077; X86-LABEL: test_mm256_mask_i32gather_ps: 1078; X86: # %bb.0: 1079; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1080; X86-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0 1081; X86-NEXT: retl 1082; 1083; X64-LABEL: test_mm256_mask_i32gather_ps: 1084; X64: # %bb.0: 1085; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0 1086; X64-NEXT: retq 1087 %arg1 = bitcast ptr%a1 to ptr 1088 %arg2 = bitcast <4 x i64> %a2 to <8 x i32> 1089 %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, ptr %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2) 1090 ret <8 x float> %call 1091} 1092 1093define <2 x i64> @test_mm_i64gather_epi32(ptr%a0, <2 x i64> %a1) { 1094; X86-LABEL: test_mm_i64gather_epi32: 1095; X86: # %bb.0: 1096; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1097; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1098; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 1099; X86-NEXT: vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1 1100; X86-NEXT: vmovdqa %xmm1, %xmm0 1101; X86-NEXT: retl 1102; 1103; 
X64-LABEL: test_mm_i64gather_epi32: 1104; X64: # %bb.0: 1105; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1106; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 1107; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1 1108; X64-NEXT: vmovdqa %xmm1, %xmm0 1109; X64-NEXT: retq 1110 %arg0 = bitcast ptr%a0 to ptr 1111 %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32> 1112 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, ptr %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2) 1113 %bc = bitcast <4 x i32> %call to <2 x i64> 1114 ret <2 x i64> %bc 1115} 1116declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, ptr, <2 x i64>, <4 x i32>, i8) nounwind readonly 1117 1118define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, ptr%a1, <2 x i64> %a2, <2 x i64> %a3) { 1119; X86-LABEL: test_mm_mask_i64gather_epi32: 1120; X86: # %bb.0: 1121; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1122; X86-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0 1123; X86-NEXT: retl 1124; 1125; X64-LABEL: test_mm_mask_i64gather_epi32: 1126; X64: # %bb.0: 1127; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 1128; X64-NEXT: retq 1129 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 1130 %arg1 = bitcast ptr%a1 to ptr 1131 %arg3 = bitcast <2 x i64> %a3 to <4 x i32> 1132 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, ptr %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2) 1133 %bc = bitcast <4 x i32> %call to <2 x i64> 1134 ret <2 x i64> %bc 1135} 1136 1137define <2 x i64> @test_mm256_i64gather_epi32(ptr%a0, <4 x i64> %a1) { 1138; X86-LABEL: test_mm256_i64gather_epi32: 1139; X86: # %bb.0: 1140; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1141; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1142; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 1143; X86-NEXT: vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1 1144; X86-NEXT: vmovdqa %xmm1, %xmm0 1145; X86-NEXT: vzeroupper 1146; X86-NEXT: retl 1147; 1148; X64-LABEL: test_mm256_i64gather_epi32: 1149; X64: # %bb.0: 1150; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1151; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 1152; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1 1153; X64-NEXT: vmovdqa %xmm1, %xmm0 1154; X64-NEXT: vzeroupper 1155; X64-NEXT: retq 1156 %arg0 = bitcast ptr%a0 to ptr 1157 %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32> 1158 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, ptr %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2) 1159 %bc = bitcast <4 x i32> %call to <2 x i64> 1160 ret <2 x i64> %bc 1161} 1162declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, ptr, <4 x i64>, <4 x i32>, i8) nounwind readonly 1163 1164define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, ptr%a1, <4 x i64> %a2, <2 x i64> %a3) { 1165; X86-LABEL: test_mm256_mask_i64gather_epi32: 1166; X86: # %bb.0: 1167; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1168; X86-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 1169; X86-NEXT: vzeroupper 1170; X86-NEXT: retl 1171; 1172; X64-LABEL: test_mm256_mask_i64gather_epi32: 1173; X64: # %bb.0: 1174; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 1175; X64-NEXT: vzeroupper 1176; X64-NEXT: retq 1177 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 1178 %arg1 = bitcast ptr%a1 to ptr 1179 %arg3 = bitcast <2 x i64> %a3 to <4 x i32> 1180 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, ptr %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2) 1181 %bc = bitcast <4 x i32> %call to <2 x i64> 1182 ret <2 x i64> %bc 1183} 1184 1185define <2 x i64> @test_mm_i64gather_epi64(ptr%a0, <2 x i64> %a1) { 1186; X86-LABEL: test_mm_i64gather_epi64: 1187; X86: # %bb.0: 1188; 
X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1189; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1190; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 1191; X86-NEXT: vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1 1192; X86-NEXT: vmovdqa %xmm1, %xmm0 1193; X86-NEXT: retl 1194; 1195; X64-LABEL: test_mm_i64gather_epi64: 1196; X64: # %bb.0: 1197; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1198; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 1199; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1 1200; X64-NEXT: vmovdqa %xmm1, %xmm0 1201; X64-NEXT: retq 1202 %arg0 = bitcast ptr%a0 to ptr 1203 %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, ptr %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2) 1204 ret <2 x i64> %call 1205} 1206declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, ptr, <2 x i64>, <2 x i64>, i8) nounwind readonly 1207 1208define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, ptr%a1, <2 x i64> %a2, <2 x i64> %a3) { 1209; X86-LABEL: test_mm_mask_i64gather_epi64: 1210; X86: # %bb.0: 1211; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1212; X86-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0 1213; X86-NEXT: retl 1214; 1215; X64-LABEL: test_mm_mask_i64gather_epi64: 1216; X64: # %bb.0: 1217; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 1218; X64-NEXT: retq 1219 %arg1 = bitcast ptr%a1 to ptr 1220 %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, ptr %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2) 1221 ret <2 x i64> %call 1222} 1223 1224define <4 x i64> @test_mm256_i64gather_epi64(ptr%a0, <4 x i64> %a1) { 1225; X86-LABEL: test_mm256_i64gather_epi64: 1226; X86: # %bb.0: 1227; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1228; X86-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 1229; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 1230; X86-NEXT: vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1 1231; X86-NEXT: vmovdqa %ymm1, %ymm0 1232; X86-NEXT: retl 1233; 1234; X64-LABEL: test_mm256_i64gather_epi64: 1235; X64: # %bb.0: 1236; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 1237; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 1238; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1 1239; X64-NEXT: vmovdqa %ymm1, %ymm0 1240; X64-NEXT: retq 1241 %arg0 = bitcast ptr%a0 to ptr 1242 %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, ptr %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2) 1243 ret <4 x i64> %call 1244} 1245declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, ptr, <4 x i64>, <4 x i64>, i8) nounwind readonly 1246 1247define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, ptr%a1, <4 x i64> %a2, <4 x i64> %a3) { 1248; X86-LABEL: test_mm256_mask_i64gather_epi64: 1249; X86: # %bb.0: 1250; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1251; X86-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0 1252; X86-NEXT: retl 1253; 1254; X64-LABEL: test_mm256_mask_i64gather_epi64: 1255; X64: # %bb.0: 1256; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 1257; X64-NEXT: retq 1258 %arg1 = bitcast ptr%a1 to ptr 1259 %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, ptr %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2) 1260 ret <4 x i64> %call 1261} 1262 1263define <2 x double> @test_mm_i64gather_pd(ptr%a0, <2 x i64> %a1) { 1264; X86-LABEL: test_mm_i64gather_pd: 1265; X86: # %bb.0: 1266; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1267; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1268; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1269; X86-NEXT: vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1 1270; X86-NEXT: vmovapd %xmm1, %xmm0 1271; X86-NEXT: retl 1272; 1273; X64-LABEL: test_mm_i64gather_pd: 1274; X64: # %bb.0: 1275; X64-NEXT: vpcmpeqd 
%xmm2, %xmm2, %xmm2 1276; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1277; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1 1278; X64-NEXT: vmovapd %xmm1, %xmm0 1279; X64-NEXT: retq 1280 %arg0 = bitcast ptr%a0 to ptr 1281 %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer 1282 %sext = sext <2 x i1> %cmp to <2 x i64> 1283 %mask = bitcast <2 x i64> %sext to <2 x double> 1284 %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, ptr %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2) 1285 ret <2 x double> %call 1286} 1287declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, ptr, <2 x i64>, <2 x double>, i8) nounwind readonly 1288 1289define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, ptr%a1, <2 x i64> %a2, <2 x double> %a3) { 1290; X86-LABEL: test_mm_mask_i64gather_pd: 1291; X86: # %bb.0: 1292; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1293; X86-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0 1294; X86-NEXT: retl 1295; 1296; X64-LABEL: test_mm_mask_i64gather_pd: 1297; X64: # %bb.0: 1298; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 1299; X64-NEXT: retq 1300 %arg1 = bitcast ptr%a1 to ptr 1301 %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, ptr %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2) 1302 ret <2 x double> %call 1303} 1304 1305define <4 x double> @test_mm256_i64gather_pd(ptr%a0, <4 x i64> %a1) { 1306; X86-LABEL: test_mm256_i64gather_pd: 1307; X86: # %bb.0: 1308; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1309; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1310; X86-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2 1311; X86-NEXT: vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1 1312; X86-NEXT: vmovapd %ymm1, %ymm0 1313; X86-NEXT: retl 1314; 1315; X64-LABEL: test_mm256_i64gather_pd: 1316; X64: # %bb.0: 1317; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1318; X64-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2 1319; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1 1320; X64-NEXT: vmovapd %ymm1, %ymm0 1321; X64-NEXT: retq 1322 %arg0 = bitcast ptr%a0 to ptr 1323 %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0) 1324 %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, ptr %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2) 1325 ret <4 x double> %call 1326} 1327declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, ptr, <4 x i64>, <4 x double>, i8) nounwind readonly 1328 1329define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, ptr%a1, <4 x i64> %a2, <4 x double> %a3) { 1330; X86-LABEL: test_mm256_mask_i64gather_pd: 1331; X86: # %bb.0: 1332; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1333; X86-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0 1334; X86-NEXT: retl 1335; 1336; X64-LABEL: test_mm256_mask_i64gather_pd: 1337; X64: # %bb.0: 1338; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0 1339; X64-NEXT: retq 1340 %arg1 = bitcast ptr%a1 to ptr 1341 %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, ptr %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2) 1342 ret <4 x double> %call 1343} 1344 1345define <4 x float> @test_mm_i64gather_ps(ptr%a0, <2 x i64> %a1) { 1346; X86-LABEL: test_mm_i64gather_ps: 1347; X86: # %bb.0: 1348; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1349; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1350; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 1351; X86-NEXT: vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1 1352; X86-NEXT: vmovaps %xmm1, %xmm0 1353; X86-NEXT: retl 1354; 1355; X64-LABEL: test_mm_i64gather_ps: 1356; X64: # %bb.0: 1357; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 
1358; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 1359; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1 1360; X64-NEXT: vmovaps %xmm1, %xmm0 1361; X64-NEXT: retq 1362 %arg0 = bitcast ptr%a0 to ptr 1363 %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer 1364 %sext = sext <4 x i1> %cmp to <4 x i32> 1365 %mask = bitcast <4 x i32> %sext to <4 x float> 1366 %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, ptr %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2) 1367 ret <4 x float> %call 1368} 1369declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, ptr, <2 x i64>, <4 x float>, i8) nounwind readonly 1370 1371define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, ptr%a1, <2 x i64> %a2, <4 x float> %a3) { 1372; X86-LABEL: test_mm_mask_i64gather_ps: 1373; X86: # %bb.0: 1374; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1375; X86-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0 1376; X86-NEXT: retl 1377; 1378; X64-LABEL: test_mm_mask_i64gather_ps: 1379; X64: # %bb.0: 1380; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 1381; X64-NEXT: retq 1382 %arg1 = bitcast ptr%a1 to ptr 1383 %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, ptr %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2) 1384 ret <4 x float> %call 1385} 1386 1387define <4 x float> @test_mm256_i64gather_ps(ptr%a0, <4 x i64> %a1) { 1388; X86-LABEL: test_mm256_i64gather_ps: 1389; X86: # %bb.0: 1390; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1391; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1392; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 1393; X86-NEXT: vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1 1394; X86-NEXT: vmovaps %xmm1, %xmm0 1395; X86-NEXT: vzeroupper 1396; X86-NEXT: retl 1397; 1398; X64-LABEL: test_mm256_i64gather_ps: 1399; X64: # %bb.0: 1400; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1401; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 1402; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1 1403; X64-NEXT: vmovaps %xmm1, %xmm0 1404; X64-NEXT: vzeroupper 1405; X64-NEXT: retq 1406 %arg0 = bitcast ptr%a0 to ptr 1407 %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer 1408 %sext = sext <4 x i1> %cmp to <4 x i32> 1409 %mask = bitcast <4 x i32> %sext to <4 x float> 1410 %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, ptr %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2) 1411 ret <4 x float> %call 1412} 1413declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, ptr, <4 x i64>, <4 x float>, i8) nounwind readonly 1414 1415define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, ptr%a1, <4 x i64> %a2, <4 x float> %a3) { 1416; X86-LABEL: test_mm256_mask_i64gather_ps: 1417; X86: # %bb.0: 1418; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1419; X86-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 1420; X86-NEXT: vzeroupper 1421; X86-NEXT: retl 1422; 1423; X64-LABEL: test_mm256_mask_i64gather_ps: 1424; X64: # %bb.0: 1425; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0 1426; X64-NEXT: vzeroupper 1427; X64-NEXT: retq 1428 %arg1 = bitcast ptr%a1 to ptr 1429 %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, ptr %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2) 1430 ret <4 x float> %call 1431} 1432 1433define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind { 1434; CHECK-LABEL: test0_mm256_inserti128_si256: 1435; CHECK: # %bb.0: 1436; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 1437; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 1438; CHECK-NEXT: ret{{[l|q]}} 1439 %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x 
define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test0_mm256_inserti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i64> %res
}

define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test1_mm256_inserti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_madd_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_maddubs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone

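; Masked loads/stores: vpmaskmovd/vpmaskmovq only transfer the lanes whose
; mask element has its sign bit set; masked-off lanes read as zero on loads
; and are left untouched in memory on stores.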
define <2 x i64> @test_mm_maskload_epi32(ptr %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr %a0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.maskload.d(ptr, <4 x i32>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi32(ptr %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr %a0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr, <8 x i32>) nounwind readonly

define <2 x i64> @test_mm_maskload_epi64(ptr %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %res = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.maskload.q(ptr, <2 x i64>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi64(ptr %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr, <4 x i64>) nounwind readonly

define void @test_mm_maskstore_epi32(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  call void @llvm.x86.avx2.maskstore.d(ptr %a0, <4 x i32> %arg1, <4 x i32> %arg2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d(ptr, <4 x i32>, <4 x i32>) nounwind readnone

define void @test_mm256_maskstore_epi32(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  call void @llvm.x86.avx2.maskstore.d.256(ptr %a0, <8 x i32> %arg1, <8 x i32> %arg2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d.256(ptr, <8 x i32>, <8 x i32>) nounwind readnone

define void @test_mm_maskstore_epi64(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  call void @llvm.x86.avx2.maskstore.q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q(ptr, <2 x i64>, <2 x i64>) nounwind readnone

define void @test_mm256_maskstore_epi64(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  call void @llvm.x86.avx2.maskstore.q.256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q.256(ptr, <4 x i64>, <4 x i64>) nounwind readnone

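; The integer min/max tests use the generic llvm.smax/smin/umax/umin
; intrinsics, which lower directly to the AVX2 vpmax*/vpmin* instructions.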
define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %sel = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.smax.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %sel = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %sel = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)

define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %sel = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.umax.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %sel = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %sel = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>)

define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %sel = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.smin.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %sel = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %sel = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)

define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminub %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %sel = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %sel = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %sel = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>)

define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_movemask_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovmskb %ymm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
  ret i32 %res
}
declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mpsadbw_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
  %bc = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone

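; _mm256_mul_epi32 is expressed without an x86-specific intrinsic: shifting
; each 64-bit lane left by 32 and arithmetic-shifting it back sign-extends
; the low 32 bits, and the backend matches the widening multiply to vpmuldq.
; The unsigned variant below masks with 0xFFFFFFFF instead and becomes
; vpmuludq.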
define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
  %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
  %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
  %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
  %res = mul nsw <4 x i64> %A1, %B1
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %res = mul nuw <4 x i64> %A, %B
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhrs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = mul <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = mul <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_or_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = or <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone

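; A permute2x128 with an immediate selector is just a constant shufflevector;
; the mask <2,3,6,7> selects the high 128-bit halves of both sources.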
define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permute2x128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly

define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly

define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sad_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shuffle_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_shuffle_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflehi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflelo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone

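; _mm256_slli_si256 shifts each 128-bit lane left by whole bytes; in IR the
; three filler bytes of each lane are taken from a zero vector, and the
; shuffle lowers to vpslldq.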
define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

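; The sllv/srav/srlv tests cover the AVX2 per-lane variable shifts (vpsllv*,
; vpsravd, vpsrlv*), where each element is shifted by the corresponding
; element of the second operand.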
define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone

define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srav_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srav_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrld $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone

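; As with the byte-wise left shift above, _mm256_srli_si256 shifts each
; 128-bit lane right, pulling the vacated bytes from a zero vector; the
; shuffle lowers to vpsrldq.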
define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

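; _mm256_stream_load_si256 maps to vmovntdqa, the aligned non-temporal load.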
define <4 x i64> @test_mm256_stream_load_si256(ptr %a0) {
; X86-LABEL: test_mm256_stream_load_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovntdqa (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_load_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovntdqa (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %res = call <4 x i64> @llvm.x86.avx2.movntdqa(ptr %arg0)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.movntdqa(ptr) nounwind readonly

define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = sub <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = sub <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = sub <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = sub <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = xor <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone