1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2 3; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 4; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 5; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2 6; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F 7; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ 8; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW 9; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx512f,avx512bw,avx512dq,avx512vl -verify-machineinstrs | FileCheck %s --check-prefixes=X86-AVX512 10 11; 12; vXf64 13; 14 15define <1 x double> @load_v1f64_v1i64(<1 x i64> %trigger, ptr %addr, <1 x double> %dst) { 16; SSE-LABEL: load_v1f64_v1i64: 17; SSE: ## %bb.0: 18; SSE-NEXT: testq %rdi, %rdi 19; SSE-NEXT: jne LBB0_2 20; SSE-NEXT: ## %bb.1: ## %cond.load 21; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 22; SSE-NEXT: LBB0_2: ## %else 23; SSE-NEXT: retq 24; 25; AVX-LABEL: load_v1f64_v1i64: 26; AVX: ## %bb.0: 27; AVX-NEXT: testq %rdi, %rdi 28; AVX-NEXT: jne LBB0_2 29; AVX-NEXT: ## %bb.1: ## %cond.load 30; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 31; AVX-NEXT: LBB0_2: ## %else 32; AVX-NEXT: retq 33; 34; X86-AVX512-LABEL: load_v1f64_v1i64: 35; X86-AVX512: ## %bb.0: 36; X86-AVX512-NEXT: subl $12, %esp 37; X86-AVX512-NEXT: .cfi_def_cfa_offset 16 38; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 39; X86-AVX512-NEXT: orl 
{{[0-9]+}}(%esp), %eax 40; X86-AVX512-NEXT: jne LBB0_1 41; X86-AVX512-NEXT: ## %bb.2: ## %cond.load 42; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 43; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 44; X86-AVX512-NEXT: jmp LBB0_3 45; X86-AVX512-NEXT: LBB0_1: 46; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 47; X86-AVX512-NEXT: LBB0_3: ## %else 48; X86-AVX512-NEXT: vmovsd %xmm0, (%esp) 49; X86-AVX512-NEXT: fldl (%esp) 50; X86-AVX512-NEXT: addl $12, %esp 51; X86-AVX512-NEXT: retl 52 %mask = icmp eq <1 x i64> %trigger, zeroinitializer 53 %res = call <1 x double> @llvm.masked.load.v1f64.p0(ptr %addr, i32 4, <1 x i1> %mask, <1 x double> %dst) 54 ret <1 x double> %res 55} 56 57define <2 x double> @load_v2f64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) { 58; SSE2-LABEL: load_v2f64_v2i64: 59; SSE2: ## %bb.0: 60; SSE2-NEXT: pxor %xmm2, %xmm2 61; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 62; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] 63; SSE2-NEXT: pand %xmm2, %xmm0 64; SSE2-NEXT: movmskpd %xmm0, %eax 65; SSE2-NEXT: testb $1, %al 66; SSE2-NEXT: jne LBB1_1 67; SSE2-NEXT: ## %bb.2: ## %else 68; SSE2-NEXT: testb $2, %al 69; SSE2-NEXT: jne LBB1_3 70; SSE2-NEXT: LBB1_4: ## %else2 71; SSE2-NEXT: movaps %xmm1, %xmm0 72; SSE2-NEXT: retq 73; SSE2-NEXT: LBB1_1: ## %cond.load 74; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] 75; SSE2-NEXT: testb $2, %al 76; SSE2-NEXT: je LBB1_4 77; SSE2-NEXT: LBB1_3: ## %cond.load1 78; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] 79; SSE2-NEXT: movaps %xmm1, %xmm0 80; SSE2-NEXT: retq 81; 82; SSE42-LABEL: load_v2f64_v2i64: 83; SSE42: ## %bb.0: 84; SSE42-NEXT: pxor %xmm2, %xmm2 85; SSE42-NEXT: pcmpeqq %xmm0, %xmm2 86; SSE42-NEXT: movmskpd %xmm2, %eax 87; SSE42-NEXT: testb $1, %al 88; SSE42-NEXT: jne LBB1_1 89; SSE42-NEXT: ## %bb.2: ## %else 90; SSE42-NEXT: testb $2, %al 91; SSE42-NEXT: jne LBB1_3 92; SSE42-NEXT: LBB1_4: ## %else2 93; SSE42-NEXT: movaps %xmm1, %xmm0 94; SSE42-NEXT: retq 95; SSE42-NEXT: LBB1_1: ## 
%cond.load 96; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] 97; SSE42-NEXT: testb $2, %al 98; SSE42-NEXT: je LBB1_4 99; SSE42-NEXT: LBB1_3: ## %cond.load1 100; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] 101; SSE42-NEXT: movaps %xmm1, %xmm0 102; SSE42-NEXT: retq 103; 104; AVX1OR2-LABEL: load_v2f64_v2i64: 105; AVX1OR2: ## %bb.0: 106; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 107; AVX1OR2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 108; AVX1OR2-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 109; AVX1OR2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 110; AVX1OR2-NEXT: retq 111; 112; AVX512F-LABEL: load_v2f64_v2i64: 113; AVX512F: ## %bb.0: 114; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 115; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 116; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 117; AVX512F-NEXT: kshiftlw $14, %k0, %k0 118; AVX512F-NEXT: kshiftrw $14, %k0, %k1 119; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} 120; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 121; AVX512F-NEXT: vzeroupper 122; AVX512F-NEXT: retq 123; 124; AVX512VL-LABEL: load_v2f64_v2i64: 125; AVX512VL: ## %bb.0: 126; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1 127; AVX512VL-NEXT: vblendmpd (%rdi), %xmm1, %xmm0 {%k1} 128; AVX512VL-NEXT: retq 129; 130; X86-AVX512-LABEL: load_v2f64_v2i64: 131; X86-AVX512: ## %bb.0: 132; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 133; X86-AVX512-NEXT: vptestnmq %xmm0, %xmm0, %k1 134; X86-AVX512-NEXT: vblendmpd (%eax), %xmm1, %xmm0 {%k1} 135; X86-AVX512-NEXT: retl 136 %mask = icmp eq <2 x i64> %trigger, zeroinitializer 137 %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) 138 ret <2 x double> %res 139} 140 141define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, ptr %addr, <4 x double> %dst) { 142; SSE-LABEL: load_v4f64_v4i32: 143; SSE: ## %bb.0: 144; SSE-NEXT: pxor %xmm3, %xmm3 145; SSE-NEXT: pcmpeqd %xmm0, %xmm3 146; SSE-NEXT: movmskps %xmm3, %eax 147; SSE-NEXT: testb 
$1, %al 148; SSE-NEXT: jne LBB2_1 149; SSE-NEXT: ## %bb.2: ## %else 150; SSE-NEXT: testb $2, %al 151; SSE-NEXT: jne LBB2_3 152; SSE-NEXT: LBB2_4: ## %else2 153; SSE-NEXT: testb $4, %al 154; SSE-NEXT: jne LBB2_5 155; SSE-NEXT: LBB2_6: ## %else5 156; SSE-NEXT: testb $8, %al 157; SSE-NEXT: je LBB2_8 158; SSE-NEXT: LBB2_7: ## %cond.load7 159; SSE-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] 160; SSE-NEXT: LBB2_8: ## %else8 161; SSE-NEXT: movaps %xmm1, %xmm0 162; SSE-NEXT: movaps %xmm2, %xmm1 163; SSE-NEXT: retq 164; SSE-NEXT: LBB2_1: ## %cond.load 165; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] 166; SSE-NEXT: testb $2, %al 167; SSE-NEXT: je LBB2_4 168; SSE-NEXT: LBB2_3: ## %cond.load1 169; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] 170; SSE-NEXT: testb $4, %al 171; SSE-NEXT: je LBB2_6 172; SSE-NEXT: LBB2_5: ## %cond.load4 173; SSE-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] 174; SSE-NEXT: testb $8, %al 175; SSE-NEXT: jne LBB2_7 176; SSE-NEXT: jmp LBB2_8 177; 178; AVX1-LABEL: load_v4f64_v4i32: 179; AVX1: ## %bb.0: 180; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 181; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 182; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2 183; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] 184; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 185; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 186; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 187; AVX1-NEXT: retq 188; 189; AVX2-LABEL: load_v4f64_v4i32: 190; AVX2: ## %bb.0: 191; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 192; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 193; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 194; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 195; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 196; AVX2-NEXT: retq 197; 198; AVX512F-LABEL: load_v4f64_v4i32: 199; AVX512F: ## %bb.0: 200; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 201; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 202; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 203; AVX512F-NEXT: kshiftlw $12, %k0, %k0 204; 
AVX512F-NEXT: kshiftrw $12, %k0, %k1 205; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} 206; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 207; AVX512F-NEXT: retq 208; 209; AVX512VL-LABEL: load_v4f64_v4i32: 210; AVX512VL: ## %bb.0: 211; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1 212; AVX512VL-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1} 213; AVX512VL-NEXT: retq 214; 215; X86-AVX512-LABEL: load_v4f64_v4i32: 216; X86-AVX512: ## %bb.0: 217; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 218; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 219; X86-AVX512-NEXT: vblendmpd (%eax), %ymm1, %ymm0 {%k1} 220; X86-AVX512-NEXT: retl 221 %mask = icmp eq <4 x i32> %trigger, zeroinitializer 222 %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 32, <4 x i1> %mask, <4 x double> %dst) 223 ret <4 x double> %res 224} 225 226define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, ptr %addr) { 227; SSE-LABEL: load_v4f64_v4i32_zero: 228; SSE: ## %bb.0: 229; SSE-NEXT: movdqa %xmm0, %xmm1 230; SSE-NEXT: pxor %xmm0, %xmm0 231; SSE-NEXT: pcmpeqd %xmm0, %xmm1 232; SSE-NEXT: movmskps %xmm1, %eax 233; SSE-NEXT: testb $1, %al 234; SSE-NEXT: xorps %xmm1, %xmm1 235; SSE-NEXT: jne LBB3_1 236; SSE-NEXT: ## %bb.2: ## %else 237; SSE-NEXT: testb $2, %al 238; SSE-NEXT: jne LBB3_3 239; SSE-NEXT: LBB3_4: ## %else2 240; SSE-NEXT: testb $4, %al 241; SSE-NEXT: jne LBB3_5 242; SSE-NEXT: LBB3_6: ## %else5 243; SSE-NEXT: testb $8, %al 244; SSE-NEXT: jne LBB3_7 245; SSE-NEXT: LBB3_8: ## %else8 246; SSE-NEXT: retq 247; SSE-NEXT: LBB3_1: ## %cond.load 248; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 249; SSE-NEXT: testb $2, %al 250; SSE-NEXT: je LBB3_4 251; SSE-NEXT: LBB3_3: ## %cond.load1 252; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] 253; SSE-NEXT: testb $4, %al 254; SSE-NEXT: je LBB3_6 255; SSE-NEXT: LBB3_5: ## %cond.load4 256; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] 257; SSE-NEXT: testb $8, %al 258; SSE-NEXT: je LBB3_8 259; SSE-NEXT: LBB3_7: ## %cond.load7 
260; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] 261; SSE-NEXT: retq 262; 263; AVX1-LABEL: load_v4f64_v4i32_zero: 264; AVX1: ## %bb.0: 265; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 266; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 267; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 268; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] 269; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 270; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 271; AVX1-NEXT: retq 272; 273; AVX2-LABEL: load_v4f64_v4i32_zero: 274; AVX2: ## %bb.0: 275; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 276; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 277; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 278; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 279; AVX2-NEXT: retq 280; 281; AVX512F-LABEL: load_v4f64_v4i32_zero: 282; AVX512F: ## %bb.0: 283; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 284; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 285; AVX512F-NEXT: kshiftlw $12, %k0, %k0 286; AVX512F-NEXT: kshiftrw $12, %k0, %k1 287; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} 288; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 289; AVX512F-NEXT: retq 290; 291; AVX512VL-LABEL: load_v4f64_v4i32_zero: 292; AVX512VL: ## %bb.0: 293; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1 294; AVX512VL-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z} 295; AVX512VL-NEXT: retq 296; 297; X86-AVX512-LABEL: load_v4f64_v4i32_zero: 298; X86-AVX512: ## %bb.0: 299; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 300; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 301; X86-AVX512-NEXT: vmovapd (%eax), %ymm0 {%k1} {z} 302; X86-AVX512-NEXT: retl 303 %mask = icmp eq <4 x i32> %trigger, zeroinitializer 304 %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 32, <4 x i1> %mask, <4 x double>zeroinitializer) 305 ret <4 x double> %res 306} 307 308define <4 x double> @load_v4f64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x double> %dst) { 309; SSE2-LABEL: load_v4f64_v4i64: 310; SSE2: ## %bb.0: 311; SSE2-NEXT: pxor %xmm4, %xmm4 312; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 
313; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 314; SSE2-NEXT: movdqa %xmm0, %xmm4 315; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm1[1,3] 316; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 317; SSE2-NEXT: andps %xmm4, %xmm0 318; SSE2-NEXT: movmskps %xmm0, %eax 319; SSE2-NEXT: testb $1, %al 320; SSE2-NEXT: jne LBB4_1 321; SSE2-NEXT: ## %bb.2: ## %else 322; SSE2-NEXT: testb $2, %al 323; SSE2-NEXT: jne LBB4_3 324; SSE2-NEXT: LBB4_4: ## %else2 325; SSE2-NEXT: testb $4, %al 326; SSE2-NEXT: jne LBB4_5 327; SSE2-NEXT: LBB4_6: ## %else5 328; SSE2-NEXT: testb $8, %al 329; SSE2-NEXT: je LBB4_8 330; SSE2-NEXT: LBB4_7: ## %cond.load7 331; SSE2-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] 332; SSE2-NEXT: LBB4_8: ## %else8 333; SSE2-NEXT: movaps %xmm2, %xmm0 334; SSE2-NEXT: movaps %xmm3, %xmm1 335; SSE2-NEXT: retq 336; SSE2-NEXT: LBB4_1: ## %cond.load 337; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] 338; SSE2-NEXT: testb $2, %al 339; SSE2-NEXT: je LBB4_4 340; SSE2-NEXT: LBB4_3: ## %cond.load1 341; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] 342; SSE2-NEXT: testb $4, %al 343; SSE2-NEXT: je LBB4_6 344; SSE2-NEXT: LBB4_5: ## %cond.load4 345; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] 346; SSE2-NEXT: testb $8, %al 347; SSE2-NEXT: jne LBB4_7 348; SSE2-NEXT: jmp LBB4_8 349; 350; SSE42-LABEL: load_v4f64_v4i64: 351; SSE42: ## %bb.0: 352; SSE42-NEXT: pxor %xmm4, %xmm4 353; SSE42-NEXT: pcmpeqq %xmm4, %xmm1 354; SSE42-NEXT: pcmpeqq %xmm4, %xmm0 355; SSE42-NEXT: packssdw %xmm1, %xmm0 356; SSE42-NEXT: movmskps %xmm0, %eax 357; SSE42-NEXT: testb $1, %al 358; SSE42-NEXT: jne LBB4_1 359; SSE42-NEXT: ## %bb.2: ## %else 360; SSE42-NEXT: testb $2, %al 361; SSE42-NEXT: jne LBB4_3 362; SSE42-NEXT: LBB4_4: ## %else2 363; SSE42-NEXT: testb $4, %al 364; SSE42-NEXT: jne LBB4_5 365; SSE42-NEXT: LBB4_6: ## %else5 366; SSE42-NEXT: testb $8, %al 367; SSE42-NEXT: je LBB4_8 368; SSE42-NEXT: LBB4_7: ## %cond.load7 369; SSE42-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] 370; 
SSE42-NEXT: LBB4_8: ## %else8 371; SSE42-NEXT: movaps %xmm2, %xmm0 372; SSE42-NEXT: movaps %xmm3, %xmm1 373; SSE42-NEXT: retq 374; SSE42-NEXT: LBB4_1: ## %cond.load 375; SSE42-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] 376; SSE42-NEXT: testb $2, %al 377; SSE42-NEXT: je LBB4_4 378; SSE42-NEXT: LBB4_3: ## %cond.load1 379; SSE42-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] 380; SSE42-NEXT: testb $4, %al 381; SSE42-NEXT: je LBB4_6 382; SSE42-NEXT: LBB4_5: ## %cond.load4 383; SSE42-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] 384; SSE42-NEXT: testb $8, %al 385; SSE42-NEXT: jne LBB4_7 386; SSE42-NEXT: jmp LBB4_8 387; 388; AVX1-LABEL: load_v4f64_v4i64: 389; AVX1: ## %bb.0: 390; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 391; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 392; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2 393; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 394; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 395; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 396; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 397; AVX1-NEXT: retq 398; 399; AVX2-LABEL: load_v4f64_v4i64: 400; AVX2: ## %bb.0: 401; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 402; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 403; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 404; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 405; AVX2-NEXT: retq 406; 407; AVX512F-LABEL: load_v4f64_v4i64: 408; AVX512F: ## %bb.0: 409; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 410; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 411; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 412; AVX512F-NEXT: kshiftlw $12, %k0, %k0 413; AVX512F-NEXT: kshiftrw $12, %k0, %k1 414; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} 415; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 416; AVX512F-NEXT: retq 417; 418; AVX512VL-LABEL: load_v4f64_v4i64: 419; AVX512VL: ## %bb.0: 420; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k1 421; AVX512VL-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1} 422; AVX512VL-NEXT: retq 423; 424; X86-AVX512-LABEL: 
load_v4f64_v4i64: 425; X86-AVX512: ## %bb.0: 426; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 427; X86-AVX512-NEXT: vptestnmq %ymm0, %ymm0, %k1 428; X86-AVX512-NEXT: vblendmpd (%eax), %ymm1, %ymm0 {%k1} 429; X86-AVX512-NEXT: retl 430 %mask = icmp eq <4 x i64> %trigger, zeroinitializer 431 %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x double> %dst) 432 ret <4 x double> %res 433} 434 435define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, ptr %addr, <8 x double> %dst) { 436; SSE-LABEL: load_v8f64_v8i16: 437; SSE: ## %bb.0: 438; SSE-NEXT: pxor %xmm5, %xmm5 439; SSE-NEXT: pcmpeqw %xmm0, %xmm5 440; SSE-NEXT: packsswb %xmm5, %xmm5 441; SSE-NEXT: pmovmskb %xmm5, %eax 442; SSE-NEXT: testb $1, %al 443; SSE-NEXT: jne LBB5_1 444; SSE-NEXT: ## %bb.2: ## %else 445; SSE-NEXT: testb $2, %al 446; SSE-NEXT: jne LBB5_3 447; SSE-NEXT: LBB5_4: ## %else2 448; SSE-NEXT: testb $4, %al 449; SSE-NEXT: jne LBB5_5 450; SSE-NEXT: LBB5_6: ## %else5 451; SSE-NEXT: testb $8, %al 452; SSE-NEXT: jne LBB5_7 453; SSE-NEXT: LBB5_8: ## %else8 454; SSE-NEXT: testb $16, %al 455; SSE-NEXT: jne LBB5_9 456; SSE-NEXT: LBB5_10: ## %else11 457; SSE-NEXT: testb $32, %al 458; SSE-NEXT: jne LBB5_11 459; SSE-NEXT: LBB5_12: ## %else14 460; SSE-NEXT: testb $64, %al 461; SSE-NEXT: jne LBB5_13 462; SSE-NEXT: LBB5_14: ## %else17 463; SSE-NEXT: testb $-128, %al 464; SSE-NEXT: je LBB5_16 465; SSE-NEXT: LBB5_15: ## %cond.load19 466; SSE-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] 467; SSE-NEXT: LBB5_16: ## %else20 468; SSE-NEXT: movaps %xmm1, %xmm0 469; SSE-NEXT: movaps %xmm2, %xmm1 470; SSE-NEXT: movaps %xmm3, %xmm2 471; SSE-NEXT: movaps %xmm4, %xmm3 472; SSE-NEXT: retq 473; SSE-NEXT: LBB5_1: ## %cond.load 474; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] 475; SSE-NEXT: testb $2, %al 476; SSE-NEXT: je LBB5_4 477; SSE-NEXT: LBB5_3: ## %cond.load1 478; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] 479; SSE-NEXT: testb $4, %al 480; SSE-NEXT: je LBB5_6 481; 
SSE-NEXT: LBB5_5: ## %cond.load4 482; SSE-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] 483; SSE-NEXT: testb $8, %al 484; SSE-NEXT: je LBB5_8 485; SSE-NEXT: LBB5_7: ## %cond.load7 486; SSE-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] 487; SSE-NEXT: testb $16, %al 488; SSE-NEXT: je LBB5_10 489; SSE-NEXT: LBB5_9: ## %cond.load10 490; SSE-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] 491; SSE-NEXT: testb $32, %al 492; SSE-NEXT: je LBB5_12 493; SSE-NEXT: LBB5_11: ## %cond.load13 494; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] 495; SSE-NEXT: testb $64, %al 496; SSE-NEXT: je LBB5_14 497; SSE-NEXT: LBB5_13: ## %cond.load16 498; SSE-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] 499; SSE-NEXT: testb $-128, %al 500; SSE-NEXT: jne LBB5_15 501; SSE-NEXT: jmp LBB5_16 502; 503; AVX1-LABEL: load_v8f64_v8i16: 504; AVX1: ## %bb.0: 505; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 506; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 507; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 508; AVX1-NEXT: vpmovsxwq %xmm3, %xmm5 509; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] 510; AVX1-NEXT: vpmovsxwq %xmm3, %xmm3 511; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 512; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 513; AVX1-NEXT: vpmovsxwq %xmm0, %xmm4 514; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 515; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 516; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 517; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4 518; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0 519; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1 520; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 521; AVX1-NEXT: retq 522; 523; AVX2-LABEL: load_v8f64_v8i16: 524; AVX2: ## %bb.0: 525; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 526; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 527; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 528; AVX2-NEXT: vpmovsxwq %xmm3, %ymm3 529; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 530; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 531; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4 
532; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0 533; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1 534; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 535; AVX2-NEXT: retq 536; 537; AVX512F-LABEL: load_v8f64_v8i16: 538; AVX512F: ## %bb.0: 539; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 540; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 541; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 542; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 543; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} 544; AVX512F-NEXT: retq 545; 546; AVX512VLDQ-LABEL: load_v8f64_v8i16: 547; AVX512VLDQ: ## %bb.0: 548; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 549; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 550; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0 551; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1 552; AVX512VLDQ-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} 553; AVX512VLDQ-NEXT: retq 554; 555; AVX512VLBW-LABEL: load_v8f64_v8i16: 556; AVX512VLBW: ## %bb.0: 557; AVX512VLBW-NEXT: vptestnmw %xmm0, %xmm0, %k1 558; AVX512VLBW-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} 559; AVX512VLBW-NEXT: retq 560; 561; X86-AVX512-LABEL: load_v8f64_v8i16: 562; X86-AVX512: ## %bb.0: 563; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 564; X86-AVX512-NEXT: vptestnmw %xmm0, %xmm0, %k1 565; X86-AVX512-NEXT: vblendmpd (%eax), %zmm1, %zmm0 {%k1} 566; X86-AVX512-NEXT: retl 567 %mask = icmp eq <8 x i16> %trigger, zeroinitializer 568 %res = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x double> %dst) 569 ret <8 x double> %res 570} 571 572define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x double> %dst) { 573; SSE2-LABEL: load_v8f64_v8i64: 574; SSE2: ## %bb.0: 575; SSE2-NEXT: pxor %xmm8, %xmm8 576; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 577; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,0,3,2] 578; SSE2-NEXT: pand %xmm3, %xmm9 579; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 580; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] 581; SSE2-NEXT: pand %xmm2, %xmm3 582; SSE2-NEXT: packssdw %xmm9, %xmm3 583; 
SSE2-NEXT: pcmpeqd %xmm8, %xmm1 584; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] 585; SSE2-NEXT: pand %xmm1, %xmm2 586; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 587; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] 588; SSE2-NEXT: pand %xmm0, %xmm1 589; SSE2-NEXT: packssdw %xmm2, %xmm1 590; SSE2-NEXT: packssdw %xmm3, %xmm1 591; SSE2-NEXT: packsswb %xmm1, %xmm1 592; SSE2-NEXT: pmovmskb %xmm1, %eax 593; SSE2-NEXT: testb $1, %al 594; SSE2-NEXT: jne LBB6_1 595; SSE2-NEXT: ## %bb.2: ## %else 596; SSE2-NEXT: testb $2, %al 597; SSE2-NEXT: jne LBB6_3 598; SSE2-NEXT: LBB6_4: ## %else2 599; SSE2-NEXT: testb $4, %al 600; SSE2-NEXT: jne LBB6_5 601; SSE2-NEXT: LBB6_6: ## %else5 602; SSE2-NEXT: testb $8, %al 603; SSE2-NEXT: jne LBB6_7 604; SSE2-NEXT: LBB6_8: ## %else8 605; SSE2-NEXT: testb $16, %al 606; SSE2-NEXT: jne LBB6_9 607; SSE2-NEXT: LBB6_10: ## %else11 608; SSE2-NEXT: testb $32, %al 609; SSE2-NEXT: jne LBB6_11 610; SSE2-NEXT: LBB6_12: ## %else14 611; SSE2-NEXT: testb $64, %al 612; SSE2-NEXT: jne LBB6_13 613; SSE2-NEXT: LBB6_14: ## %else17 614; SSE2-NEXT: testb $-128, %al 615; SSE2-NEXT: je LBB6_16 616; SSE2-NEXT: LBB6_15: ## %cond.load19 617; SSE2-NEXT: movhps {{.*#+}} xmm7 = xmm7[0,1],mem[0,1] 618; SSE2-NEXT: LBB6_16: ## %else20 619; SSE2-NEXT: movaps %xmm4, %xmm0 620; SSE2-NEXT: movaps %xmm5, %xmm1 621; SSE2-NEXT: movaps %xmm6, %xmm2 622; SSE2-NEXT: movaps %xmm7, %xmm3 623; SSE2-NEXT: retq 624; SSE2-NEXT: LBB6_1: ## %cond.load 625; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] 626; SSE2-NEXT: testb $2, %al 627; SSE2-NEXT: je LBB6_4 628; SSE2-NEXT: LBB6_3: ## %cond.load1 629; SSE2-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] 630; SSE2-NEXT: testb $4, %al 631; SSE2-NEXT: je LBB6_6 632; SSE2-NEXT: LBB6_5: ## %cond.load4 633; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] 634; SSE2-NEXT: testb $8, %al 635; SSE2-NEXT: je LBB6_8 636; SSE2-NEXT: LBB6_7: ## %cond.load7 637; SSE2-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1] 638; SSE2-NEXT: testb $16, %al 639; 
SSE2-NEXT: je LBB6_10 640; SSE2-NEXT: LBB6_9: ## %cond.load10 641; SSE2-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] 642; SSE2-NEXT: testb $32, %al 643; SSE2-NEXT: je LBB6_12 644; SSE2-NEXT: LBB6_11: ## %cond.load13 645; SSE2-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1] 646; SSE2-NEXT: testb $64, %al 647; SSE2-NEXT: je LBB6_14 648; SSE2-NEXT: LBB6_13: ## %cond.load16 649; SSE2-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] 650; SSE2-NEXT: testb $-128, %al 651; SSE2-NEXT: jne LBB6_15 652; SSE2-NEXT: jmp LBB6_16 653; 654; SSE42-LABEL: load_v8f64_v8i64: 655; SSE42: ## %bb.0: 656; SSE42-NEXT: pxor %xmm8, %xmm8 657; SSE42-NEXT: pcmpeqq %xmm8, %xmm3 658; SSE42-NEXT: pcmpeqq %xmm8, %xmm2 659; SSE42-NEXT: packssdw %xmm3, %xmm2 660; SSE42-NEXT: pcmpeqq %xmm8, %xmm1 661; SSE42-NEXT: pcmpeqq %xmm8, %xmm0 662; SSE42-NEXT: packssdw %xmm1, %xmm0 663; SSE42-NEXT: packssdw %xmm2, %xmm0 664; SSE42-NEXT: packsswb %xmm0, %xmm0 665; SSE42-NEXT: pmovmskb %xmm0, %eax 666; SSE42-NEXT: testb $1, %al 667; SSE42-NEXT: jne LBB6_1 668; SSE42-NEXT: ## %bb.2: ## %else 669; SSE42-NEXT: testb $2, %al 670; SSE42-NEXT: jne LBB6_3 671; SSE42-NEXT: LBB6_4: ## %else2 672; SSE42-NEXT: testb $4, %al 673; SSE42-NEXT: jne LBB6_5 674; SSE42-NEXT: LBB6_6: ## %else5 675; SSE42-NEXT: testb $8, %al 676; SSE42-NEXT: jne LBB6_7 677; SSE42-NEXT: LBB6_8: ## %else8 678; SSE42-NEXT: testb $16, %al 679; SSE42-NEXT: jne LBB6_9 680; SSE42-NEXT: LBB6_10: ## %else11 681; SSE42-NEXT: testb $32, %al 682; SSE42-NEXT: jne LBB6_11 683; SSE42-NEXT: LBB6_12: ## %else14 684; SSE42-NEXT: testb $64, %al 685; SSE42-NEXT: jne LBB6_13 686; SSE42-NEXT: LBB6_14: ## %else17 687; SSE42-NEXT: testb $-128, %al 688; SSE42-NEXT: je LBB6_16 689; SSE42-NEXT: LBB6_15: ## %cond.load19 690; SSE42-NEXT: movhps {{.*#+}} xmm7 = xmm7[0,1],mem[0,1] 691; SSE42-NEXT: LBB6_16: ## %else20 692; SSE42-NEXT: movaps %xmm4, %xmm0 693; SSE42-NEXT: movaps %xmm5, %xmm1 694; SSE42-NEXT: movaps %xmm6, %xmm2 695; SSE42-NEXT: movaps %xmm7, %xmm3 696; 
SSE42-NEXT: retq 697; SSE42-NEXT: LBB6_1: ## %cond.load 698; SSE42-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] 699; SSE42-NEXT: testb $2, %al 700; SSE42-NEXT: je LBB6_4 701; SSE42-NEXT: LBB6_3: ## %cond.load1 702; SSE42-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] 703; SSE42-NEXT: testb $4, %al 704; SSE42-NEXT: je LBB6_6 705; SSE42-NEXT: LBB6_5: ## %cond.load4 706; SSE42-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] 707; SSE42-NEXT: testb $8, %al 708; SSE42-NEXT: je LBB6_8 709; SSE42-NEXT: LBB6_7: ## %cond.load7 710; SSE42-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1] 711; SSE42-NEXT: testb $16, %al 712; SSE42-NEXT: je LBB6_10 713; SSE42-NEXT: LBB6_9: ## %cond.load10 714; SSE42-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] 715; SSE42-NEXT: testb $32, %al 716; SSE42-NEXT: je LBB6_12 717; SSE42-NEXT: LBB6_11: ## %cond.load13 718; SSE42-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1] 719; SSE42-NEXT: testb $64, %al 720; SSE42-NEXT: je LBB6_14 721; SSE42-NEXT: LBB6_13: ## %cond.load16 722; SSE42-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] 723; SSE42-NEXT: testb $-128, %al 724; SSE42-NEXT: jne LBB6_15 725; SSE42-NEXT: jmp LBB6_16 726; 727; AVX1-LABEL: load_v8f64_v8i64: 728; AVX1: ## %bb.0: 729; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 730; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 731; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4 732; AVX1-NEXT: vpcmpeqq %xmm5, %xmm1, %xmm1 733; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 734; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 735; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4 736; AVX1-NEXT: vpcmpeqq %xmm5, %xmm0, %xmm0 737; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 738; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4 739; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0 740; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm2 741; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1 742; AVX1-NEXT: retq 743; 744; AVX2-LABEL: load_v8f64_v8i64: 745; AVX2: ## %bb.0: 746; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 747; AVX2-NEXT: vpcmpeqq %ymm4, 
%ymm1, %ymm1 748; AVX2-NEXT: vpcmpeqq %ymm4, %ymm0, %ymm0 749; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4 750; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0 751; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm2 752; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1 753; AVX2-NEXT: retq 754; 755; AVX512-LABEL: load_v8f64_v8i64: 756; AVX512: ## %bb.0: 757; AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k1 758; AVX512-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} 759; AVX512-NEXT: retq 760; 761; X86-AVX512-LABEL: load_v8f64_v8i64: 762; X86-AVX512: ## %bb.0: 763; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 764; X86-AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k1 765; X86-AVX512-NEXT: vblendmpd (%eax), %zmm1, %zmm0 {%k1} 766; X86-AVX512-NEXT: retl 767 %mask = icmp eq <8 x i64> %trigger, zeroinitializer 768 %res = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x double> %dst) 769 ret <8 x double> %res 770} 771 772; 773; vXf32 774; 775 776define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) { 777; SSE2-LABEL: load_v2f32_v2i32: 778; SSE2: ## %bb.0: 779; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 780; SSE2-NEXT: pxor %xmm2, %xmm2 781; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 782; SSE2-NEXT: movmskpd %xmm2, %eax 783; SSE2-NEXT: testb $1, %al 784; SSE2-NEXT: jne LBB7_1 785; SSE2-NEXT: ## %bb.2: ## %else 786; SSE2-NEXT: testb $2, %al 787; SSE2-NEXT: jne LBB7_3 788; SSE2-NEXT: LBB7_4: ## %else2 789; SSE2-NEXT: movaps %xmm1, %xmm0 790; SSE2-NEXT: retq 791; SSE2-NEXT: LBB7_1: ## %cond.load 792; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 793; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 794; SSE2-NEXT: testb $2, %al 795; SSE2-NEXT: je LBB7_4 796; SSE2-NEXT: LBB7_3: ## %cond.load1 797; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 798; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 799; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 800; SSE2-NEXT: movaps %xmm0, %xmm1 801; SSE2-NEXT: movaps %xmm1, 
%xmm0 802; SSE2-NEXT: retq 803; 804; SSE42-LABEL: load_v2f32_v2i32: 805; SSE42: ## %bb.0: 806; SSE42-NEXT: pxor %xmm2, %xmm2 807; SSE42-NEXT: pcmpeqd %xmm0, %xmm2 808; SSE42-NEXT: pmovsxdq %xmm2, %xmm0 809; SSE42-NEXT: movmskpd %xmm0, %eax 810; SSE42-NEXT: testb $1, %al 811; SSE42-NEXT: jne LBB7_1 812; SSE42-NEXT: ## %bb.2: ## %else 813; SSE42-NEXT: testb $2, %al 814; SSE42-NEXT: jne LBB7_3 815; SSE42-NEXT: LBB7_4: ## %else2 816; SSE42-NEXT: movaps %xmm1, %xmm0 817; SSE42-NEXT: retq 818; SSE42-NEXT: LBB7_1: ## %cond.load 819; SSE42-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 820; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 821; SSE42-NEXT: testb $2, %al 822; SSE42-NEXT: je LBB7_4 823; SSE42-NEXT: LBB7_3: ## %cond.load1 824; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] 825; SSE42-NEXT: movaps %xmm1, %xmm0 826; SSE42-NEXT: retq 827; 828; AVX1OR2-LABEL: load_v2f32_v2i32: 829; AVX1OR2: ## %bb.0: 830; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 831; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 832; AVX1OR2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 833; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 834; AVX1OR2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 835; AVX1OR2-NEXT: retq 836; 837; AVX512F-LABEL: load_v2f32_v2i32: 838; AVX512F: ## %bb.0: 839; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 840; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 841; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 842; AVX512F-NEXT: kshiftlw $14, %k0, %k0 843; AVX512F-NEXT: kshiftrw $14, %k0, %k1 844; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1} 845; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 846; AVX512F-NEXT: vzeroupper 847; AVX512F-NEXT: retq 848; 849; AVX512VLDQ-LABEL: load_v2f32_v2i32: 850; AVX512VLDQ: ## %bb.0: 851; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0 852; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0 853; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1 854; AVX512VLDQ-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1} 855; 
AVX512VLDQ-NEXT: retq 856; 857; AVX512VLBW-LABEL: load_v2f32_v2i32: 858; AVX512VLBW: ## %bb.0: 859; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0 860; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0 861; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1 862; AVX512VLBW-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1} 863; AVX512VLBW-NEXT: retq 864; 865; X86-AVX512-LABEL: load_v2f32_v2i32: 866; X86-AVX512: ## %bb.0: 867; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 868; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0 869; X86-AVX512-NEXT: kshiftlb $6, %k0, %k0 870; X86-AVX512-NEXT: kshiftrb $6, %k0, %k1 871; X86-AVX512-NEXT: vblendmps (%eax), %xmm1, %xmm0 {%k1} 872; X86-AVX512-NEXT: retl 873 %mask = icmp eq <2 x i32> %trigger, zeroinitializer 874 %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) 875 ret <2 x float> %res 876} 877 878define <2 x float> @load_v2f32_v2i32_undef(<2 x i32> %trigger, ptr %addr) { 879; SSE2-LABEL: load_v2f32_v2i32_undef: 880; SSE2: ## %bb.0: 881; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 882; SSE2-NEXT: pxor %xmm1, %xmm1 883; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 884; SSE2-NEXT: movmskpd %xmm1, %eax 885; SSE2-NEXT: testb $1, %al 886; SSE2-NEXT: ## implicit-def: $xmm0 887; SSE2-NEXT: jne LBB8_1 888; SSE2-NEXT: ## %bb.2: ## %else 889; SSE2-NEXT: testb $2, %al 890; SSE2-NEXT: jne LBB8_3 891; SSE2-NEXT: LBB8_4: ## %else2 892; SSE2-NEXT: retq 893; SSE2-NEXT: LBB8_1: ## %cond.load 894; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 895; SSE2-NEXT: testb $2, %al 896; SSE2-NEXT: je LBB8_4 897; SSE2-NEXT: LBB8_3: ## %cond.load1 898; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 899; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 900; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 901; SSE2-NEXT: movaps %xmm1, %xmm0 902; SSE2-NEXT: retq 903; 904; SSE42-LABEL: load_v2f32_v2i32_undef: 905; SSE42: ## %bb.0: 906; SSE42-NEXT: pxor %xmm1, %xmm1 907; SSE42-NEXT: pcmpeqd %xmm0, %xmm1 908; SSE42-NEXT: 
pmovsxdq %xmm1, %xmm0 909; SSE42-NEXT: movmskpd %xmm0, %eax 910; SSE42-NEXT: testb $1, %al 911; SSE42-NEXT: ## implicit-def: $xmm0 912; SSE42-NEXT: jne LBB8_1 913; SSE42-NEXT: ## %bb.2: ## %else 914; SSE42-NEXT: testb $2, %al 915; SSE42-NEXT: jne LBB8_3 916; SSE42-NEXT: LBB8_4: ## %else2 917; SSE42-NEXT: retq 918; SSE42-NEXT: LBB8_1: ## %cond.load 919; SSE42-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 920; SSE42-NEXT: testb $2, %al 921; SSE42-NEXT: je LBB8_4 922; SSE42-NEXT: LBB8_3: ## %cond.load1 923; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] 924; SSE42-NEXT: retq 925; 926; AVX1OR2-LABEL: load_v2f32_v2i32_undef: 927; AVX1OR2: ## %bb.0: 928; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 929; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 930; AVX1OR2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 931; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 932; AVX1OR2-NEXT: retq 933; 934; AVX512F-LABEL: load_v2f32_v2i32_undef: 935; AVX512F: ## %bb.0: 936; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 937; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 938; AVX512F-NEXT: kshiftlw $14, %k0, %k0 939; AVX512F-NEXT: kshiftrw $14, %k0, %k1 940; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} 941; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 942; AVX512F-NEXT: vzeroupper 943; AVX512F-NEXT: retq 944; 945; AVX512VLDQ-LABEL: load_v2f32_v2i32_undef: 946; AVX512VLDQ: ## %bb.0: 947; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0 948; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0 949; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1 950; AVX512VLDQ-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} 951; AVX512VLDQ-NEXT: retq 952; 953; AVX512VLBW-LABEL: load_v2f32_v2i32_undef: 954; AVX512VLBW: ## %bb.0: 955; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0 956; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0 957; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1 958; AVX512VLBW-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} 959; AVX512VLBW-NEXT: retq 960; 961; X86-AVX512-LABEL: load_v2f32_v2i32_undef: 962; X86-AVX512: ## 
%bb.0: 963; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 964; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0 965; X86-AVX512-NEXT: kshiftlb $6, %k0, %k0 966; X86-AVX512-NEXT: kshiftrb $6, %k0, %k1 967; X86-AVX512-NEXT: vmovups (%eax), %xmm0 {%k1} {z} 968; X86-AVX512-NEXT: retl 969 %mask = icmp eq <2 x i32> %trigger, zeroinitializer 970 %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float>undef) 971 ret <2 x float> %res 972} 973 974define <4 x float> @load_v4f32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x float> %dst) { 975; SSE2-LABEL: load_v4f32_v4i32: 976; SSE2: ## %bb.0: 977; SSE2-NEXT: pxor %xmm2, %xmm2 978; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 979; SSE2-NEXT: movmskps %xmm2, %eax 980; SSE2-NEXT: testb $1, %al 981; SSE2-NEXT: jne LBB9_1 982; SSE2-NEXT: ## %bb.2: ## %else 983; SSE2-NEXT: testb $2, %al 984; SSE2-NEXT: jne LBB9_3 985; SSE2-NEXT: LBB9_4: ## %else2 986; SSE2-NEXT: testb $4, %al 987; SSE2-NEXT: jne LBB9_5 988; SSE2-NEXT: LBB9_6: ## %else5 989; SSE2-NEXT: testb $8, %al 990; SSE2-NEXT: jne LBB9_7 991; SSE2-NEXT: LBB9_8: ## %else8 992; SSE2-NEXT: movaps %xmm1, %xmm0 993; SSE2-NEXT: retq 994; SSE2-NEXT: LBB9_1: ## %cond.load 995; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 996; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 997; SSE2-NEXT: testb $2, %al 998; SSE2-NEXT: je LBB9_4 999; SSE2-NEXT: LBB9_3: ## %cond.load1 1000; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1001; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1002; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1003; SSE2-NEXT: movaps %xmm0, %xmm1 1004; SSE2-NEXT: testb $4, %al 1005; SSE2-NEXT: je LBB9_6 1006; SSE2-NEXT: LBB9_5: ## %cond.load4 1007; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1008; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 1009; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] 1010; SSE2-NEXT: testb $8, %al 1011; SSE2-NEXT: je LBB9_8 1012; SSE2-NEXT: LBB9_7: ## %cond.load7 1013; 
SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1014; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1015; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] 1016; SSE2-NEXT: movaps %xmm1, %xmm0 1017; SSE2-NEXT: retq 1018; 1019; SSE42-LABEL: load_v4f32_v4i32: 1020; SSE42: ## %bb.0: 1021; SSE42-NEXT: pxor %xmm2, %xmm2 1022; SSE42-NEXT: pcmpeqd %xmm0, %xmm2 1023; SSE42-NEXT: movmskps %xmm2, %eax 1024; SSE42-NEXT: testb $1, %al 1025; SSE42-NEXT: jne LBB9_1 1026; SSE42-NEXT: ## %bb.2: ## %else 1027; SSE42-NEXT: testb $2, %al 1028; SSE42-NEXT: jne LBB9_3 1029; SSE42-NEXT: LBB9_4: ## %else2 1030; SSE42-NEXT: testb $4, %al 1031; SSE42-NEXT: jne LBB9_5 1032; SSE42-NEXT: LBB9_6: ## %else5 1033; SSE42-NEXT: testb $8, %al 1034; SSE42-NEXT: jne LBB9_7 1035; SSE42-NEXT: LBB9_8: ## %else8 1036; SSE42-NEXT: movaps %xmm1, %xmm0 1037; SSE42-NEXT: retq 1038; SSE42-NEXT: LBB9_1: ## %cond.load 1039; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1040; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1041; SSE42-NEXT: testb $2, %al 1042; SSE42-NEXT: je LBB9_4 1043; SSE42-NEXT: LBB9_3: ## %cond.load1 1044; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] 1045; SSE42-NEXT: testb $4, %al 1046; SSE42-NEXT: je LBB9_6 1047; SSE42-NEXT: LBB9_5: ## %cond.load4 1048; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] 1049; SSE42-NEXT: testb $8, %al 1050; SSE42-NEXT: je LBB9_8 1051; SSE42-NEXT: LBB9_7: ## %cond.load7 1052; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] 1053; SSE42-NEXT: movaps %xmm1, %xmm0 1054; SSE42-NEXT: retq 1055; 1056; AVX1OR2-LABEL: load_v4f32_v4i32: 1057; AVX1OR2: ## %bb.0: 1058; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1059; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 1060; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 1061; AVX1OR2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 1062; AVX1OR2-NEXT: retq 1063; 1064; AVX512F-LABEL: load_v4f32_v4i32: 1065; AVX512F: ## %bb.0: 1066; AVX512F-NEXT: ## kill: def 
$xmm1 killed $xmm1 def $zmm1 1067; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 1068; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 1069; AVX512F-NEXT: kshiftlw $12, %k0, %k0 1070; AVX512F-NEXT: kshiftrw $12, %k0, %k1 1071; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1} 1072; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 1073; AVX512F-NEXT: vzeroupper 1074; AVX512F-NEXT: retq 1075; 1076; AVX512VL-LABEL: load_v4f32_v4i32: 1077; AVX512VL: ## %bb.0: 1078; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1 1079; AVX512VL-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1} 1080; AVX512VL-NEXT: retq 1081; 1082; X86-AVX512-LABEL: load_v4f32_v4i32: 1083; X86-AVX512: ## %bb.0: 1084; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 1085; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 1086; X86-AVX512-NEXT: vblendmps (%eax), %xmm1, %xmm0 {%k1} 1087; X86-AVX512-NEXT: retl 1088 %mask = icmp eq <4 x i32> %trigger, zeroinitializer 1089 %res = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x float> %dst) 1090 ret <4 x float> %res 1091} 1092 1093define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, ptr %addr) { 1094; SSE2-LABEL: load_v8f32_v8i1_zero: 1095; SSE2: ## %bb.0: 1096; SSE2-NEXT: psllw $15, %xmm0 1097; SSE2-NEXT: packsswb %xmm0, %xmm0 1098; SSE2-NEXT: pmovmskb %xmm0, %eax 1099; SSE2-NEXT: pxor %xmm0, %xmm0 1100; SSE2-NEXT: testb $1, %al 1101; SSE2-NEXT: xorps %xmm1, %xmm1 1102; SSE2-NEXT: jne LBB10_1 1103; SSE2-NEXT: ## %bb.2: ## %else 1104; SSE2-NEXT: testb $2, %al 1105; SSE2-NEXT: jne LBB10_3 1106; SSE2-NEXT: LBB10_4: ## %else2 1107; SSE2-NEXT: testb $4, %al 1108; SSE2-NEXT: jne LBB10_5 1109; SSE2-NEXT: LBB10_6: ## %else5 1110; SSE2-NEXT: testb $8, %al 1111; SSE2-NEXT: jne LBB10_7 1112; SSE2-NEXT: LBB10_8: ## %else8 1113; SSE2-NEXT: testb $16, %al 1114; SSE2-NEXT: jne LBB10_9 1115; SSE2-NEXT: LBB10_10: ## %else11 1116; SSE2-NEXT: testb $32, %al 1117; SSE2-NEXT: jne LBB10_11 1118; SSE2-NEXT: LBB10_12: ## %else14 1119; SSE2-NEXT: testb 
$64, %al 1120; SSE2-NEXT: jne LBB10_13 1121; SSE2-NEXT: LBB10_14: ## %else17 1122; SSE2-NEXT: testb $-128, %al 1123; SSE2-NEXT: jne LBB10_15 1124; SSE2-NEXT: LBB10_16: ## %else20 1125; SSE2-NEXT: retq 1126; SSE2-NEXT: LBB10_1: ## %cond.load 1127; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1128; SSE2-NEXT: testb $2, %al 1129; SSE2-NEXT: je LBB10_4 1130; SSE2-NEXT: LBB10_3: ## %cond.load1 1131; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 1132; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] 1133; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] 1134; SSE2-NEXT: movaps %xmm2, %xmm0 1135; SSE2-NEXT: testb $4, %al 1136; SSE2-NEXT: je LBB10_6 1137; SSE2-NEXT: LBB10_5: ## %cond.load4 1138; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1139; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0] 1140; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] 1141; SSE2-NEXT: testb $8, %al 1142; SSE2-NEXT: je LBB10_8 1143; SSE2-NEXT: LBB10_7: ## %cond.load7 1144; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1145; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] 1146; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] 1147; SSE2-NEXT: testb $16, %al 1148; SSE2-NEXT: je LBB10_10 1149; SSE2-NEXT: LBB10_9: ## %cond.load10 1150; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1151; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] 1152; SSE2-NEXT: testb $32, %al 1153; SSE2-NEXT: je LBB10_12 1154; SSE2-NEXT: LBB10_11: ## %cond.load13 1155; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1156; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] 1157; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] 1158; SSE2-NEXT: movaps %xmm2, %xmm1 1159; SSE2-NEXT: testb $64, %al 1160; SSE2-NEXT: je LBB10_14 1161; SSE2-NEXT: LBB10_13: ## %cond.load16 1162; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1163; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] 1164; SSE2-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[0,1],xmm2[0,2] 1165; SSE2-NEXT: testb $-128, %al 1166; SSE2-NEXT: je LBB10_16 1167; SSE2-NEXT: LBB10_15: ## %cond.load19 1168; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1169; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] 1170; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] 1171; SSE2-NEXT: retq 1172; 1173; SSE42-LABEL: load_v8f32_v8i1_zero: 1174; SSE42: ## %bb.0: 1175; SSE42-NEXT: psllw $15, %xmm0 1176; SSE42-NEXT: packsswb %xmm0, %xmm0 1177; SSE42-NEXT: pmovmskb %xmm0, %eax 1178; SSE42-NEXT: pxor %xmm0, %xmm0 1179; SSE42-NEXT: testb $1, %al 1180; SSE42-NEXT: xorps %xmm1, %xmm1 1181; SSE42-NEXT: jne LBB10_1 1182; SSE42-NEXT: ## %bb.2: ## %else 1183; SSE42-NEXT: testb $2, %al 1184; SSE42-NEXT: jne LBB10_3 1185; SSE42-NEXT: LBB10_4: ## %else2 1186; SSE42-NEXT: testb $4, %al 1187; SSE42-NEXT: jne LBB10_5 1188; SSE42-NEXT: LBB10_6: ## %else5 1189; SSE42-NEXT: testb $8, %al 1190; SSE42-NEXT: jne LBB10_7 1191; SSE42-NEXT: LBB10_8: ## %else8 1192; SSE42-NEXT: testb $16, %al 1193; SSE42-NEXT: jne LBB10_9 1194; SSE42-NEXT: LBB10_10: ## %else11 1195; SSE42-NEXT: testb $32, %al 1196; SSE42-NEXT: jne LBB10_11 1197; SSE42-NEXT: LBB10_12: ## %else14 1198; SSE42-NEXT: testb $64, %al 1199; SSE42-NEXT: jne LBB10_13 1200; SSE42-NEXT: LBB10_14: ## %else17 1201; SSE42-NEXT: testb $-128, %al 1202; SSE42-NEXT: jne LBB10_15 1203; SSE42-NEXT: LBB10_16: ## %else20 1204; SSE42-NEXT: retq 1205; SSE42-NEXT: LBB10_1: ## %cond.load 1206; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1207; SSE42-NEXT: testb $2, %al 1208; SSE42-NEXT: je LBB10_4 1209; SSE42-NEXT: LBB10_3: ## %cond.load1 1210; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] 1211; SSE42-NEXT: testb $4, %al 1212; SSE42-NEXT: je LBB10_6 1213; SSE42-NEXT: LBB10_5: ## %cond.load4 1214; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] 1215; SSE42-NEXT: testb $8, %al 1216; SSE42-NEXT: je LBB10_8 1217; SSE42-NEXT: LBB10_7: ## %cond.load7 1218; SSE42-NEXT: insertps 
{{.*#+}} xmm0 = xmm0[0,1,2],mem[0] 1219; SSE42-NEXT: testb $16, %al 1220; SSE42-NEXT: je LBB10_10 1221; SSE42-NEXT: LBB10_9: ## %cond.load10 1222; SSE42-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1223; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] 1224; SSE42-NEXT: testb $32, %al 1225; SSE42-NEXT: je LBB10_12 1226; SSE42-NEXT: LBB10_11: ## %cond.load13 1227; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] 1228; SSE42-NEXT: testb $64, %al 1229; SSE42-NEXT: je LBB10_14 1230; SSE42-NEXT: LBB10_13: ## %cond.load16 1231; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] 1232; SSE42-NEXT: testb $-128, %al 1233; SSE42-NEXT: je LBB10_16 1234; SSE42-NEXT: LBB10_15: ## %cond.load19 1235; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] 1236; SSE42-NEXT: retq 1237; 1238; AVX1-LABEL: load_v8f32_v8i1_zero: 1239; AVX1: ## %bb.0: 1240; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1241; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 1242; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1243; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 1244; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1245; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0 1246; AVX1-NEXT: retq 1247; 1248; AVX2-LABEL: load_v8f32_v8i1_zero: 1249; AVX2: ## %bb.0: 1250; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1251; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 1252; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0 1253; AVX2-NEXT: retq 1254; 1255; AVX512F-LABEL: load_v8f32_v8i1_zero: 1256; AVX512F: ## %bb.0: 1257; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 1258; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 1259; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 1260; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} 1261; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 1262; AVX512F-NEXT: retq 1263; 1264; AVX512VLDQ-LABEL: load_v8f32_v8i1_zero: 1265; AVX512VLDQ: ## 
%bb.0: 1266; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0 1267; AVX512VLDQ-NEXT: vpslld $31, %ymm0, %ymm0 1268; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1 1269; AVX512VLDQ-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z} 1270; AVX512VLDQ-NEXT: retq 1271; 1272; AVX512VLBW-LABEL: load_v8f32_v8i1_zero: 1273; AVX512VLBW: ## %bb.0: 1274; AVX512VLBW-NEXT: vpsllw $15, %xmm0, %xmm0 1275; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1 1276; AVX512VLBW-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z} 1277; AVX512VLBW-NEXT: retq 1278; 1279; X86-AVX512-LABEL: load_v8f32_v8i1_zero: 1280; X86-AVX512: ## %bb.0: 1281; X86-AVX512-NEXT: vpsllw $15, %xmm0, %xmm0 1282; X86-AVX512-NEXT: vpmovw2m %xmm0, %k1 1283; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 1284; X86-AVX512-NEXT: vmovaps (%eax), %ymm0 {%k1} {z} 1285; X86-AVX512-NEXT: retl 1286 %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer) 1287 ret <8 x float> %res 1288} 1289 1290define <8 x float> @load_v8f32_v8i32(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) { 1291; SSE2-LABEL: load_v8f32_v8i32: 1292; SSE2: ## %bb.0: 1293; SSE2-NEXT: pxor %xmm4, %xmm4 1294; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 1295; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 1296; SSE2-NEXT: packssdw %xmm1, %xmm0 1297; SSE2-NEXT: packsswb %xmm0, %xmm0 1298; SSE2-NEXT: pmovmskb %xmm0, %eax 1299; SSE2-NEXT: testb $1, %al 1300; SSE2-NEXT: jne LBB11_1 1301; SSE2-NEXT: ## %bb.2: ## %else 1302; SSE2-NEXT: testb $2, %al 1303; SSE2-NEXT: jne LBB11_3 1304; SSE2-NEXT: LBB11_4: ## %else2 1305; SSE2-NEXT: testb $4, %al 1306; SSE2-NEXT: jne LBB11_5 1307; SSE2-NEXT: LBB11_6: ## %else5 1308; SSE2-NEXT: testb $8, %al 1309; SSE2-NEXT: jne LBB11_7 1310; SSE2-NEXT: LBB11_8: ## %else8 1311; SSE2-NEXT: testb $16, %al 1312; SSE2-NEXT: jne LBB11_9 1313; SSE2-NEXT: LBB11_10: ## %else11 1314; SSE2-NEXT: testb $32, %al 1315; SSE2-NEXT: jne LBB11_11 1316; SSE2-NEXT: LBB11_12: ## %else14 1317; SSE2-NEXT: testb $64, %al 1318; SSE2-NEXT: jne LBB11_13 1319; SSE2-NEXT: LBB11_14: ## 
%else17 1320; SSE2-NEXT: testb $-128, %al 1321; SSE2-NEXT: je LBB11_16 1322; SSE2-NEXT: LBB11_15: ## %cond.load19 1323; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1324; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] 1325; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] 1326; SSE2-NEXT: LBB11_16: ## %else20 1327; SSE2-NEXT: movaps %xmm2, %xmm0 1328; SSE2-NEXT: movaps %xmm3, %xmm1 1329; SSE2-NEXT: retq 1330; SSE2-NEXT: LBB11_1: ## %cond.load 1331; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1332; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] 1333; SSE2-NEXT: testb $2, %al 1334; SSE2-NEXT: je LBB11_4 1335; SSE2-NEXT: LBB11_3: ## %cond.load1 1336; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1337; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] 1338; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] 1339; SSE2-NEXT: movaps %xmm0, %xmm2 1340; SSE2-NEXT: testb $4, %al 1341; SSE2-NEXT: je LBB11_6 1342; SSE2-NEXT: LBB11_5: ## %cond.load4 1343; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1344; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[3,0] 1345; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] 1346; SSE2-NEXT: testb $8, %al 1347; SSE2-NEXT: je LBB11_8 1348; SSE2-NEXT: LBB11_7: ## %cond.load7 1349; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1350; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] 1351; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] 1352; SSE2-NEXT: testb $16, %al 1353; SSE2-NEXT: je LBB11_10 1354; SSE2-NEXT: LBB11_9: ## %cond.load10 1355; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1356; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] 1357; SSE2-NEXT: testb $32, %al 1358; SSE2-NEXT: je LBB11_12 1359; SSE2-NEXT: LBB11_11: ## %cond.load13 1360; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1361; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] 1362; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,3] 1363; SSE2-NEXT: movaps 
%xmm0, %xmm3 1364; SSE2-NEXT: testb $64, %al 1365; SSE2-NEXT: je LBB11_14 1366; SSE2-NEXT: LBB11_13: ## %cond.load16 1367; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1368; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[3,0] 1369; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] 1370; SSE2-NEXT: testb $-128, %al 1371; SSE2-NEXT: jne LBB11_15 1372; SSE2-NEXT: jmp LBB11_16 1373; 1374; SSE42-LABEL: load_v8f32_v8i32: 1375; SSE42: ## %bb.0: 1376; SSE42-NEXT: pxor %xmm4, %xmm4 1377; SSE42-NEXT: pcmpeqd %xmm4, %xmm1 1378; SSE42-NEXT: pcmpeqd %xmm4, %xmm0 1379; SSE42-NEXT: packssdw %xmm1, %xmm0 1380; SSE42-NEXT: packsswb %xmm0, %xmm0 1381; SSE42-NEXT: pmovmskb %xmm0, %eax 1382; SSE42-NEXT: testb $1, %al 1383; SSE42-NEXT: jne LBB11_1 1384; SSE42-NEXT: ## %bb.2: ## %else 1385; SSE42-NEXT: testb $2, %al 1386; SSE42-NEXT: jne LBB11_3 1387; SSE42-NEXT: LBB11_4: ## %else2 1388; SSE42-NEXT: testb $4, %al 1389; SSE42-NEXT: jne LBB11_5 1390; SSE42-NEXT: LBB11_6: ## %else5 1391; SSE42-NEXT: testb $8, %al 1392; SSE42-NEXT: jne LBB11_7 1393; SSE42-NEXT: LBB11_8: ## %else8 1394; SSE42-NEXT: testb $16, %al 1395; SSE42-NEXT: jne LBB11_9 1396; SSE42-NEXT: LBB11_10: ## %else11 1397; SSE42-NEXT: testb $32, %al 1398; SSE42-NEXT: jne LBB11_11 1399; SSE42-NEXT: LBB11_12: ## %else14 1400; SSE42-NEXT: testb $64, %al 1401; SSE42-NEXT: jne LBB11_13 1402; SSE42-NEXT: LBB11_14: ## %else17 1403; SSE42-NEXT: testb $-128, %al 1404; SSE42-NEXT: je LBB11_16 1405; SSE42-NEXT: LBB11_15: ## %cond.load19 1406; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] 1407; SSE42-NEXT: LBB11_16: ## %else20 1408; SSE42-NEXT: movaps %xmm2, %xmm0 1409; SSE42-NEXT: movaps %xmm3, %xmm1 1410; SSE42-NEXT: retq 1411; SSE42-NEXT: LBB11_1: ## %cond.load 1412; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1413; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] 1414; SSE42-NEXT: testb $2, %al 1415; SSE42-NEXT: je LBB11_4 1416; SSE42-NEXT: LBB11_3: ## %cond.load1 1417; SSE42-NEXT: 
insertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] 1418; SSE42-NEXT: testb $4, %al 1419; SSE42-NEXT: je LBB11_6 1420; SSE42-NEXT: LBB11_5: ## %cond.load4 1421; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] 1422; SSE42-NEXT: testb $8, %al 1423; SSE42-NEXT: je LBB11_8 1424; SSE42-NEXT: LBB11_7: ## %cond.load7 1425; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] 1426; SSE42-NEXT: testb $16, %al 1427; SSE42-NEXT: je LBB11_10 1428; SSE42-NEXT: LBB11_9: ## %cond.load10 1429; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1430; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3,4,5,6,7] 1431; SSE42-NEXT: testb $32, %al 1432; SSE42-NEXT: je LBB11_12 1433; SSE42-NEXT: LBB11_11: ## %cond.load13 1434; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] 1435; SSE42-NEXT: testb $64, %al 1436; SSE42-NEXT: je LBB11_14 1437; SSE42-NEXT: LBB11_13: ## %cond.load16 1438; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] 1439; SSE42-NEXT: testb $-128, %al 1440; SSE42-NEXT: jne LBB11_15 1441; SSE42-NEXT: jmp LBB11_16 1442; 1443; AVX1-LABEL: load_v8f32_v8i32: 1444; AVX1: ## %bb.0: 1445; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1446; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 1447; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 1448; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 1449; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1450; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 1451; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 1452; AVX1-NEXT: retq 1453; 1454; AVX2-LABEL: load_v8f32_v8i32: 1455; AVX2: ## %bb.0: 1456; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1457; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 1458; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 1459; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 1460; AVX2-NEXT: retq 1461; 1462; AVX512F-LABEL: load_v8f32_v8i32: 1463; AVX512F: ## %bb.0: 1464; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 1465; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 1466; AVX512F-NEXT: vptestnmd 
%zmm0, %zmm0, %k0 1467; AVX512F-NEXT: kshiftlw $8, %k0, %k0 1468; AVX512F-NEXT: kshiftrw $8, %k0, %k1 1469; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1} 1470; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 1471; AVX512F-NEXT: retq 1472; 1473; AVX512VL-LABEL: load_v8f32_v8i32: 1474; AVX512VL: ## %bb.0: 1475; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 1476; AVX512VL-NEXT: vblendmps (%rdi), %ymm1, %ymm0 {%k1} 1477; AVX512VL-NEXT: retq 1478; 1479; X86-AVX512-LABEL: load_v8f32_v8i32: 1480; X86-AVX512: ## %bb.0: 1481; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 1482; X86-AVX512-NEXT: vptestnmd %ymm0, %ymm0, %k1 1483; X86-AVX512-NEXT: vblendmps (%eax), %ymm1, %ymm0 {%k1} 1484; X86-AVX512-NEXT: retl 1485 %mask = icmp eq <8 x i32> %trigger, zeroinitializer 1486 %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 32, <8 x i1> %mask, <8 x float> %dst) 1487 ret <8 x float> %res 1488} 1489 1490 1491; 1492; vXf64 1493; 1494 1495define <1 x i64> @load_v1i64_v1i64(<1 x i64> %trigger, ptr %addr, <1 x i64> %dst) { 1496; SSE-LABEL: load_v1i64_v1i64: 1497; SSE: ## %bb.0: 1498; SSE-NEXT: testq %rdi, %rdi 1499; SSE-NEXT: jne LBB12_1 1500; SSE-NEXT: ## %bb.2: ## %cond.load 1501; SSE-NEXT: movq (%rsi), %rax 1502; SSE-NEXT: retq 1503; SSE-NEXT: LBB12_1: 1504; SSE-NEXT: movq %rdx, %rax 1505; SSE-NEXT: retq 1506; 1507; AVX-LABEL: load_v1i64_v1i64: 1508; AVX: ## %bb.0: 1509; AVX-NEXT: testq %rdi, %rdi 1510; AVX-NEXT: jne LBB12_1 1511; AVX-NEXT: ## %bb.2: ## %cond.load 1512; AVX-NEXT: movq (%rsi), %rax 1513; AVX-NEXT: retq 1514; AVX-NEXT: LBB12_1: 1515; AVX-NEXT: movq %rdx, %rax 1516; AVX-NEXT: retq 1517; 1518; X86-AVX512-LABEL: load_v1i64_v1i64: 1519; X86-AVX512: ## %bb.0: 1520; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 1521; X86-AVX512-NEXT: orl {{[0-9]+}}(%esp), %eax 1522; X86-AVX512-NEXT: jne LBB12_1 1523; X86-AVX512-NEXT: ## %bb.2: ## %cond.load 1524; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx 1525; X86-AVX512-NEXT: movl (%ecx), %eax 1526; 
X86-AVX512-NEXT: movl 4(%ecx), %edx 1527; X86-AVX512-NEXT: retl 1528; X86-AVX512-NEXT: LBB12_1: 1529; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx 1530; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 1531; X86-AVX512-NEXT: retl 1532 %mask = icmp eq <1 x i64> %trigger, zeroinitializer 1533 %res = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr %addr, i32 4, <1 x i1> %mask, <1 x i64> %dst) 1534 ret <1 x i64> %res 1535} 1536 1537define <2 x i64> @load_v2i64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x i64> %dst) { 1538; SSE2-LABEL: load_v2i64_v2i64: 1539; SSE2: ## %bb.0: 1540; SSE2-NEXT: pxor %xmm2, %xmm2 1541; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 1542; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] 1543; SSE2-NEXT: pand %xmm2, %xmm0 1544; SSE2-NEXT: movmskpd %xmm0, %eax 1545; SSE2-NEXT: testb $1, %al 1546; SSE2-NEXT: jne LBB13_1 1547; SSE2-NEXT: ## %bb.2: ## %else 1548; SSE2-NEXT: testb $2, %al 1549; SSE2-NEXT: jne LBB13_3 1550; SSE2-NEXT: LBB13_4: ## %else2 1551; SSE2-NEXT: movaps %xmm1, %xmm0 1552; SSE2-NEXT: retq 1553; SSE2-NEXT: LBB13_1: ## %cond.load 1554; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] 1555; SSE2-NEXT: testb $2, %al 1556; SSE2-NEXT: je LBB13_4 1557; SSE2-NEXT: LBB13_3: ## %cond.load1 1558; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 1559; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 1560; SSE2-NEXT: movaps %xmm1, %xmm0 1561; SSE2-NEXT: retq 1562; 1563; SSE42-LABEL: load_v2i64_v2i64: 1564; SSE42: ## %bb.0: 1565; SSE42-NEXT: pxor %xmm2, %xmm2 1566; SSE42-NEXT: pcmpeqq %xmm0, %xmm2 1567; SSE42-NEXT: movmskpd %xmm2, %eax 1568; SSE42-NEXT: testb $1, %al 1569; SSE42-NEXT: jne LBB13_1 1570; SSE42-NEXT: ## %bb.2: ## %else 1571; SSE42-NEXT: testb $2, %al 1572; SSE42-NEXT: jne LBB13_3 1573; SSE42-NEXT: LBB13_4: ## %else2 1574; SSE42-NEXT: movdqa %xmm1, %xmm0 1575; SSE42-NEXT: retq 1576; SSE42-NEXT: LBB13_1: ## %cond.load 1577; SSE42-NEXT: pinsrq $0, (%rdi), %xmm1 1578; SSE42-NEXT: testb $2, %al 1579; SSE42-NEXT: je LBB13_4 1580; SSE42-NEXT: LBB13_3: ## 
%cond.load1 1581; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm1 1582; SSE42-NEXT: movdqa %xmm1, %xmm0 1583; SSE42-NEXT: retq 1584; 1585; AVX1-LABEL: load_v2i64_v2i64: 1586; AVX1: ## %bb.0: 1587; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1588; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 1589; AVX1-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 1590; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 1591; AVX1-NEXT: retq 1592; 1593; AVX2-LABEL: load_v2i64_v2i64: 1594; AVX2: ## %bb.0: 1595; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1596; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 1597; AVX2-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm2 1598; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 1599; AVX2-NEXT: retq 1600; 1601; AVX512F-LABEL: load_v2i64_v2i64: 1602; AVX512F: ## %bb.0: 1603; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 1604; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 1605; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 1606; AVX512F-NEXT: kshiftlw $14, %k0, %k0 1607; AVX512F-NEXT: kshiftrw $14, %k0, %k1 1608; AVX512F-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1} 1609; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 1610; AVX512F-NEXT: vzeroupper 1611; AVX512F-NEXT: retq 1612; 1613; AVX512VL-LABEL: load_v2i64_v2i64: 1614; AVX512VL: ## %bb.0: 1615; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1 1616; AVX512VL-NEXT: vpblendmq (%rdi), %xmm1, %xmm0 {%k1} 1617; AVX512VL-NEXT: retq 1618; 1619; X86-AVX512-LABEL: load_v2i64_v2i64: 1620; X86-AVX512: ## %bb.0: 1621; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 1622; X86-AVX512-NEXT: vptestnmq %xmm0, %xmm0, %k1 1623; X86-AVX512-NEXT: vpblendmq (%eax), %xmm1, %xmm0 {%k1} 1624; X86-AVX512-NEXT: retl 1625 %mask = icmp eq <2 x i64> %trigger, zeroinitializer 1626 %res = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i64> %dst) 1627 ret <2 x i64> %res 1628} 1629 1630define <4 x i64> @load_v4i64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x i64> %dst) { 1631; SSE2-LABEL: load_v4i64_v4i64: 1632; SSE2: ## %bb.0: 1633; 
SSE2-NEXT: pxor %xmm4, %xmm4 1634; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 1635; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 1636; SSE2-NEXT: movdqa %xmm0, %xmm4 1637; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm1[1,3] 1638; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1639; SSE2-NEXT: andps %xmm4, %xmm0 1640; SSE2-NEXT: movmskps %xmm0, %eax 1641; SSE2-NEXT: testb $1, %al 1642; SSE2-NEXT: jne LBB14_1 1643; SSE2-NEXT: ## %bb.2: ## %else 1644; SSE2-NEXT: testb $2, %al 1645; SSE2-NEXT: jne LBB14_3 1646; SSE2-NEXT: LBB14_4: ## %else2 1647; SSE2-NEXT: testb $4, %al 1648; SSE2-NEXT: jne LBB14_5 1649; SSE2-NEXT: LBB14_6: ## %else5 1650; SSE2-NEXT: testb $8, %al 1651; SSE2-NEXT: je LBB14_8 1652; SSE2-NEXT: LBB14_7: ## %cond.load7 1653; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 1654; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] 1655; SSE2-NEXT: LBB14_8: ## %else8 1656; SSE2-NEXT: movaps %xmm2, %xmm0 1657; SSE2-NEXT: movaps %xmm3, %xmm1 1658; SSE2-NEXT: retq 1659; SSE2-NEXT: LBB14_1: ## %cond.load 1660; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] 1661; SSE2-NEXT: testb $2, %al 1662; SSE2-NEXT: je LBB14_4 1663; SSE2-NEXT: LBB14_3: ## %cond.load1 1664; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 1665; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 1666; SSE2-NEXT: testb $4, %al 1667; SSE2-NEXT: je LBB14_6 1668; SSE2-NEXT: LBB14_5: ## %cond.load4 1669; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] 1670; SSE2-NEXT: testb $8, %al 1671; SSE2-NEXT: jne LBB14_7 1672; SSE2-NEXT: jmp LBB14_8 1673; 1674; SSE42-LABEL: load_v4i64_v4i64: 1675; SSE42: ## %bb.0: 1676; SSE42-NEXT: pxor %xmm4, %xmm4 1677; SSE42-NEXT: pcmpeqq %xmm4, %xmm1 1678; SSE42-NEXT: pcmpeqq %xmm4, %xmm0 1679; SSE42-NEXT: packssdw %xmm1, %xmm0 1680; SSE42-NEXT: movmskps %xmm0, %eax 1681; SSE42-NEXT: testb $1, %al 1682; SSE42-NEXT: jne LBB14_1 1683; SSE42-NEXT: ## %bb.2: ## %else 1684; SSE42-NEXT: testb $2, %al 1685; SSE42-NEXT: jne LBB14_3 1686; SSE42-NEXT: LBB14_4: ## %else2 1687; SSE42-NEXT: testb 
$4, %al 1688; SSE42-NEXT: jne LBB14_5 1689; SSE42-NEXT: LBB14_6: ## %else5 1690; SSE42-NEXT: testb $8, %al 1691; SSE42-NEXT: je LBB14_8 1692; SSE42-NEXT: LBB14_7: ## %cond.load7 1693; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm3 1694; SSE42-NEXT: LBB14_8: ## %else8 1695; SSE42-NEXT: movdqa %xmm2, %xmm0 1696; SSE42-NEXT: movdqa %xmm3, %xmm1 1697; SSE42-NEXT: retq 1698; SSE42-NEXT: LBB14_1: ## %cond.load 1699; SSE42-NEXT: pinsrq $0, (%rdi), %xmm2 1700; SSE42-NEXT: testb $2, %al 1701; SSE42-NEXT: je LBB14_4 1702; SSE42-NEXT: LBB14_3: ## %cond.load1 1703; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm2 1704; SSE42-NEXT: testb $4, %al 1705; SSE42-NEXT: je LBB14_6 1706; SSE42-NEXT: LBB14_5: ## %cond.load4 1707; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm3 1708; SSE42-NEXT: testb $8, %al 1709; SSE42-NEXT: jne LBB14_7 1710; SSE42-NEXT: jmp LBB14_8 1711; 1712; AVX1-LABEL: load_v4i64_v4i64: 1713; AVX1: ## %bb.0: 1714; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1715; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 1716; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2 1717; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 1718; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1719; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 1720; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 1721; AVX1-NEXT: retq 1722; 1723; AVX2-LABEL: load_v4i64_v4i64: 1724; AVX2: ## %bb.0: 1725; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1726; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 1727; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm2 1728; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 1729; AVX2-NEXT: retq 1730; 1731; AVX512F-LABEL: load_v4i64_v4i64: 1732; AVX512F: ## %bb.0: 1733; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 1734; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 1735; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 1736; AVX512F-NEXT: kshiftlw $12, %k0, %k0 1737; AVX512F-NEXT: kshiftrw $12, %k0, %k1 1738; AVX512F-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1} 1739; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 1740; 
AVX512F-NEXT: retq 1741; 1742; AVX512VL-LABEL: load_v4i64_v4i64: 1743; AVX512VL: ## %bb.0: 1744; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k1 1745; AVX512VL-NEXT: vpblendmq (%rdi), %ymm1, %ymm0 {%k1} 1746; AVX512VL-NEXT: retq 1747; 1748; X86-AVX512-LABEL: load_v4i64_v4i64: 1749; X86-AVX512: ## %bb.0: 1750; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 1751; X86-AVX512-NEXT: vptestnmq %ymm0, %ymm0, %k1 1752; X86-AVX512-NEXT: vpblendmq (%eax), %ymm1, %ymm0 {%k1} 1753; X86-AVX512-NEXT: retl 1754 %mask = icmp eq <4 x i64> %trigger, zeroinitializer 1755 %res = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i64> %dst) 1756 ret <4 x i64> %res 1757} 1758 1759define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i64> %dst) { 1760; SSE2-LABEL: load_v8i64_v8i16: 1761; SSE2: ## %bb.0: 1762; SSE2-NEXT: pxor %xmm5, %xmm5 1763; SSE2-NEXT: pcmpeqw %xmm0, %xmm5 1764; SSE2-NEXT: packsswb %xmm5, %xmm5 1765; SSE2-NEXT: pmovmskb %xmm5, %eax 1766; SSE2-NEXT: testb $1, %al 1767; SSE2-NEXT: jne LBB15_1 1768; SSE2-NEXT: ## %bb.2: ## %else 1769; SSE2-NEXT: testb $2, %al 1770; SSE2-NEXT: jne LBB15_3 1771; SSE2-NEXT: LBB15_4: ## %else2 1772; SSE2-NEXT: testb $4, %al 1773; SSE2-NEXT: jne LBB15_5 1774; SSE2-NEXT: LBB15_6: ## %else5 1775; SSE2-NEXT: testb $8, %al 1776; SSE2-NEXT: jne LBB15_7 1777; SSE2-NEXT: LBB15_8: ## %else8 1778; SSE2-NEXT: testb $16, %al 1779; SSE2-NEXT: jne LBB15_9 1780; SSE2-NEXT: LBB15_10: ## %else11 1781; SSE2-NEXT: testb $32, %al 1782; SSE2-NEXT: jne LBB15_11 1783; SSE2-NEXT: LBB15_12: ## %else14 1784; SSE2-NEXT: testb $64, %al 1785; SSE2-NEXT: jne LBB15_13 1786; SSE2-NEXT: LBB15_14: ## %else17 1787; SSE2-NEXT: testb $-128, %al 1788; SSE2-NEXT: je LBB15_16 1789; SSE2-NEXT: LBB15_15: ## %cond.load19 1790; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 1791; SSE2-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] 1792; SSE2-NEXT: LBB15_16: ## %else20 1793; SSE2-NEXT: movaps %xmm1, %xmm0 1794; SSE2-NEXT: movaps %xmm2, %xmm1 1795; 
SSE2-NEXT: movaps %xmm3, %xmm2 1796; SSE2-NEXT: movaps %xmm4, %xmm3 1797; SSE2-NEXT: retq 1798; SSE2-NEXT: LBB15_1: ## %cond.load 1799; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] 1800; SSE2-NEXT: testb $2, %al 1801; SSE2-NEXT: je LBB15_4 1802; SSE2-NEXT: LBB15_3: ## %cond.load1 1803; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 1804; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 1805; SSE2-NEXT: testb $4, %al 1806; SSE2-NEXT: je LBB15_6 1807; SSE2-NEXT: LBB15_5: ## %cond.load4 1808; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] 1809; SSE2-NEXT: testb $8, %al 1810; SSE2-NEXT: je LBB15_8 1811; SSE2-NEXT: LBB15_7: ## %cond.load7 1812; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 1813; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 1814; SSE2-NEXT: testb $16, %al 1815; SSE2-NEXT: je LBB15_10 1816; SSE2-NEXT: LBB15_9: ## %cond.load10 1817; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] 1818; SSE2-NEXT: testb $32, %al 1819; SSE2-NEXT: je LBB15_12 1820; SSE2-NEXT: LBB15_11: ## %cond.load13 1821; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 1822; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] 1823; SSE2-NEXT: testb $64, %al 1824; SSE2-NEXT: je LBB15_14 1825; SSE2-NEXT: LBB15_13: ## %cond.load16 1826; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] 1827; SSE2-NEXT: testb $-128, %al 1828; SSE2-NEXT: jne LBB15_15 1829; SSE2-NEXT: jmp LBB15_16 1830; 1831; SSE42-LABEL: load_v8i64_v8i16: 1832; SSE42: ## %bb.0: 1833; SSE42-NEXT: pxor %xmm5, %xmm5 1834; SSE42-NEXT: pcmpeqw %xmm0, %xmm5 1835; SSE42-NEXT: packsswb %xmm5, %xmm5 1836; SSE42-NEXT: pmovmskb %xmm5, %eax 1837; SSE42-NEXT: testb $1, %al 1838; SSE42-NEXT: jne LBB15_1 1839; SSE42-NEXT: ## %bb.2: ## %else 1840; SSE42-NEXT: testb $2, %al 1841; SSE42-NEXT: jne LBB15_3 1842; SSE42-NEXT: LBB15_4: ## %else2 1843; SSE42-NEXT: testb $4, %al 1844; SSE42-NEXT: jne LBB15_5 1845; SSE42-NEXT: LBB15_6: ## %else5 1846; SSE42-NEXT: testb $8, %al 1847; SSE42-NEXT: jne LBB15_7 1848; SSE42-NEXT: 
LBB15_8: ## %else8 1849; SSE42-NEXT: testb $16, %al 1850; SSE42-NEXT: jne LBB15_9 1851; SSE42-NEXT: LBB15_10: ## %else11 1852; SSE42-NEXT: testb $32, %al 1853; SSE42-NEXT: jne LBB15_11 1854; SSE42-NEXT: LBB15_12: ## %else14 1855; SSE42-NEXT: testb $64, %al 1856; SSE42-NEXT: jne LBB15_13 1857; SSE42-NEXT: LBB15_14: ## %else17 1858; SSE42-NEXT: testb $-128, %al 1859; SSE42-NEXT: je LBB15_16 1860; SSE42-NEXT: LBB15_15: ## %cond.load19 1861; SSE42-NEXT: pinsrq $1, 56(%rdi), %xmm4 1862; SSE42-NEXT: LBB15_16: ## %else20 1863; SSE42-NEXT: movdqa %xmm1, %xmm0 1864; SSE42-NEXT: movdqa %xmm2, %xmm1 1865; SSE42-NEXT: movdqa %xmm3, %xmm2 1866; SSE42-NEXT: movdqa %xmm4, %xmm3 1867; SSE42-NEXT: retq 1868; SSE42-NEXT: LBB15_1: ## %cond.load 1869; SSE42-NEXT: pinsrq $0, (%rdi), %xmm1 1870; SSE42-NEXT: testb $2, %al 1871; SSE42-NEXT: je LBB15_4 1872; SSE42-NEXT: LBB15_3: ## %cond.load1 1873; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm1 1874; SSE42-NEXT: testb $4, %al 1875; SSE42-NEXT: je LBB15_6 1876; SSE42-NEXT: LBB15_5: ## %cond.load4 1877; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm2 1878; SSE42-NEXT: testb $8, %al 1879; SSE42-NEXT: je LBB15_8 1880; SSE42-NEXT: LBB15_7: ## %cond.load7 1881; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm2 1882; SSE42-NEXT: testb $16, %al 1883; SSE42-NEXT: je LBB15_10 1884; SSE42-NEXT: LBB15_9: ## %cond.load10 1885; SSE42-NEXT: pinsrq $0, 32(%rdi), %xmm3 1886; SSE42-NEXT: testb $32, %al 1887; SSE42-NEXT: je LBB15_12 1888; SSE42-NEXT: LBB15_11: ## %cond.load13 1889; SSE42-NEXT: pinsrq $1, 40(%rdi), %xmm3 1890; SSE42-NEXT: testb $64, %al 1891; SSE42-NEXT: je LBB15_14 1892; SSE42-NEXT: LBB15_13: ## %cond.load16 1893; SSE42-NEXT: pinsrq $0, 48(%rdi), %xmm4 1894; SSE42-NEXT: testb $-128, %al 1895; SSE42-NEXT: jne LBB15_15 1896; SSE42-NEXT: jmp LBB15_16 1897; 1898; AVX1-LABEL: load_v8i64_v8i16: 1899; AVX1: ## %bb.0: 1900; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 1901; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 1902; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 1903; AVX1-NEXT: 
vpmovsxwq %xmm3, %xmm5 1904; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] 1905; AVX1-NEXT: vpmovsxwq %xmm3, %xmm3 1906; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 1907; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 1908; AVX1-NEXT: vpmovsxwq %xmm0, %xmm4 1909; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1910; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 1911; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 1912; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4 1913; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0 1914; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1 1915; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 1916; AVX1-NEXT: retq 1917; 1918; AVX2-LABEL: load_v8i64_v8i16: 1919; AVX2: ## %bb.0: 1920; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 1921; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 1922; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 1923; AVX2-NEXT: vpmovsxwq %xmm3, %ymm3 1924; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 1925; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 1926; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm4 1927; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0 1928; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm3, %ymm1 1929; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 1930; AVX2-NEXT: retq 1931; 1932; AVX512F-LABEL: load_v8i64_v8i16: 1933; AVX512F: ## %bb.0: 1934; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 1935; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 1936; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 1937; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 1938; AVX512F-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1} 1939; AVX512F-NEXT: retq 1940; 1941; AVX512VLDQ-LABEL: load_v8i64_v8i16: 1942; AVX512VLDQ: ## %bb.0: 1943; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 1944; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 1945; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0 1946; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1 1947; AVX512VLDQ-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1} 1948; AVX512VLDQ-NEXT: retq 1949; 1950; AVX512VLBW-LABEL: load_v8i64_v8i16: 1951; AVX512VLBW: ## %bb.0: 1952; AVX512VLBW-NEXT: vptestnmw 
%xmm0, %xmm0, %k1 1953; AVX512VLBW-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1} 1954; AVX512VLBW-NEXT: retq 1955; 1956; X86-AVX512-LABEL: load_v8i64_v8i16: 1957; X86-AVX512: ## %bb.0: 1958; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 1959; X86-AVX512-NEXT: vptestnmw %xmm0, %xmm0, %k1 1960; X86-AVX512-NEXT: vpblendmq (%eax), %zmm1, %zmm0 {%k1} 1961; X86-AVX512-NEXT: retl 1962 %mask = icmp eq <8 x i16> %trigger, zeroinitializer 1963 %res = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x i64> %dst) 1964 ret <8 x i64> %res 1965} 1966 1967define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x i64> %dst) { 1968; SSE2-LABEL: load_v8i64_v8i64: 1969; SSE2: ## %bb.0: 1970; SSE2-NEXT: pxor %xmm8, %xmm8 1971; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 1972; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,0,3,2] 1973; SSE2-NEXT: pand %xmm3, %xmm9 1974; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 1975; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] 1976; SSE2-NEXT: pand %xmm2, %xmm3 1977; SSE2-NEXT: packssdw %xmm9, %xmm3 1978; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 1979; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] 1980; SSE2-NEXT: pand %xmm1, %xmm2 1981; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 1982; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] 1983; SSE2-NEXT: pand %xmm0, %xmm1 1984; SSE2-NEXT: packssdw %xmm2, %xmm1 1985; SSE2-NEXT: packssdw %xmm3, %xmm1 1986; SSE2-NEXT: packsswb %xmm1, %xmm1 1987; SSE2-NEXT: pmovmskb %xmm1, %eax 1988; SSE2-NEXT: testb $1, %al 1989; SSE2-NEXT: jne LBB16_1 1990; SSE2-NEXT: ## %bb.2: ## %else 1991; SSE2-NEXT: testb $2, %al 1992; SSE2-NEXT: jne LBB16_3 1993; SSE2-NEXT: LBB16_4: ## %else2 1994; SSE2-NEXT: testb $4, %al 1995; SSE2-NEXT: jne LBB16_5 1996; SSE2-NEXT: LBB16_6: ## %else5 1997; SSE2-NEXT: testb $8, %al 1998; SSE2-NEXT: jne LBB16_7 1999; SSE2-NEXT: LBB16_8: ## %else8 2000; SSE2-NEXT: testb $16, %al 2001; SSE2-NEXT: jne LBB16_9 2002; SSE2-NEXT: LBB16_10: ## %else11 2003; SSE2-NEXT: testb $32, %al 2004; SSE2-NEXT: jne LBB16_11 
2005; SSE2-NEXT: LBB16_12: ## %else14 2006; SSE2-NEXT: testb $64, %al 2007; SSE2-NEXT: jne LBB16_13 2008; SSE2-NEXT: LBB16_14: ## %else17 2009; SSE2-NEXT: testb $-128, %al 2010; SSE2-NEXT: je LBB16_16 2011; SSE2-NEXT: LBB16_15: ## %cond.load19 2012; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 2013; SSE2-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] 2014; SSE2-NEXT: LBB16_16: ## %else20 2015; SSE2-NEXT: movaps %xmm4, %xmm0 2016; SSE2-NEXT: movaps %xmm5, %xmm1 2017; SSE2-NEXT: movaps %xmm6, %xmm2 2018; SSE2-NEXT: movaps %xmm7, %xmm3 2019; SSE2-NEXT: retq 2020; SSE2-NEXT: LBB16_1: ## %cond.load 2021; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] 2022; SSE2-NEXT: testb $2, %al 2023; SSE2-NEXT: je LBB16_4 2024; SSE2-NEXT: LBB16_3: ## %cond.load1 2025; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 2026; SSE2-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] 2027; SSE2-NEXT: testb $4, %al 2028; SSE2-NEXT: je LBB16_6 2029; SSE2-NEXT: LBB16_5: ## %cond.load4 2030; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] 2031; SSE2-NEXT: testb $8, %al 2032; SSE2-NEXT: je LBB16_8 2033; SSE2-NEXT: LBB16_7: ## %cond.load7 2034; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 2035; SSE2-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] 2036; SSE2-NEXT: testb $16, %al 2037; SSE2-NEXT: je LBB16_10 2038; SSE2-NEXT: LBB16_9: ## %cond.load10 2039; SSE2-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] 2040; SSE2-NEXT: testb $32, %al 2041; SSE2-NEXT: je LBB16_12 2042; SSE2-NEXT: LBB16_11: ## %cond.load13 2043; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 2044; SSE2-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] 2045; SSE2-NEXT: testb $64, %al 2046; SSE2-NEXT: je LBB16_14 2047; SSE2-NEXT: LBB16_13: ## %cond.load16 2048; SSE2-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] 2049; SSE2-NEXT: testb $-128, %al 2050; SSE2-NEXT: jne LBB16_15 2051; SSE2-NEXT: jmp LBB16_16 2052; 2053; SSE42-LABEL: load_v8i64_v8i64: 2054; SSE42: ## %bb.0: 2055; SSE42-NEXT: pxor %xmm8, %xmm8 2056; SSE42-NEXT: pcmpeqq %xmm8, 
%xmm3 2057; SSE42-NEXT: pcmpeqq %xmm8, %xmm2 2058; SSE42-NEXT: packssdw %xmm3, %xmm2 2059; SSE42-NEXT: pcmpeqq %xmm8, %xmm1 2060; SSE42-NEXT: pcmpeqq %xmm8, %xmm0 2061; SSE42-NEXT: packssdw %xmm1, %xmm0 2062; SSE42-NEXT: packssdw %xmm2, %xmm0 2063; SSE42-NEXT: packsswb %xmm0, %xmm0 2064; SSE42-NEXT: pmovmskb %xmm0, %eax 2065; SSE42-NEXT: testb $1, %al 2066; SSE42-NEXT: jne LBB16_1 2067; SSE42-NEXT: ## %bb.2: ## %else 2068; SSE42-NEXT: testb $2, %al 2069; SSE42-NEXT: jne LBB16_3 2070; SSE42-NEXT: LBB16_4: ## %else2 2071; SSE42-NEXT: testb $4, %al 2072; SSE42-NEXT: jne LBB16_5 2073; SSE42-NEXT: LBB16_6: ## %else5 2074; SSE42-NEXT: testb $8, %al 2075; SSE42-NEXT: jne LBB16_7 2076; SSE42-NEXT: LBB16_8: ## %else8 2077; SSE42-NEXT: testb $16, %al 2078; SSE42-NEXT: jne LBB16_9 2079; SSE42-NEXT: LBB16_10: ## %else11 2080; SSE42-NEXT: testb $32, %al 2081; SSE42-NEXT: jne LBB16_11 2082; SSE42-NEXT: LBB16_12: ## %else14 2083; SSE42-NEXT: testb $64, %al 2084; SSE42-NEXT: jne LBB16_13 2085; SSE42-NEXT: LBB16_14: ## %else17 2086; SSE42-NEXT: testb $-128, %al 2087; SSE42-NEXT: je LBB16_16 2088; SSE42-NEXT: LBB16_15: ## %cond.load19 2089; SSE42-NEXT: pinsrq $1, 56(%rdi), %xmm7 2090; SSE42-NEXT: LBB16_16: ## %else20 2091; SSE42-NEXT: movdqa %xmm4, %xmm0 2092; SSE42-NEXT: movdqa %xmm5, %xmm1 2093; SSE42-NEXT: movdqa %xmm6, %xmm2 2094; SSE42-NEXT: movdqa %xmm7, %xmm3 2095; SSE42-NEXT: retq 2096; SSE42-NEXT: LBB16_1: ## %cond.load 2097; SSE42-NEXT: pinsrq $0, (%rdi), %xmm4 2098; SSE42-NEXT: testb $2, %al 2099; SSE42-NEXT: je LBB16_4 2100; SSE42-NEXT: LBB16_3: ## %cond.load1 2101; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm4 2102; SSE42-NEXT: testb $4, %al 2103; SSE42-NEXT: je LBB16_6 2104; SSE42-NEXT: LBB16_5: ## %cond.load4 2105; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm5 2106; SSE42-NEXT: testb $8, %al 2107; SSE42-NEXT: je LBB16_8 2108; SSE42-NEXT: LBB16_7: ## %cond.load7 2109; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm5 2110; SSE42-NEXT: testb $16, %al 2111; SSE42-NEXT: je LBB16_10 2112; 
SSE42-NEXT: LBB16_9: ## %cond.load10 2113; SSE42-NEXT: pinsrq $0, 32(%rdi), %xmm6 2114; SSE42-NEXT: testb $32, %al 2115; SSE42-NEXT: je LBB16_12 2116; SSE42-NEXT: LBB16_11: ## %cond.load13 2117; SSE42-NEXT: pinsrq $1, 40(%rdi), %xmm6 2118; SSE42-NEXT: testb $64, %al 2119; SSE42-NEXT: je LBB16_14 2120; SSE42-NEXT: LBB16_13: ## %cond.load16 2121; SSE42-NEXT: pinsrq $0, 48(%rdi), %xmm7 2122; SSE42-NEXT: testb $-128, %al 2123; SSE42-NEXT: jne LBB16_15 2124; SSE42-NEXT: jmp LBB16_16 2125; 2126; AVX1-LABEL: load_v8i64_v8i64: 2127; AVX1: ## %bb.0: 2128; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 2129; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 2130; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4 2131; AVX1-NEXT: vpcmpeqq %xmm5, %xmm1, %xmm1 2132; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 2133; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 2134; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4 2135; AVX1-NEXT: vpcmpeqq %xmm5, %xmm0, %xmm0 2136; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 2137; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4 2138; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0 2139; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm2 2140; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1 2141; AVX1-NEXT: retq 2142; 2143; AVX2-LABEL: load_v8i64_v8i64: 2144; AVX2: ## %bb.0: 2145; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 2146; AVX2-NEXT: vpcmpeqq %ymm4, %ymm1, %ymm1 2147; AVX2-NEXT: vpcmpeqq %ymm4, %ymm0, %ymm0 2148; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm4 2149; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0 2150; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm1, %ymm2 2151; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1 2152; AVX2-NEXT: retq 2153; 2154; AVX512-LABEL: load_v8i64_v8i64: 2155; AVX512: ## %bb.0: 2156; AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k1 2157; AVX512-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1} 2158; AVX512-NEXT: retq 2159; 2160; X86-AVX512-LABEL: load_v8i64_v8i64: 2161; X86-AVX512: ## %bb.0: 2162; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 2163; X86-AVX512-NEXT: vptestnmq %zmm0, 
%zmm0, %k1 2164; X86-AVX512-NEXT: vpblendmq (%eax), %zmm1, %zmm0 {%k1} 2165; X86-AVX512-NEXT: retl 2166 %mask = icmp eq <8 x i64> %trigger, zeroinitializer 2167 %res = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x i64> %dst) 2168 ret <8 x i64> %res 2169} 2170 2171; 2172; vXi32 2173; 2174 2175define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) { 2176; SSE2-LABEL: load_v2i32_v2i32: 2177; SSE2: ## %bb.0: 2178; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 2179; SSE2-NEXT: pxor %xmm2, %xmm2 2180; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 2181; SSE2-NEXT: movmskpd %xmm2, %eax 2182; SSE2-NEXT: testb $1, %al 2183; SSE2-NEXT: jne LBB17_1 2184; SSE2-NEXT: ## %bb.2: ## %else 2185; SSE2-NEXT: testb $2, %al 2186; SSE2-NEXT: jne LBB17_3 2187; SSE2-NEXT: LBB17_4: ## %else2 2188; SSE2-NEXT: movaps %xmm1, %xmm0 2189; SSE2-NEXT: retq 2190; SSE2-NEXT: LBB17_1: ## %cond.load 2191; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2192; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2193; SSE2-NEXT: testb $2, %al 2194; SSE2-NEXT: je LBB17_4 2195; SSE2-NEXT: LBB17_3: ## %cond.load1 2196; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2197; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2198; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 2199; SSE2-NEXT: movaps %xmm0, %xmm1 2200; SSE2-NEXT: movaps %xmm1, %xmm0 2201; SSE2-NEXT: retq 2202; 2203; SSE42-LABEL: load_v2i32_v2i32: 2204; SSE42: ## %bb.0: 2205; SSE42-NEXT: pxor %xmm2, %xmm2 2206; SSE42-NEXT: pcmpeqd %xmm0, %xmm2 2207; SSE42-NEXT: pmovsxdq %xmm2, %xmm0 2208; SSE42-NEXT: movmskpd %xmm0, %eax 2209; SSE42-NEXT: testb $1, %al 2210; SSE42-NEXT: jne LBB17_1 2211; SSE42-NEXT: ## %bb.2: ## %else 2212; SSE42-NEXT: testb $2, %al 2213; SSE42-NEXT: jne LBB17_3 2214; SSE42-NEXT: LBB17_4: ## %else2 2215; SSE42-NEXT: movdqa %xmm1, %xmm0 2216; SSE42-NEXT: retq 2217; SSE42-NEXT: LBB17_1: ## %cond.load 2218; SSE42-NEXT: pinsrd $0, (%rdi), %xmm1 2219; 
SSE42-NEXT: testb $2, %al 2220; SSE42-NEXT: je LBB17_4 2221; SSE42-NEXT: LBB17_3: ## %cond.load1 2222; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm1 2223; SSE42-NEXT: movdqa %xmm1, %xmm0 2224; SSE42-NEXT: retq 2225; 2226; AVX1-LABEL: load_v2i32_v2i32: 2227; AVX1: ## %bb.0: 2228; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 2229; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 2230; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 2231; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 2232; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 2233; AVX1-NEXT: retq 2234; 2235; AVX2-LABEL: load_v2i32_v2i32: 2236; AVX2: ## %bb.0: 2237; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2238; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 2239; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 2240; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 2241; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 2242; AVX2-NEXT: retq 2243; 2244; AVX512F-LABEL: load_v2i32_v2i32: 2245; AVX512F: ## %bb.0: 2246; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 2247; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 2248; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 2249; AVX512F-NEXT: kshiftlw $14, %k0, %k0 2250; AVX512F-NEXT: kshiftrw $14, %k0, %k1 2251; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1} 2252; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 2253; AVX512F-NEXT: vzeroupper 2254; AVX512F-NEXT: retq 2255; 2256; AVX512VLDQ-LABEL: load_v2i32_v2i32: 2257; AVX512VLDQ: ## %bb.0: 2258; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0 2259; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0 2260; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1 2261; AVX512VLDQ-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1} 2262; AVX512VLDQ-NEXT: retq 2263; 2264; AVX512VLBW-LABEL: load_v2i32_v2i32: 2265; AVX512VLBW: ## %bb.0: 2266; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0 2267; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0 2268; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1 2269; AVX512VLBW-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1} 2270; AVX512VLBW-NEXT: retq 2271; 2272; 
X86-AVX512-LABEL: load_v2i32_v2i32: 2273; X86-AVX512: ## %bb.0: 2274; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 2275; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0 2276; X86-AVX512-NEXT: kshiftlb $6, %k0, %k0 2277; X86-AVX512-NEXT: kshiftrb $6, %k0, %k1 2278; X86-AVX512-NEXT: vpblendmd (%eax), %xmm1, %xmm0 {%k1} 2279; X86-AVX512-NEXT: retl 2280 %mask = icmp eq <2 x i32> %trigger, zeroinitializer 2281 %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) 2282 ret <2 x i32> %res 2283} 2284 2285define <4 x i32> @load_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) { 2286; SSE2-LABEL: load_v4i32_v4i32: 2287; SSE2: ## %bb.0: 2288; SSE2-NEXT: pxor %xmm2, %xmm2 2289; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 2290; SSE2-NEXT: movmskps %xmm2, %eax 2291; SSE2-NEXT: testb $1, %al 2292; SSE2-NEXT: jne LBB18_1 2293; SSE2-NEXT: ## %bb.2: ## %else 2294; SSE2-NEXT: testb $2, %al 2295; SSE2-NEXT: jne LBB18_3 2296; SSE2-NEXT: LBB18_4: ## %else2 2297; SSE2-NEXT: testb $4, %al 2298; SSE2-NEXT: jne LBB18_5 2299; SSE2-NEXT: LBB18_6: ## %else5 2300; SSE2-NEXT: testb $8, %al 2301; SSE2-NEXT: jne LBB18_7 2302; SSE2-NEXT: LBB18_8: ## %else8 2303; SSE2-NEXT: movaps %xmm1, %xmm0 2304; SSE2-NEXT: retq 2305; SSE2-NEXT: LBB18_1: ## %cond.load 2306; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2307; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2308; SSE2-NEXT: testb $2, %al 2309; SSE2-NEXT: je LBB18_4 2310; SSE2-NEXT: LBB18_3: ## %cond.load1 2311; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2312; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2313; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 2314; SSE2-NEXT: movaps %xmm0, %xmm1 2315; SSE2-NEXT: testb $4, %al 2316; SSE2-NEXT: je LBB18_6 2317; SSE2-NEXT: LBB18_5: ## %cond.load4 2318; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2319; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 2320; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] 
2321; SSE2-NEXT: testb $8, %al 2322; SSE2-NEXT: je LBB18_8 2323; SSE2-NEXT: LBB18_7: ## %cond.load7 2324; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2325; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2326; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] 2327; SSE2-NEXT: movaps %xmm1, %xmm0 2328; SSE2-NEXT: retq 2329; 2330; SSE42-LABEL: load_v4i32_v4i32: 2331; SSE42: ## %bb.0: 2332; SSE42-NEXT: pxor %xmm2, %xmm2 2333; SSE42-NEXT: pcmpeqd %xmm0, %xmm2 2334; SSE42-NEXT: movmskps %xmm2, %eax 2335; SSE42-NEXT: testb $1, %al 2336; SSE42-NEXT: jne LBB18_1 2337; SSE42-NEXT: ## %bb.2: ## %else 2338; SSE42-NEXT: testb $2, %al 2339; SSE42-NEXT: jne LBB18_3 2340; SSE42-NEXT: LBB18_4: ## %else2 2341; SSE42-NEXT: testb $4, %al 2342; SSE42-NEXT: jne LBB18_5 2343; SSE42-NEXT: LBB18_6: ## %else5 2344; SSE42-NEXT: testb $8, %al 2345; SSE42-NEXT: jne LBB18_7 2346; SSE42-NEXT: LBB18_8: ## %else8 2347; SSE42-NEXT: movdqa %xmm1, %xmm0 2348; SSE42-NEXT: retq 2349; SSE42-NEXT: LBB18_1: ## %cond.load 2350; SSE42-NEXT: pinsrd $0, (%rdi), %xmm1 2351; SSE42-NEXT: testb $2, %al 2352; SSE42-NEXT: je LBB18_4 2353; SSE42-NEXT: LBB18_3: ## %cond.load1 2354; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm1 2355; SSE42-NEXT: testb $4, %al 2356; SSE42-NEXT: je LBB18_6 2357; SSE42-NEXT: LBB18_5: ## %cond.load4 2358; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm1 2359; SSE42-NEXT: testb $8, %al 2360; SSE42-NEXT: je LBB18_8 2361; SSE42-NEXT: LBB18_7: ## %cond.load7 2362; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm1 2363; SSE42-NEXT: movdqa %xmm1, %xmm0 2364; SSE42-NEXT: retq 2365; 2366; AVX1-LABEL: load_v4i32_v4i32: 2367; AVX1: ## %bb.0: 2368; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 2369; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 2370; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 2371; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 2372; AVX1-NEXT: retq 2373; 2374; AVX2-LABEL: load_v4i32_v4i32: 2375; AVX2: ## %bb.0: 2376; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2377; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 
2378; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 2379; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 2380; AVX2-NEXT: retq 2381; 2382; AVX512F-LABEL: load_v4i32_v4i32: 2383; AVX512F: ## %bb.0: 2384; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 2385; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 2386; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 2387; AVX512F-NEXT: kshiftlw $12, %k0, %k0 2388; AVX512F-NEXT: kshiftrw $12, %k0, %k1 2389; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1} 2390; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 2391; AVX512F-NEXT: vzeroupper 2392; AVX512F-NEXT: retq 2393; 2394; AVX512VL-LABEL: load_v4i32_v4i32: 2395; AVX512VL: ## %bb.0: 2396; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1 2397; AVX512VL-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1} 2398; AVX512VL-NEXT: retq 2399; 2400; X86-AVX512-LABEL: load_v4i32_v4i32: 2401; X86-AVX512: ## %bb.0: 2402; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 2403; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 2404; X86-AVX512-NEXT: vpblendmd (%eax), %xmm1, %xmm0 {%k1} 2405; X86-AVX512-NEXT: retl 2406 %mask = icmp eq <4 x i32> %trigger, zeroinitializer 2407 %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) 2408 ret <4 x i32> %res 2409} 2410 2411define <8 x i32> @load_v8i32_v8i1(<8 x i1> %mask, ptr %addr, <8 x i32> %dst) { 2412; SSE2-LABEL: load_v8i32_v8i1: 2413; SSE2: ## %bb.0: 2414; SSE2-NEXT: psllw $15, %xmm0 2415; SSE2-NEXT: packsswb %xmm0, %xmm0 2416; SSE2-NEXT: pmovmskb %xmm0, %eax 2417; SSE2-NEXT: testb $1, %al 2418; SSE2-NEXT: jne LBB19_1 2419; SSE2-NEXT: ## %bb.2: ## %else 2420; SSE2-NEXT: testb $2, %al 2421; SSE2-NEXT: jne LBB19_3 2422; SSE2-NEXT: LBB19_4: ## %else2 2423; SSE2-NEXT: testb $4, %al 2424; SSE2-NEXT: jne LBB19_5 2425; SSE2-NEXT: LBB19_6: ## %else5 2426; SSE2-NEXT: testb $8, %al 2427; SSE2-NEXT: jne LBB19_7 2428; SSE2-NEXT: LBB19_8: ## %else8 2429; SSE2-NEXT: testb $16, %al 2430; SSE2-NEXT: jne LBB19_9 2431; 
SSE2-NEXT: LBB19_10: ## %else11 2432; SSE2-NEXT: testb $32, %al 2433; SSE2-NEXT: jne LBB19_11 2434; SSE2-NEXT: LBB19_12: ## %else14 2435; SSE2-NEXT: testb $64, %al 2436; SSE2-NEXT: jne LBB19_13 2437; SSE2-NEXT: LBB19_14: ## %else17 2438; SSE2-NEXT: testb $-128, %al 2439; SSE2-NEXT: je LBB19_16 2440; SSE2-NEXT: LBB19_15: ## %cond.load19 2441; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2442; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] 2443; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] 2444; SSE2-NEXT: LBB19_16: ## %else20 2445; SSE2-NEXT: movaps %xmm1, %xmm0 2446; SSE2-NEXT: movaps %xmm2, %xmm1 2447; SSE2-NEXT: retq 2448; SSE2-NEXT: LBB19_1: ## %cond.load 2449; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2450; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2451; SSE2-NEXT: testb $2, %al 2452; SSE2-NEXT: je LBB19_4 2453; SSE2-NEXT: LBB19_3: ## %cond.load1 2454; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2455; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2456; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 2457; SSE2-NEXT: movaps %xmm0, %xmm1 2458; SSE2-NEXT: testb $4, %al 2459; SSE2-NEXT: je LBB19_6 2460; SSE2-NEXT: LBB19_5: ## %cond.load4 2461; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2462; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 2463; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] 2464; SSE2-NEXT: testb $8, %al 2465; SSE2-NEXT: je LBB19_8 2466; SSE2-NEXT: LBB19_7: ## %cond.load7 2467; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2468; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2469; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] 2470; SSE2-NEXT: testb $16, %al 2471; SSE2-NEXT: je LBB19_10 2472; SSE2-NEXT: LBB19_9: ## %cond.load10 2473; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2474; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] 2475; SSE2-NEXT: testb $32, %al 2476; SSE2-NEXT: je LBB19_12 2477; SSE2-NEXT: 
LBB19_11: ## %cond.load13 2478; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2479; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] 2480; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] 2481; SSE2-NEXT: movaps %xmm0, %xmm2 2482; SSE2-NEXT: testb $64, %al 2483; SSE2-NEXT: je LBB19_14 2484; SSE2-NEXT: LBB19_13: ## %cond.load16 2485; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2486; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[3,0] 2487; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] 2488; SSE2-NEXT: testb $-128, %al 2489; SSE2-NEXT: jne LBB19_15 2490; SSE2-NEXT: jmp LBB19_16 2491; 2492; SSE42-LABEL: load_v8i32_v8i1: 2493; SSE42: ## %bb.0: 2494; SSE42-NEXT: psllw $15, %xmm0 2495; SSE42-NEXT: packsswb %xmm0, %xmm0 2496; SSE42-NEXT: pmovmskb %xmm0, %eax 2497; SSE42-NEXT: testb $1, %al 2498; SSE42-NEXT: jne LBB19_1 2499; SSE42-NEXT: ## %bb.2: ## %else 2500; SSE42-NEXT: testb $2, %al 2501; SSE42-NEXT: jne LBB19_3 2502; SSE42-NEXT: LBB19_4: ## %else2 2503; SSE42-NEXT: testb $4, %al 2504; SSE42-NEXT: jne LBB19_5 2505; SSE42-NEXT: LBB19_6: ## %else5 2506; SSE42-NEXT: testb $8, %al 2507; SSE42-NEXT: jne LBB19_7 2508; SSE42-NEXT: LBB19_8: ## %else8 2509; SSE42-NEXT: testb $16, %al 2510; SSE42-NEXT: jne LBB19_9 2511; SSE42-NEXT: LBB19_10: ## %else11 2512; SSE42-NEXT: testb $32, %al 2513; SSE42-NEXT: jne LBB19_11 2514; SSE42-NEXT: LBB19_12: ## %else14 2515; SSE42-NEXT: testb $64, %al 2516; SSE42-NEXT: jne LBB19_13 2517; SSE42-NEXT: LBB19_14: ## %else17 2518; SSE42-NEXT: testb $-128, %al 2519; SSE42-NEXT: je LBB19_16 2520; SSE42-NEXT: LBB19_15: ## %cond.load19 2521; SSE42-NEXT: pinsrd $3, 28(%rdi), %xmm2 2522; SSE42-NEXT: LBB19_16: ## %else20 2523; SSE42-NEXT: movdqa %xmm1, %xmm0 2524; SSE42-NEXT: movdqa %xmm2, %xmm1 2525; SSE42-NEXT: retq 2526; SSE42-NEXT: LBB19_1: ## %cond.load 2527; SSE42-NEXT: pinsrd $0, (%rdi), %xmm1 2528; SSE42-NEXT: testb $2, %al 2529; SSE42-NEXT: je LBB19_4 2530; SSE42-NEXT: LBB19_3: ## %cond.load1 2531; SSE42-NEXT: 
pinsrd $1, 4(%rdi), %xmm1 2532; SSE42-NEXT: testb $4, %al 2533; SSE42-NEXT: je LBB19_6 2534; SSE42-NEXT: LBB19_5: ## %cond.load4 2535; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm1 2536; SSE42-NEXT: testb $8, %al 2537; SSE42-NEXT: je LBB19_8 2538; SSE42-NEXT: LBB19_7: ## %cond.load7 2539; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm1 2540; SSE42-NEXT: testb $16, %al 2541; SSE42-NEXT: je LBB19_10 2542; SSE42-NEXT: LBB19_9: ## %cond.load10 2543; SSE42-NEXT: pinsrd $0, 16(%rdi), %xmm2 2544; SSE42-NEXT: testb $32, %al 2545; SSE42-NEXT: je LBB19_12 2546; SSE42-NEXT: LBB19_11: ## %cond.load13 2547; SSE42-NEXT: pinsrd $1, 20(%rdi), %xmm2 2548; SSE42-NEXT: testb $64, %al 2549; SSE42-NEXT: je LBB19_14 2550; SSE42-NEXT: LBB19_13: ## %cond.load16 2551; SSE42-NEXT: pinsrd $2, 24(%rdi), %xmm2 2552; SSE42-NEXT: testb $-128, %al 2553; SSE42-NEXT: jne LBB19_15 2554; SSE42-NEXT: jmp LBB19_16 2555; 2556; AVX1-LABEL: load_v8i32_v8i1: 2557; AVX1: ## %bb.0: 2558; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 2559; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 2560; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 2561; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 2562; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 2563; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 2564; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 2565; AVX1-NEXT: retq 2566; 2567; AVX2-LABEL: load_v8i32_v8i1: 2568; AVX2: ## %bb.0: 2569; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2570; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 2571; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2 2572; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 2573; AVX2-NEXT: retq 2574; 2575; AVX512F-LABEL: load_v8i32_v8i1: 2576; AVX512F: ## %bb.0: 2577; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 2578; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 2579; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 2580; AVX512F-NEXT: vptestmq %zmm0, %zmm0, 
%k1 2581; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1} 2582; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 2583; AVX512F-NEXT: retq 2584; 2585; AVX512VLDQ-LABEL: load_v8i32_v8i1: 2586; AVX512VLDQ: ## %bb.0: 2587; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0 2588; AVX512VLDQ-NEXT: vpslld $31, %ymm0, %ymm0 2589; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1 2590; AVX512VLDQ-NEXT: vpblendmd (%rdi), %ymm1, %ymm0 {%k1} 2591; AVX512VLDQ-NEXT: retq 2592; 2593; AVX512VLBW-LABEL: load_v8i32_v8i1: 2594; AVX512VLBW: ## %bb.0: 2595; AVX512VLBW-NEXT: vpsllw $15, %xmm0, %xmm0 2596; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1 2597; AVX512VLBW-NEXT: vpblendmd (%rdi), %ymm1, %ymm0 {%k1} 2598; AVX512VLBW-NEXT: retq 2599; 2600; X86-AVX512-LABEL: load_v8i32_v8i1: 2601; X86-AVX512: ## %bb.0: 2602; X86-AVX512-NEXT: vpsllw $15, %xmm0, %xmm0 2603; X86-AVX512-NEXT: vpmovw2m %xmm0, %k1 2604; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 2605; X86-AVX512-NEXT: vpblendmd (%eax), %ymm1, %ymm0 {%k1} 2606; X86-AVX512-NEXT: retl 2607 %res = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x i32> %dst) 2608 ret <8 x i32> %res 2609} 2610 2611define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, ptr %addr) { 2612; SSE2-LABEL: load_v8i32_v8i1_zero: 2613; SSE2: ## %bb.0: 2614; SSE2-NEXT: psllw $15, %xmm0 2615; SSE2-NEXT: packsswb %xmm0, %xmm0 2616; SSE2-NEXT: pmovmskb %xmm0, %eax 2617; SSE2-NEXT: pxor %xmm0, %xmm0 2618; SSE2-NEXT: testb $1, %al 2619; SSE2-NEXT: xorps %xmm1, %xmm1 2620; SSE2-NEXT: jne LBB20_1 2621; SSE2-NEXT: ## %bb.2: ## %else 2622; SSE2-NEXT: testb $2, %al 2623; SSE2-NEXT: jne LBB20_3 2624; SSE2-NEXT: LBB20_4: ## %else2 2625; SSE2-NEXT: testb $4, %al 2626; SSE2-NEXT: jne LBB20_5 2627; SSE2-NEXT: LBB20_6: ## %else5 2628; SSE2-NEXT: testb $8, %al 2629; SSE2-NEXT: jne LBB20_7 2630; SSE2-NEXT: LBB20_8: ## %else8 2631; SSE2-NEXT: testb $16, %al 2632; SSE2-NEXT: jne LBB20_9 2633; SSE2-NEXT: LBB20_10: ## %else11 2634; SSE2-NEXT: testb $32, %al 2635; SSE2-NEXT: 
jne LBB20_11 2636; SSE2-NEXT: LBB20_12: ## %else14 2637; SSE2-NEXT: testb $64, %al 2638; SSE2-NEXT: jne LBB20_13 2639; SSE2-NEXT: LBB20_14: ## %else17 2640; SSE2-NEXT: testb $-128, %al 2641; SSE2-NEXT: jne LBB20_15 2642; SSE2-NEXT: LBB20_16: ## %else20 2643; SSE2-NEXT: retq 2644; SSE2-NEXT: LBB20_1: ## %cond.load 2645; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2646; SSE2-NEXT: testb $2, %al 2647; SSE2-NEXT: je LBB20_4 2648; SSE2-NEXT: LBB20_3: ## %cond.load1 2649; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 2650; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] 2651; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] 2652; SSE2-NEXT: movaps %xmm2, %xmm0 2653; SSE2-NEXT: testb $4, %al 2654; SSE2-NEXT: je LBB20_6 2655; SSE2-NEXT: LBB20_5: ## %cond.load4 2656; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 2657; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0] 2658; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] 2659; SSE2-NEXT: testb $8, %al 2660; SSE2-NEXT: je LBB20_8 2661; SSE2-NEXT: LBB20_7: ## %cond.load7 2662; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 2663; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] 2664; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] 2665; SSE2-NEXT: testb $16, %al 2666; SSE2-NEXT: je LBB20_10 2667; SSE2-NEXT: LBB20_9: ## %cond.load10 2668; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 2669; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] 2670; SSE2-NEXT: testb $32, %al 2671; SSE2-NEXT: je LBB20_12 2672; SSE2-NEXT: LBB20_11: ## %cond.load13 2673; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 2674; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] 2675; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] 2676; SSE2-NEXT: movaps %xmm2, %xmm1 2677; SSE2-NEXT: testb $64, %al 2678; SSE2-NEXT: je LBB20_14 2679; SSE2-NEXT: LBB20_13: ## %cond.load16 2680; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 2681; SSE2-NEXT: shufps {{.*#+}} xmm2 
= xmm2[0,0],xmm1[3,0] 2682; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] 2683; SSE2-NEXT: testb $-128, %al 2684; SSE2-NEXT: je LBB20_16 2685; SSE2-NEXT: LBB20_15: ## %cond.load19 2686; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 2687; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] 2688; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] 2689; SSE2-NEXT: retq 2690; 2691; SSE42-LABEL: load_v8i32_v8i1_zero: 2692; SSE42: ## %bb.0: 2693; SSE42-NEXT: psllw $15, %xmm0 2694; SSE42-NEXT: packsswb %xmm0, %xmm0 2695; SSE42-NEXT: pmovmskb %xmm0, %eax 2696; SSE42-NEXT: pxor %xmm0, %xmm0 2697; SSE42-NEXT: testb $1, %al 2698; SSE42-NEXT: pxor %xmm1, %xmm1 2699; SSE42-NEXT: jne LBB20_1 2700; SSE42-NEXT: ## %bb.2: ## %else 2701; SSE42-NEXT: testb $2, %al 2702; SSE42-NEXT: jne LBB20_3 2703; SSE42-NEXT: LBB20_4: ## %else2 2704; SSE42-NEXT: testb $4, %al 2705; SSE42-NEXT: jne LBB20_5 2706; SSE42-NEXT: LBB20_6: ## %else5 2707; SSE42-NEXT: testb $8, %al 2708; SSE42-NEXT: jne LBB20_7 2709; SSE42-NEXT: LBB20_8: ## %else8 2710; SSE42-NEXT: testb $16, %al 2711; SSE42-NEXT: jne LBB20_9 2712; SSE42-NEXT: LBB20_10: ## %else11 2713; SSE42-NEXT: testb $32, %al 2714; SSE42-NEXT: jne LBB20_11 2715; SSE42-NEXT: LBB20_12: ## %else14 2716; SSE42-NEXT: testb $64, %al 2717; SSE42-NEXT: jne LBB20_13 2718; SSE42-NEXT: LBB20_14: ## %else17 2719; SSE42-NEXT: testb $-128, %al 2720; SSE42-NEXT: jne LBB20_15 2721; SSE42-NEXT: LBB20_16: ## %else20 2722; SSE42-NEXT: retq 2723; SSE42-NEXT: LBB20_1: ## %cond.load 2724; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2725; SSE42-NEXT: testb $2, %al 2726; SSE42-NEXT: je LBB20_4 2727; SSE42-NEXT: LBB20_3: ## %cond.load1 2728; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm0 2729; SSE42-NEXT: testb $4, %al 2730; SSE42-NEXT: je LBB20_6 2731; SSE42-NEXT: LBB20_5: ## %cond.load4 2732; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm0 2733; SSE42-NEXT: testb $8, %al 2734; SSE42-NEXT: je LBB20_8 2735; SSE42-NEXT: LBB20_7: ## %cond.load7 2736; SSE42-NEXT: 
pinsrd $3, 12(%rdi), %xmm0 2737; SSE42-NEXT: testb $16, %al 2738; SSE42-NEXT: je LBB20_10 2739; SSE42-NEXT: LBB20_9: ## %cond.load10 2740; SSE42-NEXT: pinsrd $0, 16(%rdi), %xmm1 2741; SSE42-NEXT: testb $32, %al 2742; SSE42-NEXT: je LBB20_12 2743; SSE42-NEXT: LBB20_11: ## %cond.load13 2744; SSE42-NEXT: pinsrd $1, 20(%rdi), %xmm1 2745; SSE42-NEXT: testb $64, %al 2746; SSE42-NEXT: je LBB20_14 2747; SSE42-NEXT: LBB20_13: ## %cond.load16 2748; SSE42-NEXT: pinsrd $2, 24(%rdi), %xmm1 2749; SSE42-NEXT: testb $-128, %al 2750; SSE42-NEXT: je LBB20_16 2751; SSE42-NEXT: LBB20_15: ## %cond.load19 2752; SSE42-NEXT: pinsrd $3, 28(%rdi), %xmm1 2753; SSE42-NEXT: retq 2754; 2755; AVX1-LABEL: load_v8i32_v8i1_zero: 2756; AVX1: ## %bb.0: 2757; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 2758; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 2759; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 2760; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 2761; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2762; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0 2763; AVX1-NEXT: retq 2764; 2765; AVX2-LABEL: load_v8i32_v8i1_zero: 2766; AVX2: ## %bb.0: 2767; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2768; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 2769; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0 2770; AVX2-NEXT: retq 2771; 2772; AVX512F-LABEL: load_v8i32_v8i1_zero: 2773; AVX512F: ## %bb.0: 2774; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 2775; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 2776; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 2777; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} 2778; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 2779; AVX512F-NEXT: retq 2780; 2781; AVX512VLDQ-LABEL: load_v8i32_v8i1_zero: 2782; AVX512VLDQ: ## %bb.0: 2783; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0 2784; AVX512VLDQ-NEXT: vpslld $31, %ymm0, %ymm0 2785; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1 2786; 
AVX512VLDQ-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z} 2787; AVX512VLDQ-NEXT: retq 2788; 2789; AVX512VLBW-LABEL: load_v8i32_v8i1_zero: 2790; AVX512VLBW: ## %bb.0: 2791; AVX512VLBW-NEXT: vpsllw $15, %xmm0, %xmm0 2792; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1 2793; AVX512VLBW-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z} 2794; AVX512VLBW-NEXT: retq 2795; 2796; X86-AVX512-LABEL: load_v8i32_v8i1_zero: 2797; X86-AVX512: ## %bb.0: 2798; X86-AVX512-NEXT: vpsllw $15, %xmm0, %xmm0 2799; X86-AVX512-NEXT: vpmovw2m %xmm0, %k1 2800; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 2801; X86-AVX512-NEXT: vmovdqu32 (%eax), %ymm0 {%k1} {z} 2802; X86-AVX512-NEXT: retl 2803 %res = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer) 2804 ret <8 x i32> %res 2805} 2806 2807; 2808; vXi16 2809; 2810 2811define <8 x i16> @load_v8i16_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i16> %dst) { 2812; SSE-LABEL: load_v8i16_v8i16: 2813; SSE: ## %bb.0: 2814; SSE-NEXT: packsswb %xmm0, %xmm0 2815; SSE-NEXT: pmovmskb %xmm0, %eax 2816; SSE-NEXT: testb $1, %al 2817; SSE-NEXT: jne LBB21_1 2818; SSE-NEXT: ## %bb.2: ## %else 2819; SSE-NEXT: testb $2, %al 2820; SSE-NEXT: jne LBB21_3 2821; SSE-NEXT: LBB21_4: ## %else2 2822; SSE-NEXT: testb $4, %al 2823; SSE-NEXT: jne LBB21_5 2824; SSE-NEXT: LBB21_6: ## %else5 2825; SSE-NEXT: testb $8, %al 2826; SSE-NEXT: jne LBB21_7 2827; SSE-NEXT: LBB21_8: ## %else8 2828; SSE-NEXT: testb $16, %al 2829; SSE-NEXT: jne LBB21_9 2830; SSE-NEXT: LBB21_10: ## %else11 2831; SSE-NEXT: testb $32, %al 2832; SSE-NEXT: jne LBB21_11 2833; SSE-NEXT: LBB21_12: ## %else14 2834; SSE-NEXT: testb $64, %al 2835; SSE-NEXT: jne LBB21_13 2836; SSE-NEXT: LBB21_14: ## %else17 2837; SSE-NEXT: testb $-128, %al 2838; SSE-NEXT: jne LBB21_15 2839; SSE-NEXT: LBB21_16: ## %else20 2840; SSE-NEXT: movdqa %xmm1, %xmm0 2841; SSE-NEXT: retq 2842; SSE-NEXT: LBB21_1: ## %cond.load 2843; SSE-NEXT: pinsrw $0, (%rdi), %xmm1 2844; SSE-NEXT: testb $2, %al 2845; SSE-NEXT: je LBB21_4 
2846; SSE-NEXT: LBB21_3: ## %cond.load1 2847; SSE-NEXT: pinsrw $1, 2(%rdi), %xmm1 2848; SSE-NEXT: testb $4, %al 2849; SSE-NEXT: je LBB21_6 2850; SSE-NEXT: LBB21_5: ## %cond.load4 2851; SSE-NEXT: pinsrw $2, 4(%rdi), %xmm1 2852; SSE-NEXT: testb $8, %al 2853; SSE-NEXT: je LBB21_8 2854; SSE-NEXT: LBB21_7: ## %cond.load7 2855; SSE-NEXT: pinsrw $3, 6(%rdi), %xmm1 2856; SSE-NEXT: testb $16, %al 2857; SSE-NEXT: je LBB21_10 2858; SSE-NEXT: LBB21_9: ## %cond.load10 2859; SSE-NEXT: pinsrw $4, 8(%rdi), %xmm1 2860; SSE-NEXT: testb $32, %al 2861; SSE-NEXT: je LBB21_12 2862; SSE-NEXT: LBB21_11: ## %cond.load13 2863; SSE-NEXT: pinsrw $5, 10(%rdi), %xmm1 2864; SSE-NEXT: testb $64, %al 2865; SSE-NEXT: je LBB21_14 2866; SSE-NEXT: LBB21_13: ## %cond.load16 2867; SSE-NEXT: pinsrw $6, 12(%rdi), %xmm1 2868; SSE-NEXT: testb $-128, %al 2869; SSE-NEXT: je LBB21_16 2870; SSE-NEXT: LBB21_15: ## %cond.load19 2871; SSE-NEXT: pinsrw $7, 14(%rdi), %xmm1 2872; SSE-NEXT: movdqa %xmm1, %xmm0 2873; SSE-NEXT: retq 2874; 2875; AVX1OR2-LABEL: load_v8i16_v8i16: 2876; AVX1OR2: ## %bb.0: 2877; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 2878; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax 2879; AVX1OR2-NEXT: testb $1, %al 2880; AVX1OR2-NEXT: jne LBB21_1 2881; AVX1OR2-NEXT: ## %bb.2: ## %else 2882; AVX1OR2-NEXT: testb $2, %al 2883; AVX1OR2-NEXT: jne LBB21_3 2884; AVX1OR2-NEXT: LBB21_4: ## %else2 2885; AVX1OR2-NEXT: testb $4, %al 2886; AVX1OR2-NEXT: jne LBB21_5 2887; AVX1OR2-NEXT: LBB21_6: ## %else5 2888; AVX1OR2-NEXT: testb $8, %al 2889; AVX1OR2-NEXT: jne LBB21_7 2890; AVX1OR2-NEXT: LBB21_8: ## %else8 2891; AVX1OR2-NEXT: testb $16, %al 2892; AVX1OR2-NEXT: jne LBB21_9 2893; AVX1OR2-NEXT: LBB21_10: ## %else11 2894; AVX1OR2-NEXT: testb $32, %al 2895; AVX1OR2-NEXT: jne LBB21_11 2896; AVX1OR2-NEXT: LBB21_12: ## %else14 2897; AVX1OR2-NEXT: testb $64, %al 2898; AVX1OR2-NEXT: jne LBB21_13 2899; AVX1OR2-NEXT: LBB21_14: ## %else17 2900; AVX1OR2-NEXT: testb $-128, %al 2901; AVX1OR2-NEXT: jne LBB21_15 2902; AVX1OR2-NEXT: 
LBB21_16: ## %else20 2903; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0 2904; AVX1OR2-NEXT: retq 2905; AVX1OR2-NEXT: LBB21_1: ## %cond.load 2906; AVX1OR2-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm1 2907; AVX1OR2-NEXT: testb $2, %al 2908; AVX1OR2-NEXT: je LBB21_4 2909; AVX1OR2-NEXT: LBB21_3: ## %cond.load1 2910; AVX1OR2-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1 2911; AVX1OR2-NEXT: testb $4, %al 2912; AVX1OR2-NEXT: je LBB21_6 2913; AVX1OR2-NEXT: LBB21_5: ## %cond.load4 2914; AVX1OR2-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1 2915; AVX1OR2-NEXT: testb $8, %al 2916; AVX1OR2-NEXT: je LBB21_8 2917; AVX1OR2-NEXT: LBB21_7: ## %cond.load7 2918; AVX1OR2-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1 2919; AVX1OR2-NEXT: testb $16, %al 2920; AVX1OR2-NEXT: je LBB21_10 2921; AVX1OR2-NEXT: LBB21_9: ## %cond.load10 2922; AVX1OR2-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1 2923; AVX1OR2-NEXT: testb $32, %al 2924; AVX1OR2-NEXT: je LBB21_12 2925; AVX1OR2-NEXT: LBB21_11: ## %cond.load13 2926; AVX1OR2-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1 2927; AVX1OR2-NEXT: testb $64, %al 2928; AVX1OR2-NEXT: je LBB21_14 2929; AVX1OR2-NEXT: LBB21_13: ## %cond.load16 2930; AVX1OR2-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1 2931; AVX1OR2-NEXT: testb $-128, %al 2932; AVX1OR2-NEXT: je LBB21_16 2933; AVX1OR2-NEXT: LBB21_15: ## %cond.load19 2934; AVX1OR2-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1 2935; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0 2936; AVX1OR2-NEXT: retq 2937; 2938; AVX512F-LABEL: load_v8i16_v8i16: 2939; AVX512F: ## %bb.0: 2940; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 2941; AVX512F-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0 2942; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 2943; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 2944; AVX512F-NEXT: kmovw %k0, %eax 2945; AVX512F-NEXT: testb $1, %al 2946; AVX512F-NEXT: jne LBB21_1 2947; AVX512F-NEXT: ## %bb.2: ## %else 2948; AVX512F-NEXT: testb $2, %al 2949; AVX512F-NEXT: jne LBB21_3 2950; AVX512F-NEXT: LBB21_4: ## %else2 2951; AVX512F-NEXT: testb $4, %al 2952; AVX512F-NEXT: jne LBB21_5 2953; AVX512F-NEXT: 
LBB21_6: ## %else5 2954; AVX512F-NEXT: testb $8, %al 2955; AVX512F-NEXT: jne LBB21_7 2956; AVX512F-NEXT: LBB21_8: ## %else8 2957; AVX512F-NEXT: testb $16, %al 2958; AVX512F-NEXT: jne LBB21_9 2959; AVX512F-NEXT: LBB21_10: ## %else11 2960; AVX512F-NEXT: testb $32, %al 2961; AVX512F-NEXT: jne LBB21_11 2962; AVX512F-NEXT: LBB21_12: ## %else14 2963; AVX512F-NEXT: testb $64, %al 2964; AVX512F-NEXT: jne LBB21_13 2965; AVX512F-NEXT: LBB21_14: ## %else17 2966; AVX512F-NEXT: testb $-128, %al 2967; AVX512F-NEXT: jne LBB21_15 2968; AVX512F-NEXT: LBB21_16: ## %else20 2969; AVX512F-NEXT: vmovdqa %xmm1, %xmm0 2970; AVX512F-NEXT: vzeroupper 2971; AVX512F-NEXT: retq 2972; AVX512F-NEXT: LBB21_1: ## %cond.load 2973; AVX512F-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm1 2974; AVX512F-NEXT: testb $2, %al 2975; AVX512F-NEXT: je LBB21_4 2976; AVX512F-NEXT: LBB21_3: ## %cond.load1 2977; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1 2978; AVX512F-NEXT: testb $4, %al 2979; AVX512F-NEXT: je LBB21_6 2980; AVX512F-NEXT: LBB21_5: ## %cond.load4 2981; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1 2982; AVX512F-NEXT: testb $8, %al 2983; AVX512F-NEXT: je LBB21_8 2984; AVX512F-NEXT: LBB21_7: ## %cond.load7 2985; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1 2986; AVX512F-NEXT: testb $16, %al 2987; AVX512F-NEXT: je LBB21_10 2988; AVX512F-NEXT: LBB21_9: ## %cond.load10 2989; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1 2990; AVX512F-NEXT: testb $32, %al 2991; AVX512F-NEXT: je LBB21_12 2992; AVX512F-NEXT: LBB21_11: ## %cond.load13 2993; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1 2994; AVX512F-NEXT: testb $64, %al 2995; AVX512F-NEXT: je LBB21_14 2996; AVX512F-NEXT: LBB21_13: ## %cond.load16 2997; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1 2998; AVX512F-NEXT: testb $-128, %al 2999; AVX512F-NEXT: je LBB21_16 3000; AVX512F-NEXT: LBB21_15: ## %cond.load19 3001; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1 3002; AVX512F-NEXT: vmovdqa %xmm1, %xmm0 3003; AVX512F-NEXT: vzeroupper 3004; 
AVX512F-NEXT: retq 3005; 3006; AVX512VLDQ-LABEL: load_v8i16_v8i16: 3007; AVX512VLDQ: ## %bb.0: 3008; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 3009; AVX512VLDQ-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0 3010; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0 3011; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k0 3012; AVX512VLDQ-NEXT: kmovw %k0, %eax 3013; AVX512VLDQ-NEXT: testb $1, %al 3014; AVX512VLDQ-NEXT: jne LBB21_1 3015; AVX512VLDQ-NEXT: ## %bb.2: ## %else 3016; AVX512VLDQ-NEXT: testb $2, %al 3017; AVX512VLDQ-NEXT: jne LBB21_3 3018; AVX512VLDQ-NEXT: LBB21_4: ## %else2 3019; AVX512VLDQ-NEXT: testb $4, %al 3020; AVX512VLDQ-NEXT: jne LBB21_5 3021; AVX512VLDQ-NEXT: LBB21_6: ## %else5 3022; AVX512VLDQ-NEXT: testb $8, %al 3023; AVX512VLDQ-NEXT: jne LBB21_7 3024; AVX512VLDQ-NEXT: LBB21_8: ## %else8 3025; AVX512VLDQ-NEXT: testb $16, %al 3026; AVX512VLDQ-NEXT: jne LBB21_9 3027; AVX512VLDQ-NEXT: LBB21_10: ## %else11 3028; AVX512VLDQ-NEXT: testb $32, %al 3029; AVX512VLDQ-NEXT: jne LBB21_11 3030; AVX512VLDQ-NEXT: LBB21_12: ## %else14 3031; AVX512VLDQ-NEXT: testb $64, %al 3032; AVX512VLDQ-NEXT: jne LBB21_13 3033; AVX512VLDQ-NEXT: LBB21_14: ## %else17 3034; AVX512VLDQ-NEXT: testb $-128, %al 3035; AVX512VLDQ-NEXT: jne LBB21_15 3036; AVX512VLDQ-NEXT: LBB21_16: ## %else20 3037; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0 3038; AVX512VLDQ-NEXT: vzeroupper 3039; AVX512VLDQ-NEXT: retq 3040; AVX512VLDQ-NEXT: LBB21_1: ## %cond.load 3041; AVX512VLDQ-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm1 3042; AVX512VLDQ-NEXT: testb $2, %al 3043; AVX512VLDQ-NEXT: je LBB21_4 3044; AVX512VLDQ-NEXT: LBB21_3: ## %cond.load1 3045; AVX512VLDQ-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1 3046; AVX512VLDQ-NEXT: testb $4, %al 3047; AVX512VLDQ-NEXT: je LBB21_6 3048; AVX512VLDQ-NEXT: LBB21_5: ## %cond.load4 3049; AVX512VLDQ-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1 3050; AVX512VLDQ-NEXT: testb $8, %al 3051; AVX512VLDQ-NEXT: je LBB21_8 3052; AVX512VLDQ-NEXT: LBB21_7: ## %cond.load7 3053; AVX512VLDQ-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1 3054; 
AVX512VLDQ-NEXT: testb $16, %al 3055; AVX512VLDQ-NEXT: je LBB21_10 3056; AVX512VLDQ-NEXT: LBB21_9: ## %cond.load10 3057; AVX512VLDQ-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1 3058; AVX512VLDQ-NEXT: testb $32, %al 3059; AVX512VLDQ-NEXT: je LBB21_12 3060; AVX512VLDQ-NEXT: LBB21_11: ## %cond.load13 3061; AVX512VLDQ-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1 3062; AVX512VLDQ-NEXT: testb $64, %al 3063; AVX512VLDQ-NEXT: je LBB21_14 3064; AVX512VLDQ-NEXT: LBB21_13: ## %cond.load16 3065; AVX512VLDQ-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1 3066; AVX512VLDQ-NEXT: testb $-128, %al 3067; AVX512VLDQ-NEXT: je LBB21_16 3068; AVX512VLDQ-NEXT: LBB21_15: ## %cond.load19 3069; AVX512VLDQ-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1 3070; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0 3071; AVX512VLDQ-NEXT: vzeroupper 3072; AVX512VLDQ-NEXT: retq 3073; 3074; AVX512VLBW-LABEL: load_v8i16_v8i16: 3075; AVX512VLBW: ## %bb.0: 3076; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1 3077; AVX512VLBW-NEXT: vpblendmw (%rdi), %xmm1, %xmm0 {%k1} 3078; AVX512VLBW-NEXT: retq 3079; 3080; X86-AVX512-LABEL: load_v8i16_v8i16: 3081; X86-AVX512: ## %bb.0: 3082; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 3083; X86-AVX512-NEXT: vpmovw2m %xmm0, %k1 3084; X86-AVX512-NEXT: vpblendmw (%eax), %xmm1, %xmm0 {%k1} 3085; X86-AVX512-NEXT: retl 3086 %mask = icmp slt <8 x i16> %trigger, zeroinitializer 3087 %res = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x i16> %dst) 3088 ret <8 x i16> %res 3089} 3090 3091define <16 x i16> @load_v16i16_v16i16(<16 x i16> %trigger, ptr %addr, <16 x i16> %dst) { 3092; SSE-LABEL: load_v16i16_v16i16: 3093; SSE: ## %bb.0: 3094; SSE-NEXT: packsswb %xmm1, %xmm0 3095; SSE-NEXT: pmovmskb %xmm0, %eax 3096; SSE-NEXT: testb $1, %al 3097; SSE-NEXT: jne LBB22_1 3098; SSE-NEXT: ## %bb.2: ## %else 3099; SSE-NEXT: testb $2, %al 3100; SSE-NEXT: jne LBB22_3 3101; SSE-NEXT: LBB22_4: ## %else2 3102; SSE-NEXT: testb $4, %al 3103; SSE-NEXT: jne LBB22_5 3104; SSE-NEXT: LBB22_6: ## %else5 3105; 
SSE-NEXT: testb $8, %al 3106; SSE-NEXT: jne LBB22_7 3107; SSE-NEXT: LBB22_8: ## %else8 3108; SSE-NEXT: testb $16, %al 3109; SSE-NEXT: jne LBB22_9 3110; SSE-NEXT: LBB22_10: ## %else11 3111; SSE-NEXT: testb $32, %al 3112; SSE-NEXT: jne LBB22_11 3113; SSE-NEXT: LBB22_12: ## %else14 3114; SSE-NEXT: testb $64, %al 3115; SSE-NEXT: jne LBB22_13 3116; SSE-NEXT: LBB22_14: ## %else17 3117; SSE-NEXT: testb %al, %al 3118; SSE-NEXT: js LBB22_15 3119; SSE-NEXT: LBB22_16: ## %else20 3120; SSE-NEXT: testl $256, %eax ## imm = 0x100 3121; SSE-NEXT: jne LBB22_17 3122; SSE-NEXT: LBB22_18: ## %else23 3123; SSE-NEXT: testl $512, %eax ## imm = 0x200 3124; SSE-NEXT: jne LBB22_19 3125; SSE-NEXT: LBB22_20: ## %else26 3126; SSE-NEXT: testl $1024, %eax ## imm = 0x400 3127; SSE-NEXT: jne LBB22_21 3128; SSE-NEXT: LBB22_22: ## %else29 3129; SSE-NEXT: testl $2048, %eax ## imm = 0x800 3130; SSE-NEXT: jne LBB22_23 3131; SSE-NEXT: LBB22_24: ## %else32 3132; SSE-NEXT: testl $4096, %eax ## imm = 0x1000 3133; SSE-NEXT: jne LBB22_25 3134; SSE-NEXT: LBB22_26: ## %else35 3135; SSE-NEXT: testl $8192, %eax ## imm = 0x2000 3136; SSE-NEXT: jne LBB22_27 3137; SSE-NEXT: LBB22_28: ## %else38 3138; SSE-NEXT: testl $16384, %eax ## imm = 0x4000 3139; SSE-NEXT: jne LBB22_29 3140; SSE-NEXT: LBB22_30: ## %else41 3141; SSE-NEXT: testl $32768, %eax ## imm = 0x8000 3142; SSE-NEXT: je LBB22_32 3143; SSE-NEXT: LBB22_31: ## %cond.load43 3144; SSE-NEXT: pinsrw $7, 30(%rdi), %xmm3 3145; SSE-NEXT: LBB22_32: ## %else44 3146; SSE-NEXT: movdqa %xmm2, %xmm0 3147; SSE-NEXT: movdqa %xmm3, %xmm1 3148; SSE-NEXT: retq 3149; SSE-NEXT: LBB22_1: ## %cond.load 3150; SSE-NEXT: pinsrw $0, (%rdi), %xmm2 3151; SSE-NEXT: testb $2, %al 3152; SSE-NEXT: je LBB22_4 3153; SSE-NEXT: LBB22_3: ## %cond.load1 3154; SSE-NEXT: pinsrw $1, 2(%rdi), %xmm2 3155; SSE-NEXT: testb $4, %al 3156; SSE-NEXT: je LBB22_6 3157; SSE-NEXT: LBB22_5: ## %cond.load4 3158; SSE-NEXT: pinsrw $2, 4(%rdi), %xmm2 3159; SSE-NEXT: testb $8, %al 3160; SSE-NEXT: je LBB22_8 3161; 
SSE-NEXT: LBB22_7: ## %cond.load7 3162; SSE-NEXT: pinsrw $3, 6(%rdi), %xmm2 3163; SSE-NEXT: testb $16, %al 3164; SSE-NEXT: je LBB22_10 3165; SSE-NEXT: LBB22_9: ## %cond.load10 3166; SSE-NEXT: pinsrw $4, 8(%rdi), %xmm2 3167; SSE-NEXT: testb $32, %al 3168; SSE-NEXT: je LBB22_12 3169; SSE-NEXT: LBB22_11: ## %cond.load13 3170; SSE-NEXT: pinsrw $5, 10(%rdi), %xmm2 3171; SSE-NEXT: testb $64, %al 3172; SSE-NEXT: je LBB22_14 3173; SSE-NEXT: LBB22_13: ## %cond.load16 3174; SSE-NEXT: pinsrw $6, 12(%rdi), %xmm2 3175; SSE-NEXT: testb %al, %al 3176; SSE-NEXT: jns LBB22_16 3177; SSE-NEXT: LBB22_15: ## %cond.load19 3178; SSE-NEXT: pinsrw $7, 14(%rdi), %xmm2 3179; SSE-NEXT: testl $256, %eax ## imm = 0x100 3180; SSE-NEXT: je LBB22_18 3181; SSE-NEXT: LBB22_17: ## %cond.load22 3182; SSE-NEXT: pinsrw $0, 16(%rdi), %xmm3 3183; SSE-NEXT: testl $512, %eax ## imm = 0x200 3184; SSE-NEXT: je LBB22_20 3185; SSE-NEXT: LBB22_19: ## %cond.load25 3186; SSE-NEXT: pinsrw $1, 18(%rdi), %xmm3 3187; SSE-NEXT: testl $1024, %eax ## imm = 0x400 3188; SSE-NEXT: je LBB22_22 3189; SSE-NEXT: LBB22_21: ## %cond.load28 3190; SSE-NEXT: pinsrw $2, 20(%rdi), %xmm3 3191; SSE-NEXT: testl $2048, %eax ## imm = 0x800 3192; SSE-NEXT: je LBB22_24 3193; SSE-NEXT: LBB22_23: ## %cond.load31 3194; SSE-NEXT: pinsrw $3, 22(%rdi), %xmm3 3195; SSE-NEXT: testl $4096, %eax ## imm = 0x1000 3196; SSE-NEXT: je LBB22_26 3197; SSE-NEXT: LBB22_25: ## %cond.load34 3198; SSE-NEXT: pinsrw $4, 24(%rdi), %xmm3 3199; SSE-NEXT: testl $8192, %eax ## imm = 0x2000 3200; SSE-NEXT: je LBB22_28 3201; SSE-NEXT: LBB22_27: ## %cond.load37 3202; SSE-NEXT: pinsrw $5, 26(%rdi), %xmm3 3203; SSE-NEXT: testl $16384, %eax ## imm = 0x4000 3204; SSE-NEXT: je LBB22_30 3205; SSE-NEXT: LBB22_29: ## %cond.load40 3206; SSE-NEXT: pinsrw $6, 28(%rdi), %xmm3 3207; SSE-NEXT: testl $32768, %eax ## imm = 0x8000 3208; SSE-NEXT: jne LBB22_31 3209; SSE-NEXT: jmp LBB22_32 3210; 3211; AVX1-LABEL: load_v16i16_v16i16: 3212; AVX1: ## %bb.0: 3213; AVX1-NEXT: vextractf128 $1, 
%ymm0, %xmm2 3214; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 3215; AVX1-NEXT: vpmovmskb %xmm0, %eax 3216; AVX1-NEXT: testb $1, %al 3217; AVX1-NEXT: jne LBB22_1 3218; AVX1-NEXT: ## %bb.2: ## %else 3219; AVX1-NEXT: testb $2, %al 3220; AVX1-NEXT: jne LBB22_3 3221; AVX1-NEXT: LBB22_4: ## %else2 3222; AVX1-NEXT: testb $4, %al 3223; AVX1-NEXT: jne LBB22_5 3224; AVX1-NEXT: LBB22_6: ## %else5 3225; AVX1-NEXT: testb $8, %al 3226; AVX1-NEXT: jne LBB22_7 3227; AVX1-NEXT: LBB22_8: ## %else8 3228; AVX1-NEXT: testb $16, %al 3229; AVX1-NEXT: jne LBB22_9 3230; AVX1-NEXT: LBB22_10: ## %else11 3231; AVX1-NEXT: testb $32, %al 3232; AVX1-NEXT: jne LBB22_11 3233; AVX1-NEXT: LBB22_12: ## %else14 3234; AVX1-NEXT: testb $64, %al 3235; AVX1-NEXT: jne LBB22_13 3236; AVX1-NEXT: LBB22_14: ## %else17 3237; AVX1-NEXT: testb %al, %al 3238; AVX1-NEXT: js LBB22_15 3239; AVX1-NEXT: LBB22_16: ## %else20 3240; AVX1-NEXT: testl $256, %eax ## imm = 0x100 3241; AVX1-NEXT: jne LBB22_17 3242; AVX1-NEXT: LBB22_18: ## %else23 3243; AVX1-NEXT: testl $512, %eax ## imm = 0x200 3244; AVX1-NEXT: jne LBB22_19 3245; AVX1-NEXT: LBB22_20: ## %else26 3246; AVX1-NEXT: testl $1024, %eax ## imm = 0x400 3247; AVX1-NEXT: jne LBB22_21 3248; AVX1-NEXT: LBB22_22: ## %else29 3249; AVX1-NEXT: testl $2048, %eax ## imm = 0x800 3250; AVX1-NEXT: jne LBB22_23 3251; AVX1-NEXT: LBB22_24: ## %else32 3252; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 3253; AVX1-NEXT: jne LBB22_25 3254; AVX1-NEXT: LBB22_26: ## %else35 3255; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 3256; AVX1-NEXT: jne LBB22_27 3257; AVX1-NEXT: LBB22_28: ## %else38 3258; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 3259; AVX1-NEXT: jne LBB22_29 3260; AVX1-NEXT: LBB22_30: ## %else41 3261; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000 3262; AVX1-NEXT: jne LBB22_31 3263; AVX1-NEXT: LBB22_32: ## %else44 3264; AVX1-NEXT: vmovaps %ymm1, %ymm0 3265; AVX1-NEXT: retq 3266; AVX1-NEXT: LBB22_1: ## %cond.load 3267; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0 3268; AVX1-NEXT: 
vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3269; AVX1-NEXT: testb $2, %al 3270; AVX1-NEXT: je LBB22_4 3271; AVX1-NEXT: LBB22_3: ## %cond.load1 3272; AVX1-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0 3273; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3274; AVX1-NEXT: testb $4, %al 3275; AVX1-NEXT: je LBB22_6 3276; AVX1-NEXT: LBB22_5: ## %cond.load4 3277; AVX1-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0 3278; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3279; AVX1-NEXT: testb $8, %al 3280; AVX1-NEXT: je LBB22_8 3281; AVX1-NEXT: LBB22_7: ## %cond.load7 3282; AVX1-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0 3283; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3284; AVX1-NEXT: testb $16, %al 3285; AVX1-NEXT: je LBB22_10 3286; AVX1-NEXT: LBB22_9: ## %cond.load10 3287; AVX1-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0 3288; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3289; AVX1-NEXT: testb $32, %al 3290; AVX1-NEXT: je LBB22_12 3291; AVX1-NEXT: LBB22_11: ## %cond.load13 3292; AVX1-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0 3293; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3294; AVX1-NEXT: testb $64, %al 3295; AVX1-NEXT: je LBB22_14 3296; AVX1-NEXT: LBB22_13: ## %cond.load16 3297; AVX1-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0 3298; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3299; AVX1-NEXT: testb %al, %al 3300; AVX1-NEXT: jns LBB22_16 3301; AVX1-NEXT: LBB22_15: ## %cond.load19 3302; AVX1-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0 3303; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3304; AVX1-NEXT: testl $256, %eax ## imm = 0x100 3305; AVX1-NEXT: je LBB22_18 3306; AVX1-NEXT: LBB22_17: ## %cond.load22 3307; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 3308; AVX1-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0 3309; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 3310; AVX1-NEXT: testl $512, %eax ## imm = 0x200 3311; AVX1-NEXT: je LBB22_20 3312; AVX1-NEXT: LBB22_19: ## 
%cond.load25 3313; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 3314; AVX1-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0 3315; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 3316; AVX1-NEXT: testl $1024, %eax ## imm = 0x400 3317; AVX1-NEXT: je LBB22_22 3318; AVX1-NEXT: LBB22_21: ## %cond.load28 3319; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 3320; AVX1-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0 3321; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 3322; AVX1-NEXT: testl $2048, %eax ## imm = 0x800 3323; AVX1-NEXT: je LBB22_24 3324; AVX1-NEXT: LBB22_23: ## %cond.load31 3325; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 3326; AVX1-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0 3327; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 3328; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 3329; AVX1-NEXT: je LBB22_26 3330; AVX1-NEXT: LBB22_25: ## %cond.load34 3331; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 3332; AVX1-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0 3333; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 3334; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 3335; AVX1-NEXT: je LBB22_28 3336; AVX1-NEXT: LBB22_27: ## %cond.load37 3337; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 3338; AVX1-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0 3339; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 3340; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 3341; AVX1-NEXT: je LBB22_30 3342; AVX1-NEXT: LBB22_29: ## %cond.load40 3343; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 3344; AVX1-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0 3345; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 3346; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000 3347; AVX1-NEXT: je LBB22_32 3348; AVX1-NEXT: LBB22_31: ## %cond.load43 3349; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 3350; AVX1-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0 3351; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 3352; AVX1-NEXT: vmovaps %ymm1, %ymm0 3353; AVX1-NEXT: retq 3354; 3355; AVX2-LABEL: load_v16i16_v16i16: 3356; AVX2: ## %bb.0: 3357; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 3358; 
AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 3359; AVX2-NEXT: vpmovmskb %xmm0, %eax 3360; AVX2-NEXT: testb $1, %al 3361; AVX2-NEXT: jne LBB22_1 3362; AVX2-NEXT: ## %bb.2: ## %else 3363; AVX2-NEXT: testb $2, %al 3364; AVX2-NEXT: jne LBB22_3 3365; AVX2-NEXT: LBB22_4: ## %else2 3366; AVX2-NEXT: testb $4, %al 3367; AVX2-NEXT: jne LBB22_5 3368; AVX2-NEXT: LBB22_6: ## %else5 3369; AVX2-NEXT: testb $8, %al 3370; AVX2-NEXT: jne LBB22_7 3371; AVX2-NEXT: LBB22_8: ## %else8 3372; AVX2-NEXT: testb $16, %al 3373; AVX2-NEXT: jne LBB22_9 3374; AVX2-NEXT: LBB22_10: ## %else11 3375; AVX2-NEXT: testb $32, %al 3376; AVX2-NEXT: jne LBB22_11 3377; AVX2-NEXT: LBB22_12: ## %else14 3378; AVX2-NEXT: testb $64, %al 3379; AVX2-NEXT: jne LBB22_13 3380; AVX2-NEXT: LBB22_14: ## %else17 3381; AVX2-NEXT: testb %al, %al 3382; AVX2-NEXT: js LBB22_15 3383; AVX2-NEXT: LBB22_16: ## %else20 3384; AVX2-NEXT: testl $256, %eax ## imm = 0x100 3385; AVX2-NEXT: jne LBB22_17 3386; AVX2-NEXT: LBB22_18: ## %else23 3387; AVX2-NEXT: testl $512, %eax ## imm = 0x200 3388; AVX2-NEXT: jne LBB22_19 3389; AVX2-NEXT: LBB22_20: ## %else26 3390; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 3391; AVX2-NEXT: jne LBB22_21 3392; AVX2-NEXT: LBB22_22: ## %else29 3393; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 3394; AVX2-NEXT: jne LBB22_23 3395; AVX2-NEXT: LBB22_24: ## %else32 3396; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 3397; AVX2-NEXT: jne LBB22_25 3398; AVX2-NEXT: LBB22_26: ## %else35 3399; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 3400; AVX2-NEXT: jne LBB22_27 3401; AVX2-NEXT: LBB22_28: ## %else38 3402; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 3403; AVX2-NEXT: jne LBB22_29 3404; AVX2-NEXT: LBB22_30: ## %else41 3405; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000 3406; AVX2-NEXT: jne LBB22_31 3407; AVX2-NEXT: LBB22_32: ## %else44 3408; AVX2-NEXT: vmovdqa %ymm1, %ymm0 3409; AVX2-NEXT: retq 3410; AVX2-NEXT: LBB22_1: ## %cond.load 3411; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0 3412; AVX2-NEXT: vpblendd {{.*#+}} ymm1 
= ymm0[0,1,2,3],ymm1[4,5,6,7] 3413; AVX2-NEXT: testb $2, %al 3414; AVX2-NEXT: je LBB22_4 3415; AVX2-NEXT: LBB22_3: ## %cond.load1 3416; AVX2-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0 3417; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3418; AVX2-NEXT: testb $4, %al 3419; AVX2-NEXT: je LBB22_6 3420; AVX2-NEXT: LBB22_5: ## %cond.load4 3421; AVX2-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0 3422; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3423; AVX2-NEXT: testb $8, %al 3424; AVX2-NEXT: je LBB22_8 3425; AVX2-NEXT: LBB22_7: ## %cond.load7 3426; AVX2-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0 3427; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3428; AVX2-NEXT: testb $16, %al 3429; AVX2-NEXT: je LBB22_10 3430; AVX2-NEXT: LBB22_9: ## %cond.load10 3431; AVX2-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0 3432; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3433; AVX2-NEXT: testb $32, %al 3434; AVX2-NEXT: je LBB22_12 3435; AVX2-NEXT: LBB22_11: ## %cond.load13 3436; AVX2-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0 3437; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3438; AVX2-NEXT: testb $64, %al 3439; AVX2-NEXT: je LBB22_14 3440; AVX2-NEXT: LBB22_13: ## %cond.load16 3441; AVX2-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0 3442; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3443; AVX2-NEXT: testb %al, %al 3444; AVX2-NEXT: jns LBB22_16 3445; AVX2-NEXT: LBB22_15: ## %cond.load19 3446; AVX2-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0 3447; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3448; AVX2-NEXT: testl $256, %eax ## imm = 0x100 3449; AVX2-NEXT: je LBB22_18 3450; AVX2-NEXT: LBB22_17: ## %cond.load22 3451; AVX2-NEXT: vpbroadcastw 16(%rdi), %ymm0 3452; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 3453; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3454; AVX2-NEXT: testl $512, %eax ## imm = 0x200 3455; 
AVX2-NEXT: je LBB22_20 3456; AVX2-NEXT: LBB22_19: ## %cond.load25 3457; AVX2-NEXT: vpbroadcastw 18(%rdi), %ymm0 3458; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15] 3459; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3460; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 3461; AVX2-NEXT: je LBB22_22 3462; AVX2-NEXT: LBB22_21: ## %cond.load28 3463; AVX2-NEXT: vpbroadcastw 20(%rdi), %ymm0 3464; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15] 3465; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3466; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 3467; AVX2-NEXT: je LBB22_24 3468; AVX2-NEXT: LBB22_23: ## %cond.load31 3469; AVX2-NEXT: vpbroadcastw 22(%rdi), %ymm0 3470; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] 3471; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3472; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 3473; AVX2-NEXT: je LBB22_26 3474; AVX2-NEXT: LBB22_25: ## %cond.load34 3475; AVX2-NEXT: vpbroadcastw 24(%rdi), %ymm0 3476; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] 3477; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3478; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 3479; AVX2-NEXT: je LBB22_28 3480; AVX2-NEXT: LBB22_27: ## %cond.load37 3481; AVX2-NEXT: vpbroadcastw 26(%rdi), %ymm0 3482; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15] 3483; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3484; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 3485; AVX2-NEXT: je LBB22_30 3486; AVX2-NEXT: LBB22_29: ## %cond.load40 3487; AVX2-NEXT: vpbroadcastw 28(%rdi), %ymm0 3488; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15] 3489; AVX2-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3490; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000 3491; AVX2-NEXT: je LBB22_32 3492; AVX2-NEXT: LBB22_31: ## %cond.load43 3493; AVX2-NEXT: vpbroadcastw 30(%rdi), %ymm0 3494; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] 3495; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3496; AVX2-NEXT: vmovdqa %ymm1, %ymm0 3497; AVX2-NEXT: retq 3498; 3499; AVX512F-LABEL: load_v16i16_v16i16: 3500; AVX512F: ## %bb.0: 3501; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 3502; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 3503; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 3504; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 3505; AVX512F-NEXT: kmovw %k0, %eax 3506; AVX512F-NEXT: testb $1, %al 3507; AVX512F-NEXT: jne LBB22_1 3508; AVX512F-NEXT: ## %bb.2: ## %else 3509; AVX512F-NEXT: testb $2, %al 3510; AVX512F-NEXT: jne LBB22_3 3511; AVX512F-NEXT: LBB22_4: ## %else2 3512; AVX512F-NEXT: testb $4, %al 3513; AVX512F-NEXT: jne LBB22_5 3514; AVX512F-NEXT: LBB22_6: ## %else5 3515; AVX512F-NEXT: testb $8, %al 3516; AVX512F-NEXT: jne LBB22_7 3517; AVX512F-NEXT: LBB22_8: ## %else8 3518; AVX512F-NEXT: testb $16, %al 3519; AVX512F-NEXT: jne LBB22_9 3520; AVX512F-NEXT: LBB22_10: ## %else11 3521; AVX512F-NEXT: testb $32, %al 3522; AVX512F-NEXT: jne LBB22_11 3523; AVX512F-NEXT: LBB22_12: ## %else14 3524; AVX512F-NEXT: testb $64, %al 3525; AVX512F-NEXT: jne LBB22_13 3526; AVX512F-NEXT: LBB22_14: ## %else17 3527; AVX512F-NEXT: testb %al, %al 3528; AVX512F-NEXT: js LBB22_15 3529; AVX512F-NEXT: LBB22_16: ## %else20 3530; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 3531; AVX512F-NEXT: jne LBB22_17 3532; AVX512F-NEXT: LBB22_18: ## %else23 3533; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 3534; AVX512F-NEXT: jne LBB22_19 3535; AVX512F-NEXT: LBB22_20: ## %else26 3536; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 3537; AVX512F-NEXT: jne LBB22_21 3538; AVX512F-NEXT: LBB22_22: ## %else29 3539; AVX512F-NEXT: testl 
$2048, %eax ## imm = 0x800 3540; AVX512F-NEXT: jne LBB22_23 3541; AVX512F-NEXT: LBB22_24: ## %else32 3542; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 3543; AVX512F-NEXT: jne LBB22_25 3544; AVX512F-NEXT: LBB22_26: ## %else35 3545; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 3546; AVX512F-NEXT: jne LBB22_27 3547; AVX512F-NEXT: LBB22_28: ## %else38 3548; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 3549; AVX512F-NEXT: jne LBB22_29 3550; AVX512F-NEXT: LBB22_30: ## %else41 3551; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000 3552; AVX512F-NEXT: jne LBB22_31 3553; AVX512F-NEXT: LBB22_32: ## %else44 3554; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 3555; AVX512F-NEXT: retq 3556; AVX512F-NEXT: LBB22_1: ## %cond.load 3557; AVX512F-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0 3558; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3559; AVX512F-NEXT: testb $2, %al 3560; AVX512F-NEXT: je LBB22_4 3561; AVX512F-NEXT: LBB22_3: ## %cond.load1 3562; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0 3563; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3564; AVX512F-NEXT: testb $4, %al 3565; AVX512F-NEXT: je LBB22_6 3566; AVX512F-NEXT: LBB22_5: ## %cond.load4 3567; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0 3568; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3569; AVX512F-NEXT: testb $8, %al 3570; AVX512F-NEXT: je LBB22_8 3571; AVX512F-NEXT: LBB22_7: ## %cond.load7 3572; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0 3573; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3574; AVX512F-NEXT: testb $16, %al 3575; AVX512F-NEXT: je LBB22_10 3576; AVX512F-NEXT: LBB22_9: ## %cond.load10 3577; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0 3578; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3579; AVX512F-NEXT: testb $32, %al 3580; AVX512F-NEXT: je LBB22_12 3581; AVX512F-NEXT: LBB22_11: ## %cond.load13 3582; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0 3583; AVX512F-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3584; AVX512F-NEXT: testb $64, %al 3585; AVX512F-NEXT: je LBB22_14 3586; AVX512F-NEXT: LBB22_13: ## %cond.load16 3587; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0 3588; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3589; AVX512F-NEXT: testb %al, %al 3590; AVX512F-NEXT: jns LBB22_16 3591; AVX512F-NEXT: LBB22_15: ## %cond.load19 3592; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0 3593; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3594; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 3595; AVX512F-NEXT: je LBB22_18 3596; AVX512F-NEXT: LBB22_17: ## %cond.load22 3597; AVX512F-NEXT: vpbroadcastw 16(%rdi), %ymm0 3598; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 3599; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3600; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 3601; AVX512F-NEXT: je LBB22_20 3602; AVX512F-NEXT: LBB22_19: ## %cond.load25 3603; AVX512F-NEXT: vpbroadcastw 18(%rdi), %ymm0 3604; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15] 3605; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3606; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 3607; AVX512F-NEXT: je LBB22_22 3608; AVX512F-NEXT: LBB22_21: ## %cond.load28 3609; AVX512F-NEXT: vpbroadcastw 20(%rdi), %ymm0 3610; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15] 3611; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3612; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800 3613; AVX512F-NEXT: je LBB22_24 3614; AVX512F-NEXT: LBB22_23: ## %cond.load31 3615; AVX512F-NEXT: vpbroadcastw 22(%rdi), %ymm0 3616; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] 3617; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3618; 
AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 3619; AVX512F-NEXT: je LBB22_26 3620; AVX512F-NEXT: LBB22_25: ## %cond.load34 3621; AVX512F-NEXT: vpbroadcastw 24(%rdi), %ymm0 3622; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] 3623; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3624; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 3625; AVX512F-NEXT: je LBB22_28 3626; AVX512F-NEXT: LBB22_27: ## %cond.load37 3627; AVX512F-NEXT: vpbroadcastw 26(%rdi), %ymm0 3628; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15] 3629; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3630; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 3631; AVX512F-NEXT: je LBB22_30 3632; AVX512F-NEXT: LBB22_29: ## %cond.load40 3633; AVX512F-NEXT: vpbroadcastw 28(%rdi), %ymm0 3634; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15] 3635; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3636; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000 3637; AVX512F-NEXT: je LBB22_32 3638; AVX512F-NEXT: LBB22_31: ## %cond.load43 3639; AVX512F-NEXT: vpbroadcastw 30(%rdi), %ymm0 3640; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] 3641; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3642; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 3643; AVX512F-NEXT: retq 3644; 3645; AVX512VLDQ-LABEL: load_v16i16_v16i16: 3646; AVX512VLDQ: ## %bb.0: 3647; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 3648; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 3649; AVX512VLDQ-NEXT: vpmovsxwd %ymm0, %zmm0 3650; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0 3651; AVX512VLDQ-NEXT: kmovw %k0, %eax 3652; AVX512VLDQ-NEXT: testb $1, %al 3653; AVX512VLDQ-NEXT: jne LBB22_1 3654; AVX512VLDQ-NEXT: ## %bb.2: ## %else 3655; AVX512VLDQ-NEXT: testb $2, %al 3656; 
AVX512VLDQ-NEXT: jne LBB22_3 3657; AVX512VLDQ-NEXT: LBB22_4: ## %else2 3658; AVX512VLDQ-NEXT: testb $4, %al 3659; AVX512VLDQ-NEXT: jne LBB22_5 3660; AVX512VLDQ-NEXT: LBB22_6: ## %else5 3661; AVX512VLDQ-NEXT: testb $8, %al 3662; AVX512VLDQ-NEXT: jne LBB22_7 3663; AVX512VLDQ-NEXT: LBB22_8: ## %else8 3664; AVX512VLDQ-NEXT: testb $16, %al 3665; AVX512VLDQ-NEXT: jne LBB22_9 3666; AVX512VLDQ-NEXT: LBB22_10: ## %else11 3667; AVX512VLDQ-NEXT: testb $32, %al 3668; AVX512VLDQ-NEXT: jne LBB22_11 3669; AVX512VLDQ-NEXT: LBB22_12: ## %else14 3670; AVX512VLDQ-NEXT: testb $64, %al 3671; AVX512VLDQ-NEXT: jne LBB22_13 3672; AVX512VLDQ-NEXT: LBB22_14: ## %else17 3673; AVX512VLDQ-NEXT: testb %al, %al 3674; AVX512VLDQ-NEXT: js LBB22_15 3675; AVX512VLDQ-NEXT: LBB22_16: ## %else20 3676; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100 3677; AVX512VLDQ-NEXT: jne LBB22_17 3678; AVX512VLDQ-NEXT: LBB22_18: ## %else23 3679; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200 3680; AVX512VLDQ-NEXT: jne LBB22_19 3681; AVX512VLDQ-NEXT: LBB22_20: ## %else26 3682; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400 3683; AVX512VLDQ-NEXT: jne LBB22_21 3684; AVX512VLDQ-NEXT: LBB22_22: ## %else29 3685; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800 3686; AVX512VLDQ-NEXT: jne LBB22_23 3687; AVX512VLDQ-NEXT: LBB22_24: ## %else32 3688; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000 3689; AVX512VLDQ-NEXT: jne LBB22_25 3690; AVX512VLDQ-NEXT: LBB22_26: ## %else35 3691; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000 3692; AVX512VLDQ-NEXT: jne LBB22_27 3693; AVX512VLDQ-NEXT: LBB22_28: ## %else38 3694; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000 3695; AVX512VLDQ-NEXT: jne LBB22_29 3696; AVX512VLDQ-NEXT: LBB22_30: ## %else41 3697; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000 3698; AVX512VLDQ-NEXT: jne LBB22_31 3699; AVX512VLDQ-NEXT: LBB22_32: ## %else44 3700; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0 3701; AVX512VLDQ-NEXT: retq 3702; AVX512VLDQ-NEXT: LBB22_1: ## %cond.load 3703; 
AVX512VLDQ-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0 3704; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3705; AVX512VLDQ-NEXT: testb $2, %al 3706; AVX512VLDQ-NEXT: je LBB22_4 3707; AVX512VLDQ-NEXT: LBB22_3: ## %cond.load1 3708; AVX512VLDQ-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0 3709; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3710; AVX512VLDQ-NEXT: testb $4, %al 3711; AVX512VLDQ-NEXT: je LBB22_6 3712; AVX512VLDQ-NEXT: LBB22_5: ## %cond.load4 3713; AVX512VLDQ-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0 3714; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3715; AVX512VLDQ-NEXT: testb $8, %al 3716; AVX512VLDQ-NEXT: je LBB22_8 3717; AVX512VLDQ-NEXT: LBB22_7: ## %cond.load7 3718; AVX512VLDQ-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0 3719; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3720; AVX512VLDQ-NEXT: testb $16, %al 3721; AVX512VLDQ-NEXT: je LBB22_10 3722; AVX512VLDQ-NEXT: LBB22_9: ## %cond.load10 3723; AVX512VLDQ-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0 3724; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3725; AVX512VLDQ-NEXT: testb $32, %al 3726; AVX512VLDQ-NEXT: je LBB22_12 3727; AVX512VLDQ-NEXT: LBB22_11: ## %cond.load13 3728; AVX512VLDQ-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0 3729; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3730; AVX512VLDQ-NEXT: testb $64, %al 3731; AVX512VLDQ-NEXT: je LBB22_14 3732; AVX512VLDQ-NEXT: LBB22_13: ## %cond.load16 3733; AVX512VLDQ-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0 3734; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3735; AVX512VLDQ-NEXT: testb %al, %al 3736; AVX512VLDQ-NEXT: jns LBB22_16 3737; AVX512VLDQ-NEXT: LBB22_15: ## %cond.load19 3738; AVX512VLDQ-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0 3739; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3740; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100 3741; AVX512VLDQ-NEXT: je LBB22_18 3742; 
AVX512VLDQ-NEXT: LBB22_17: ## %cond.load22 3743; AVX512VLDQ-NEXT: vpbroadcastw 16(%rdi), %ymm0 3744; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 3745; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3746; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200 3747; AVX512VLDQ-NEXT: je LBB22_20 3748; AVX512VLDQ-NEXT: LBB22_19: ## %cond.load25 3749; AVX512VLDQ-NEXT: vpbroadcastw 18(%rdi), %ymm0 3750; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15] 3751; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3752; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400 3753; AVX512VLDQ-NEXT: je LBB22_22 3754; AVX512VLDQ-NEXT: LBB22_21: ## %cond.load28 3755; AVX512VLDQ-NEXT: vpbroadcastw 20(%rdi), %ymm0 3756; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15] 3757; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3758; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800 3759; AVX512VLDQ-NEXT: je LBB22_24 3760; AVX512VLDQ-NEXT: LBB22_23: ## %cond.load31 3761; AVX512VLDQ-NEXT: vpbroadcastw 22(%rdi), %ymm0 3762; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] 3763; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3764; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000 3765; AVX512VLDQ-NEXT: je LBB22_26 3766; AVX512VLDQ-NEXT: LBB22_25: ## %cond.load34 3767; AVX512VLDQ-NEXT: vpbroadcastw 24(%rdi), %ymm0 3768; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] 3769; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3770; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000 3771; AVX512VLDQ-NEXT: je LBB22_28 3772; AVX512VLDQ-NEXT: LBB22_27: ## %cond.load37 3773; AVX512VLDQ-NEXT: vpbroadcastw 
26(%rdi), %ymm0 3774; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15] 3775; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3776; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000 3777; AVX512VLDQ-NEXT: je LBB22_30 3778; AVX512VLDQ-NEXT: LBB22_29: ## %cond.load40 3779; AVX512VLDQ-NEXT: vpbroadcastw 28(%rdi), %ymm0 3780; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15] 3781; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3782; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000 3783; AVX512VLDQ-NEXT: je LBB22_32 3784; AVX512VLDQ-NEXT: LBB22_31: ## %cond.load43 3785; AVX512VLDQ-NEXT: vpbroadcastw 30(%rdi), %ymm0 3786; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] 3787; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] 3788; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0 3789; AVX512VLDQ-NEXT: retq 3790; 3791; AVX512VLBW-LABEL: load_v16i16_v16i16: 3792; AVX512VLBW: ## %bb.0: 3793; AVX512VLBW-NEXT: vpmovw2m %ymm0, %k1 3794; AVX512VLBW-NEXT: vpblendmw (%rdi), %ymm1, %ymm0 {%k1} 3795; AVX512VLBW-NEXT: retq 3796; 3797; X86-AVX512-LABEL: load_v16i16_v16i16: 3798; X86-AVX512: ## %bb.0: 3799; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 3800; X86-AVX512-NEXT: vpmovw2m %ymm0, %k1 3801; X86-AVX512-NEXT: vpblendmw (%eax), %ymm1, %ymm0 {%k1} 3802; X86-AVX512-NEXT: retl 3803 %mask = icmp slt <16 x i16> %trigger, zeroinitializer 3804 %res = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x i16> %dst) 3805 ret <16 x i16> %res 3806} 3807 3808; 3809; vXi8 3810; 3811 3812define <16 x i8> @load_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, <16 x i8> %dst) { 3813; SSE2-LABEL: load_v16i8_v16i8: 3814; SSE2: ## %bb.0: 3815; SSE2-NEXT: pmovmskb %xmm0, %eax 3816; SSE2-NEXT: testb $1, %al 3817; SSE2-NEXT: jne LBB23_1 3818; 
SSE2-NEXT: ## %bb.2: ## %else 3819; SSE2-NEXT: testb $2, %al 3820; SSE2-NEXT: jne LBB23_3 3821; SSE2-NEXT: LBB23_4: ## %else2 3822; SSE2-NEXT: testb $4, %al 3823; SSE2-NEXT: jne LBB23_5 3824; SSE2-NEXT: LBB23_6: ## %else5 3825; SSE2-NEXT: testb $8, %al 3826; SSE2-NEXT: jne LBB23_7 3827; SSE2-NEXT: LBB23_8: ## %else8 3828; SSE2-NEXT: testb $16, %al 3829; SSE2-NEXT: jne LBB23_9 3830; SSE2-NEXT: LBB23_10: ## %else11 3831; SSE2-NEXT: testb $32, %al 3832; SSE2-NEXT: jne LBB23_11 3833; SSE2-NEXT: LBB23_12: ## %else14 3834; SSE2-NEXT: testb $64, %al 3835; SSE2-NEXT: jne LBB23_13 3836; SSE2-NEXT: LBB23_14: ## %else17 3837; SSE2-NEXT: testb %al, %al 3838; SSE2-NEXT: js LBB23_15 3839; SSE2-NEXT: LBB23_16: ## %else20 3840; SSE2-NEXT: testl $256, %eax ## imm = 0x100 3841; SSE2-NEXT: jne LBB23_17 3842; SSE2-NEXT: LBB23_18: ## %else23 3843; SSE2-NEXT: testl $512, %eax ## imm = 0x200 3844; SSE2-NEXT: jne LBB23_19 3845; SSE2-NEXT: LBB23_20: ## %else26 3846; SSE2-NEXT: testl $1024, %eax ## imm = 0x400 3847; SSE2-NEXT: jne LBB23_21 3848; SSE2-NEXT: LBB23_22: ## %else29 3849; SSE2-NEXT: testl $2048, %eax ## imm = 0x800 3850; SSE2-NEXT: jne LBB23_23 3851; SSE2-NEXT: LBB23_24: ## %else32 3852; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 3853; SSE2-NEXT: jne LBB23_25 3854; SSE2-NEXT: LBB23_26: ## %else35 3855; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000 3856; SSE2-NEXT: jne LBB23_27 3857; SSE2-NEXT: LBB23_28: ## %else38 3858; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000 3859; SSE2-NEXT: jne LBB23_29 3860; SSE2-NEXT: LBB23_30: ## %else41 3861; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000 3862; SSE2-NEXT: jne LBB23_31 3863; SSE2-NEXT: LBB23_32: ## %else44 3864; SSE2-NEXT: movdqa %xmm1, %xmm0 3865; SSE2-NEXT: retq 3866; SSE2-NEXT: LBB23_1: ## %cond.load 3867; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 3868; SSE2-NEXT: pand %xmm0, %xmm1 3869; SSE2-NEXT: movzbl (%rdi), %ecx 3870; SSE2-NEXT: movd %ecx, %xmm2 3871; SSE2-NEXT: pandn 
%xmm2, %xmm0 3872; SSE2-NEXT: por %xmm0, %xmm1 3873; SSE2-NEXT: testb $2, %al 3874; SSE2-NEXT: je LBB23_4 3875; SSE2-NEXT: LBB23_3: ## %cond.load1 3876; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 3877; SSE2-NEXT: pand %xmm0, %xmm1 3878; SSE2-NEXT: movzbl 1(%rdi), %ecx 3879; SSE2-NEXT: movd %ecx, %xmm2 3880; SSE2-NEXT: psllw $8, %xmm2 3881; SSE2-NEXT: pandn %xmm2, %xmm0 3882; SSE2-NEXT: por %xmm0, %xmm1 3883; SSE2-NEXT: testb $4, %al 3884; SSE2-NEXT: je LBB23_6 3885; SSE2-NEXT: LBB23_5: ## %cond.load4 3886; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] 3887; SSE2-NEXT: pand %xmm0, %xmm1 3888; SSE2-NEXT: movzbl 2(%rdi), %ecx 3889; SSE2-NEXT: movd %ecx, %xmm2 3890; SSE2-NEXT: pslld $16, %xmm2 3891; SSE2-NEXT: pandn %xmm2, %xmm0 3892; SSE2-NEXT: por %xmm0, %xmm1 3893; SSE2-NEXT: testb $8, %al 3894; SSE2-NEXT: je LBB23_8 3895; SSE2-NEXT: LBB23_7: ## %cond.load7 3896; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] 3897; SSE2-NEXT: pand %xmm0, %xmm1 3898; SSE2-NEXT: movzbl 3(%rdi), %ecx 3899; SSE2-NEXT: movd %ecx, %xmm2 3900; SSE2-NEXT: pslld $24, %xmm2 3901; SSE2-NEXT: pandn %xmm2, %xmm0 3902; SSE2-NEXT: por %xmm0, %xmm1 3903; SSE2-NEXT: testb $16, %al 3904; SSE2-NEXT: je LBB23_10 3905; SSE2-NEXT: LBB23_9: ## %cond.load10 3906; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] 3907; SSE2-NEXT: pand %xmm0, %xmm1 3908; SSE2-NEXT: movzbl 4(%rdi), %ecx 3909; SSE2-NEXT: movd %ecx, %xmm2 3910; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] 3911; SSE2-NEXT: pandn %xmm2, %xmm0 3912; SSE2-NEXT: por %xmm0, %xmm1 3913; SSE2-NEXT: testb $32, %al 3914; SSE2-NEXT: je LBB23_12 3915; SSE2-NEXT: LBB23_11: ## %cond.load13 3916; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] 3917; SSE2-NEXT: pand %xmm0, %xmm1 3918; SSE2-NEXT: movzbl 
5(%rdi), %ecx 3919; SSE2-NEXT: movd %ecx, %xmm2 3920; SSE2-NEXT: psllq $40, %xmm2 3921; SSE2-NEXT: pandn %xmm2, %xmm0 3922; SSE2-NEXT: por %xmm0, %xmm1 3923; SSE2-NEXT: testb $64, %al 3924; SSE2-NEXT: je LBB23_14 3925; SSE2-NEXT: LBB23_13: ## %cond.load16 3926; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] 3927; SSE2-NEXT: pand %xmm0, %xmm1 3928; SSE2-NEXT: movzbl 6(%rdi), %ecx 3929; SSE2-NEXT: movd %ecx, %xmm2 3930; SSE2-NEXT: psllq $48, %xmm2 3931; SSE2-NEXT: pandn %xmm2, %xmm0 3932; SSE2-NEXT: por %xmm0, %xmm1 3933; SSE2-NEXT: testb %al, %al 3934; SSE2-NEXT: jns LBB23_16 3935; SSE2-NEXT: LBB23_15: ## %cond.load19 3936; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] 3937; SSE2-NEXT: pand %xmm0, %xmm1 3938; SSE2-NEXT: movzbl 7(%rdi), %ecx 3939; SSE2-NEXT: movd %ecx, %xmm2 3940; SSE2-NEXT: psllq $56, %xmm2 3941; SSE2-NEXT: pandn %xmm2, %xmm0 3942; SSE2-NEXT: por %xmm0, %xmm1 3943; SSE2-NEXT: testl $256, %eax ## imm = 0x100 3944; SSE2-NEXT: je LBB23_18 3945; SSE2-NEXT: LBB23_17: ## %cond.load22 3946; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 3947; SSE2-NEXT: pand %xmm0, %xmm1 3948; SSE2-NEXT: movzbl 8(%rdi), %ecx 3949; SSE2-NEXT: movd %ecx, %xmm2 3950; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] 3951; SSE2-NEXT: pandn %xmm2, %xmm0 3952; SSE2-NEXT: por %xmm0, %xmm1 3953; SSE2-NEXT: testl $512, %eax ## imm = 0x200 3954; SSE2-NEXT: je LBB23_20 3955; SSE2-NEXT: LBB23_19: ## %cond.load25 3956; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] 3957; SSE2-NEXT: pand %xmm0, %xmm1 3958; SSE2-NEXT: movzbl 9(%rdi), %ecx 3959; SSE2-NEXT: movd %ecx, %xmm2 3960; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] 3961; SSE2-NEXT: pandn %xmm2, %xmm0 3962; SSE2-NEXT: por %xmm0, %xmm1 3963; SSE2-NEXT: testl $1024, %eax ## imm = 
0x400 3964; SSE2-NEXT: je LBB23_22 3965; SSE2-NEXT: LBB23_21: ## %cond.load28 3966; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] 3967; SSE2-NEXT: pand %xmm0, %xmm1 3968; SSE2-NEXT: movzbl 10(%rdi), %ecx 3969; SSE2-NEXT: movd %ecx, %xmm2 3970; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] 3971; SSE2-NEXT: pandn %xmm2, %xmm0 3972; SSE2-NEXT: por %xmm0, %xmm1 3973; SSE2-NEXT: testl $2048, %eax ## imm = 0x800 3974; SSE2-NEXT: je LBB23_24 3975; SSE2-NEXT: LBB23_23: ## %cond.load31 3976; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] 3977; SSE2-NEXT: pand %xmm0, %xmm1 3978; SSE2-NEXT: movzbl 11(%rdi), %ecx 3979; SSE2-NEXT: movd %ecx, %xmm2 3980; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] 3981; SSE2-NEXT: pandn %xmm2, %xmm0 3982; SSE2-NEXT: por %xmm0, %xmm1 3983; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 3984; SSE2-NEXT: je LBB23_26 3985; SSE2-NEXT: LBB23_25: ## %cond.load34 3986; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] 3987; SSE2-NEXT: pand %xmm0, %xmm1 3988; SSE2-NEXT: movzbl 12(%rdi), %ecx 3989; SSE2-NEXT: movd %ecx, %xmm2 3990; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 3991; SSE2-NEXT: pandn %xmm2, %xmm0 3992; SSE2-NEXT: por %xmm0, %xmm1 3993; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000 3994; SSE2-NEXT: je LBB23_28 3995; SSE2-NEXT: LBB23_27: ## %cond.load37 3996; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] 3997; SSE2-NEXT: pand %xmm0, %xmm1 3998; SSE2-NEXT: movzbl 13(%rdi), %ecx 3999; SSE2-NEXT: movd %ecx, %xmm2 4000; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] 4001; SSE2-NEXT: pandn %xmm2, %xmm0 4002; SSE2-NEXT: por %xmm0, %xmm1 4003; SSE2-NEXT: testl $16384, %eax 
## imm = 0x4000 4004; SSE2-NEXT: je LBB23_30 4005; SSE2-NEXT: LBB23_29: ## %cond.load40 4006; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] 4007; SSE2-NEXT: pand %xmm0, %xmm1 4008; SSE2-NEXT: movzbl 14(%rdi), %ecx 4009; SSE2-NEXT: movd %ecx, %xmm2 4010; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] 4011; SSE2-NEXT: pandn %xmm2, %xmm0 4012; SSE2-NEXT: por %xmm0, %xmm1 4013; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000 4014; SSE2-NEXT: je LBB23_32 4015; SSE2-NEXT: LBB23_31: ## %cond.load43 4016; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 4017; SSE2-NEXT: movzbl 15(%rdi), %eax 4018; SSE2-NEXT: movd %eax, %xmm0 4019; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] 4020; SSE2-NEXT: por %xmm0, %xmm1 4021; SSE2-NEXT: movdqa %xmm1, %xmm0 4022; SSE2-NEXT: retq 4023; 4024; SSE42-LABEL: load_v16i8_v16i8: 4025; SSE42: ## %bb.0: 4026; SSE42-NEXT: pmovmskb %xmm0, %eax 4027; SSE42-NEXT: testb $1, %al 4028; SSE42-NEXT: jne LBB23_1 4029; SSE42-NEXT: ## %bb.2: ## %else 4030; SSE42-NEXT: testb $2, %al 4031; SSE42-NEXT: jne LBB23_3 4032; SSE42-NEXT: LBB23_4: ## %else2 4033; SSE42-NEXT: testb $4, %al 4034; SSE42-NEXT: jne LBB23_5 4035; SSE42-NEXT: LBB23_6: ## %else5 4036; SSE42-NEXT: testb $8, %al 4037; SSE42-NEXT: jne LBB23_7 4038; SSE42-NEXT: LBB23_8: ## %else8 4039; SSE42-NEXT: testb $16, %al 4040; SSE42-NEXT: jne LBB23_9 4041; SSE42-NEXT: LBB23_10: ## %else11 4042; SSE42-NEXT: testb $32, %al 4043; SSE42-NEXT: jne LBB23_11 4044; SSE42-NEXT: LBB23_12: ## %else14 4045; SSE42-NEXT: testb $64, %al 4046; SSE42-NEXT: jne LBB23_13 4047; SSE42-NEXT: LBB23_14: ## %else17 4048; SSE42-NEXT: testb %al, %al 4049; SSE42-NEXT: js LBB23_15 4050; SSE42-NEXT: LBB23_16: ## %else20 4051; SSE42-NEXT: testl $256, %eax ## imm = 0x100 4052; SSE42-NEXT: jne LBB23_17 4053; SSE42-NEXT: LBB23_18: ## %else23 4054; 
SSE42-NEXT: testl $512, %eax ## imm = 0x200 4055; SSE42-NEXT: jne LBB23_19 4056; SSE42-NEXT: LBB23_20: ## %else26 4057; SSE42-NEXT: testl $1024, %eax ## imm = 0x400 4058; SSE42-NEXT: jne LBB23_21 4059; SSE42-NEXT: LBB23_22: ## %else29 4060; SSE42-NEXT: testl $2048, %eax ## imm = 0x800 4061; SSE42-NEXT: jne LBB23_23 4062; SSE42-NEXT: LBB23_24: ## %else32 4063; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000 4064; SSE42-NEXT: jne LBB23_25 4065; SSE42-NEXT: LBB23_26: ## %else35 4066; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000 4067; SSE42-NEXT: jne LBB23_27 4068; SSE42-NEXT: LBB23_28: ## %else38 4069; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000 4070; SSE42-NEXT: jne LBB23_29 4071; SSE42-NEXT: LBB23_30: ## %else41 4072; SSE42-NEXT: testl $32768, %eax ## imm = 0x8000 4073; SSE42-NEXT: jne LBB23_31 4074; SSE42-NEXT: LBB23_32: ## %else44 4075; SSE42-NEXT: movdqa %xmm1, %xmm0 4076; SSE42-NEXT: retq 4077; SSE42-NEXT: LBB23_1: ## %cond.load 4078; SSE42-NEXT: pinsrb $0, (%rdi), %xmm1 4079; SSE42-NEXT: testb $2, %al 4080; SSE42-NEXT: je LBB23_4 4081; SSE42-NEXT: LBB23_3: ## %cond.load1 4082; SSE42-NEXT: pinsrb $1, 1(%rdi), %xmm1 4083; SSE42-NEXT: testb $4, %al 4084; SSE42-NEXT: je LBB23_6 4085; SSE42-NEXT: LBB23_5: ## %cond.load4 4086; SSE42-NEXT: pinsrb $2, 2(%rdi), %xmm1 4087; SSE42-NEXT: testb $8, %al 4088; SSE42-NEXT: je LBB23_8 4089; SSE42-NEXT: LBB23_7: ## %cond.load7 4090; SSE42-NEXT: pinsrb $3, 3(%rdi), %xmm1 4091; SSE42-NEXT: testb $16, %al 4092; SSE42-NEXT: je LBB23_10 4093; SSE42-NEXT: LBB23_9: ## %cond.load10 4094; SSE42-NEXT: pinsrb $4, 4(%rdi), %xmm1 4095; SSE42-NEXT: testb $32, %al 4096; SSE42-NEXT: je LBB23_12 4097; SSE42-NEXT: LBB23_11: ## %cond.load13 4098; SSE42-NEXT: pinsrb $5, 5(%rdi), %xmm1 4099; SSE42-NEXT: testb $64, %al 4100; SSE42-NEXT: je LBB23_14 4101; SSE42-NEXT: LBB23_13: ## %cond.load16 4102; SSE42-NEXT: pinsrb $6, 6(%rdi), %xmm1 4103; SSE42-NEXT: testb %al, %al 4104; SSE42-NEXT: jns LBB23_16 4105; SSE42-NEXT: LBB23_15: ## %cond.load19 4106; 
SSE42-NEXT: pinsrb $7, 7(%rdi), %xmm1 4107; SSE42-NEXT: testl $256, %eax ## imm = 0x100 4108; SSE42-NEXT: je LBB23_18 4109; SSE42-NEXT: LBB23_17: ## %cond.load22 4110; SSE42-NEXT: pinsrb $8, 8(%rdi), %xmm1 4111; SSE42-NEXT: testl $512, %eax ## imm = 0x200 4112; SSE42-NEXT: je LBB23_20 4113; SSE42-NEXT: LBB23_19: ## %cond.load25 4114; SSE42-NEXT: pinsrb $9, 9(%rdi), %xmm1 4115; SSE42-NEXT: testl $1024, %eax ## imm = 0x400 4116; SSE42-NEXT: je LBB23_22 4117; SSE42-NEXT: LBB23_21: ## %cond.load28 4118; SSE42-NEXT: pinsrb $10, 10(%rdi), %xmm1 4119; SSE42-NEXT: testl $2048, %eax ## imm = 0x800 4120; SSE42-NEXT: je LBB23_24 4121; SSE42-NEXT: LBB23_23: ## %cond.load31 4122; SSE42-NEXT: pinsrb $11, 11(%rdi), %xmm1 4123; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000 4124; SSE42-NEXT: je LBB23_26 4125; SSE42-NEXT: LBB23_25: ## %cond.load34 4126; SSE42-NEXT: pinsrb $12, 12(%rdi), %xmm1 4127; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000 4128; SSE42-NEXT: je LBB23_28 4129; SSE42-NEXT: LBB23_27: ## %cond.load37 4130; SSE42-NEXT: pinsrb $13, 13(%rdi), %xmm1 4131; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000 4132; SSE42-NEXT: je LBB23_30 4133; SSE42-NEXT: LBB23_29: ## %cond.load40 4134; SSE42-NEXT: pinsrb $14, 14(%rdi), %xmm1 4135; SSE42-NEXT: testl $32768, %eax ## imm = 0x8000 4136; SSE42-NEXT: je LBB23_32 4137; SSE42-NEXT: LBB23_31: ## %cond.load43 4138; SSE42-NEXT: pinsrb $15, 15(%rdi), %xmm1 4139; SSE42-NEXT: movdqa %xmm1, %xmm0 4140; SSE42-NEXT: retq 4141; 4142; AVX1OR2-LABEL: load_v16i8_v16i8: 4143; AVX1OR2: ## %bb.0: 4144; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax 4145; AVX1OR2-NEXT: testb $1, %al 4146; AVX1OR2-NEXT: jne LBB23_1 4147; AVX1OR2-NEXT: ## %bb.2: ## %else 4148; AVX1OR2-NEXT: testb $2, %al 4149; AVX1OR2-NEXT: jne LBB23_3 4150; AVX1OR2-NEXT: LBB23_4: ## %else2 4151; AVX1OR2-NEXT: testb $4, %al 4152; AVX1OR2-NEXT: jne LBB23_5 4153; AVX1OR2-NEXT: LBB23_6: ## %else5 4154; AVX1OR2-NEXT: testb $8, %al 4155; AVX1OR2-NEXT: jne LBB23_7 4156; AVX1OR2-NEXT: LBB23_8: ## %else8 
4157; AVX1OR2-NEXT: testb $16, %al 4158; AVX1OR2-NEXT: jne LBB23_9 4159; AVX1OR2-NEXT: LBB23_10: ## %else11 4160; AVX1OR2-NEXT: testb $32, %al 4161; AVX1OR2-NEXT: jne LBB23_11 4162; AVX1OR2-NEXT: LBB23_12: ## %else14 4163; AVX1OR2-NEXT: testb $64, %al 4164; AVX1OR2-NEXT: jne LBB23_13 4165; AVX1OR2-NEXT: LBB23_14: ## %else17 4166; AVX1OR2-NEXT: testb %al, %al 4167; AVX1OR2-NEXT: js LBB23_15 4168; AVX1OR2-NEXT: LBB23_16: ## %else20 4169; AVX1OR2-NEXT: testl $256, %eax ## imm = 0x100 4170; AVX1OR2-NEXT: jne LBB23_17 4171; AVX1OR2-NEXT: LBB23_18: ## %else23 4172; AVX1OR2-NEXT: testl $512, %eax ## imm = 0x200 4173; AVX1OR2-NEXT: jne LBB23_19 4174; AVX1OR2-NEXT: LBB23_20: ## %else26 4175; AVX1OR2-NEXT: testl $1024, %eax ## imm = 0x400 4176; AVX1OR2-NEXT: jne LBB23_21 4177; AVX1OR2-NEXT: LBB23_22: ## %else29 4178; AVX1OR2-NEXT: testl $2048, %eax ## imm = 0x800 4179; AVX1OR2-NEXT: jne LBB23_23 4180; AVX1OR2-NEXT: LBB23_24: ## %else32 4181; AVX1OR2-NEXT: testl $4096, %eax ## imm = 0x1000 4182; AVX1OR2-NEXT: jne LBB23_25 4183; AVX1OR2-NEXT: LBB23_26: ## %else35 4184; AVX1OR2-NEXT: testl $8192, %eax ## imm = 0x2000 4185; AVX1OR2-NEXT: jne LBB23_27 4186; AVX1OR2-NEXT: LBB23_28: ## %else38 4187; AVX1OR2-NEXT: testl $16384, %eax ## imm = 0x4000 4188; AVX1OR2-NEXT: jne LBB23_29 4189; AVX1OR2-NEXT: LBB23_30: ## %else41 4190; AVX1OR2-NEXT: testl $32768, %eax ## imm = 0x8000 4191; AVX1OR2-NEXT: jne LBB23_31 4192; AVX1OR2-NEXT: LBB23_32: ## %else44 4193; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0 4194; AVX1OR2-NEXT: retq 4195; AVX1OR2-NEXT: LBB23_1: ## %cond.load 4196; AVX1OR2-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm1 4197; AVX1OR2-NEXT: testb $2, %al 4198; AVX1OR2-NEXT: je LBB23_4 4199; AVX1OR2-NEXT: LBB23_3: ## %cond.load1 4200; AVX1OR2-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1 4201; AVX1OR2-NEXT: testb $4, %al 4202; AVX1OR2-NEXT: je LBB23_6 4203; AVX1OR2-NEXT: LBB23_5: ## %cond.load4 4204; AVX1OR2-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1 4205; AVX1OR2-NEXT: testb $8, %al 4206; AVX1OR2-NEXT: je 
LBB23_8 4207; AVX1OR2-NEXT: LBB23_7: ## %cond.load7 4208; AVX1OR2-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1 4209; AVX1OR2-NEXT: testb $16, %al 4210; AVX1OR2-NEXT: je LBB23_10 4211; AVX1OR2-NEXT: LBB23_9: ## %cond.load10 4212; AVX1OR2-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1 4213; AVX1OR2-NEXT: testb $32, %al 4214; AVX1OR2-NEXT: je LBB23_12 4215; AVX1OR2-NEXT: LBB23_11: ## %cond.load13 4216; AVX1OR2-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1 4217; AVX1OR2-NEXT: testb $64, %al 4218; AVX1OR2-NEXT: je LBB23_14 4219; AVX1OR2-NEXT: LBB23_13: ## %cond.load16 4220; AVX1OR2-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1 4221; AVX1OR2-NEXT: testb %al, %al 4222; AVX1OR2-NEXT: jns LBB23_16 4223; AVX1OR2-NEXT: LBB23_15: ## %cond.load19 4224; AVX1OR2-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1 4225; AVX1OR2-NEXT: testl $256, %eax ## imm = 0x100 4226; AVX1OR2-NEXT: je LBB23_18 4227; AVX1OR2-NEXT: LBB23_17: ## %cond.load22 4228; AVX1OR2-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1 4229; AVX1OR2-NEXT: testl $512, %eax ## imm = 0x200 4230; AVX1OR2-NEXT: je LBB23_20 4231; AVX1OR2-NEXT: LBB23_19: ## %cond.load25 4232; AVX1OR2-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1 4233; AVX1OR2-NEXT: testl $1024, %eax ## imm = 0x400 4234; AVX1OR2-NEXT: je LBB23_22 4235; AVX1OR2-NEXT: LBB23_21: ## %cond.load28 4236; AVX1OR2-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1 4237; AVX1OR2-NEXT: testl $2048, %eax ## imm = 0x800 4238; AVX1OR2-NEXT: je LBB23_24 4239; AVX1OR2-NEXT: LBB23_23: ## %cond.load31 4240; AVX1OR2-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1 4241; AVX1OR2-NEXT: testl $4096, %eax ## imm = 0x1000 4242; AVX1OR2-NEXT: je LBB23_26 4243; AVX1OR2-NEXT: LBB23_25: ## %cond.load34 4244; AVX1OR2-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1 4245; AVX1OR2-NEXT: testl $8192, %eax ## imm = 0x2000 4246; AVX1OR2-NEXT: je LBB23_28 4247; AVX1OR2-NEXT: LBB23_27: ## %cond.load37 4248; AVX1OR2-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1 4249; AVX1OR2-NEXT: testl $16384, %eax ## imm = 0x4000 4250; AVX1OR2-NEXT: je LBB23_30 4251; AVX1OR2-NEXT: 
LBB23_29: ## %cond.load40 4252; AVX1OR2-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1 4253; AVX1OR2-NEXT: testl $32768, %eax ## imm = 0x8000 4254; AVX1OR2-NEXT: je LBB23_32 4255; AVX1OR2-NEXT: LBB23_31: ## %cond.load43 4256; AVX1OR2-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1 4257; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0 4258; AVX1OR2-NEXT: retq 4259; 4260; AVX512F-LABEL: load_v16i8_v16i8: 4261; AVX512F: ## %bb.0: 4262; AVX512F-NEXT: vpmovmskb %xmm0, %eax 4263; AVX512F-NEXT: testb $1, %al 4264; AVX512F-NEXT: jne LBB23_1 4265; AVX512F-NEXT: ## %bb.2: ## %else 4266; AVX512F-NEXT: testb $2, %al 4267; AVX512F-NEXT: jne LBB23_3 4268; AVX512F-NEXT: LBB23_4: ## %else2 4269; AVX512F-NEXT: testb $4, %al 4270; AVX512F-NEXT: jne LBB23_5 4271; AVX512F-NEXT: LBB23_6: ## %else5 4272; AVX512F-NEXT: testb $8, %al 4273; AVX512F-NEXT: jne LBB23_7 4274; AVX512F-NEXT: LBB23_8: ## %else8 4275; AVX512F-NEXT: testb $16, %al 4276; AVX512F-NEXT: jne LBB23_9 4277; AVX512F-NEXT: LBB23_10: ## %else11 4278; AVX512F-NEXT: testb $32, %al 4279; AVX512F-NEXT: jne LBB23_11 4280; AVX512F-NEXT: LBB23_12: ## %else14 4281; AVX512F-NEXT: testb $64, %al 4282; AVX512F-NEXT: jne LBB23_13 4283; AVX512F-NEXT: LBB23_14: ## %else17 4284; AVX512F-NEXT: testb %al, %al 4285; AVX512F-NEXT: js LBB23_15 4286; AVX512F-NEXT: LBB23_16: ## %else20 4287; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 4288; AVX512F-NEXT: jne LBB23_17 4289; AVX512F-NEXT: LBB23_18: ## %else23 4290; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 4291; AVX512F-NEXT: jne LBB23_19 4292; AVX512F-NEXT: LBB23_20: ## %else26 4293; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 4294; AVX512F-NEXT: jne LBB23_21 4295; AVX512F-NEXT: LBB23_22: ## %else29 4296; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800 4297; AVX512F-NEXT: jne LBB23_23 4298; AVX512F-NEXT: LBB23_24: ## %else32 4299; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 4300; AVX512F-NEXT: jne LBB23_25 4301; AVX512F-NEXT: LBB23_26: ## %else35 4302; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 4303; 
AVX512F-NEXT: jne LBB23_27 4304; AVX512F-NEXT: LBB23_28: ## %else38 4305; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 4306; AVX512F-NEXT: jne LBB23_29 4307; AVX512F-NEXT: LBB23_30: ## %else41 4308; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000 4309; AVX512F-NEXT: jne LBB23_31 4310; AVX512F-NEXT: LBB23_32: ## %else44 4311; AVX512F-NEXT: vmovdqa %xmm1, %xmm0 4312; AVX512F-NEXT: retq 4313; AVX512F-NEXT: LBB23_1: ## %cond.load 4314; AVX512F-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm1 4315; AVX512F-NEXT: testb $2, %al 4316; AVX512F-NEXT: je LBB23_4 4317; AVX512F-NEXT: LBB23_3: ## %cond.load1 4318; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1 4319; AVX512F-NEXT: testb $4, %al 4320; AVX512F-NEXT: je LBB23_6 4321; AVX512F-NEXT: LBB23_5: ## %cond.load4 4322; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1 4323; AVX512F-NEXT: testb $8, %al 4324; AVX512F-NEXT: je LBB23_8 4325; AVX512F-NEXT: LBB23_7: ## %cond.load7 4326; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1 4327; AVX512F-NEXT: testb $16, %al 4328; AVX512F-NEXT: je LBB23_10 4329; AVX512F-NEXT: LBB23_9: ## %cond.load10 4330; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1 4331; AVX512F-NEXT: testb $32, %al 4332; AVX512F-NEXT: je LBB23_12 4333; AVX512F-NEXT: LBB23_11: ## %cond.load13 4334; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1 4335; AVX512F-NEXT: testb $64, %al 4336; AVX512F-NEXT: je LBB23_14 4337; AVX512F-NEXT: LBB23_13: ## %cond.load16 4338; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1 4339; AVX512F-NEXT: testb %al, %al 4340; AVX512F-NEXT: jns LBB23_16 4341; AVX512F-NEXT: LBB23_15: ## %cond.load19 4342; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1 4343; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 4344; AVX512F-NEXT: je LBB23_18 4345; AVX512F-NEXT: LBB23_17: ## %cond.load22 4346; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1 4347; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 4348; AVX512F-NEXT: je LBB23_20 4349; AVX512F-NEXT: LBB23_19: ## %cond.load25 4350; AVX512F-NEXT: vpinsrb $9, 9(%rdi), 
%xmm1, %xmm1 4351; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 4352; AVX512F-NEXT: je LBB23_22 4353; AVX512F-NEXT: LBB23_21: ## %cond.load28 4354; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1 4355; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800 4356; AVX512F-NEXT: je LBB23_24 4357; AVX512F-NEXT: LBB23_23: ## %cond.load31 4358; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1 4359; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 4360; AVX512F-NEXT: je LBB23_26 4361; AVX512F-NEXT: LBB23_25: ## %cond.load34 4362; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1 4363; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 4364; AVX512F-NEXT: je LBB23_28 4365; AVX512F-NEXT: LBB23_27: ## %cond.load37 4366; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1 4367; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 4368; AVX512F-NEXT: je LBB23_30 4369; AVX512F-NEXT: LBB23_29: ## %cond.load40 4370; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1 4371; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000 4372; AVX512F-NEXT: je LBB23_32 4373; AVX512F-NEXT: LBB23_31: ## %cond.load43 4374; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1 4375; AVX512F-NEXT: vmovdqa %xmm1, %xmm0 4376; AVX512F-NEXT: retq 4377; 4378; AVX512VLDQ-LABEL: load_v16i8_v16i8: 4379; AVX512VLDQ: ## %bb.0: 4380; AVX512VLDQ-NEXT: vpmovmskb %xmm0, %eax 4381; AVX512VLDQ-NEXT: testb $1, %al 4382; AVX512VLDQ-NEXT: jne LBB23_1 4383; AVX512VLDQ-NEXT: ## %bb.2: ## %else 4384; AVX512VLDQ-NEXT: testb $2, %al 4385; AVX512VLDQ-NEXT: jne LBB23_3 4386; AVX512VLDQ-NEXT: LBB23_4: ## %else2 4387; AVX512VLDQ-NEXT: testb $4, %al 4388; AVX512VLDQ-NEXT: jne LBB23_5 4389; AVX512VLDQ-NEXT: LBB23_6: ## %else5 4390; AVX512VLDQ-NEXT: testb $8, %al 4391; AVX512VLDQ-NEXT: jne LBB23_7 4392; AVX512VLDQ-NEXT: LBB23_8: ## %else8 4393; AVX512VLDQ-NEXT: testb $16, %al 4394; AVX512VLDQ-NEXT: jne LBB23_9 4395; AVX512VLDQ-NEXT: LBB23_10: ## %else11 4396; AVX512VLDQ-NEXT: testb $32, %al 4397; AVX512VLDQ-NEXT: jne LBB23_11 4398; AVX512VLDQ-NEXT: 
LBB23_12: ## %else14 4399; AVX512VLDQ-NEXT: testb $64, %al 4400; AVX512VLDQ-NEXT: jne LBB23_13 4401; AVX512VLDQ-NEXT: LBB23_14: ## %else17 4402; AVX512VLDQ-NEXT: testb %al, %al 4403; AVX512VLDQ-NEXT: js LBB23_15 4404; AVX512VLDQ-NEXT: LBB23_16: ## %else20 4405; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100 4406; AVX512VLDQ-NEXT: jne LBB23_17 4407; AVX512VLDQ-NEXT: LBB23_18: ## %else23 4408; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200 4409; AVX512VLDQ-NEXT: jne LBB23_19 4410; AVX512VLDQ-NEXT: LBB23_20: ## %else26 4411; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400 4412; AVX512VLDQ-NEXT: jne LBB23_21 4413; AVX512VLDQ-NEXT: LBB23_22: ## %else29 4414; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800 4415; AVX512VLDQ-NEXT: jne LBB23_23 4416; AVX512VLDQ-NEXT: LBB23_24: ## %else32 4417; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000 4418; AVX512VLDQ-NEXT: jne LBB23_25 4419; AVX512VLDQ-NEXT: LBB23_26: ## %else35 4420; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000 4421; AVX512VLDQ-NEXT: jne LBB23_27 4422; AVX512VLDQ-NEXT: LBB23_28: ## %else38 4423; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000 4424; AVX512VLDQ-NEXT: jne LBB23_29 4425; AVX512VLDQ-NEXT: LBB23_30: ## %else41 4426; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000 4427; AVX512VLDQ-NEXT: jne LBB23_31 4428; AVX512VLDQ-NEXT: LBB23_32: ## %else44 4429; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0 4430; AVX512VLDQ-NEXT: retq 4431; AVX512VLDQ-NEXT: LBB23_1: ## %cond.load 4432; AVX512VLDQ-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm1 4433; AVX512VLDQ-NEXT: testb $2, %al 4434; AVX512VLDQ-NEXT: je LBB23_4 4435; AVX512VLDQ-NEXT: LBB23_3: ## %cond.load1 4436; AVX512VLDQ-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1 4437; AVX512VLDQ-NEXT: testb $4, %al 4438; AVX512VLDQ-NEXT: je LBB23_6 4439; AVX512VLDQ-NEXT: LBB23_5: ## %cond.load4 4440; AVX512VLDQ-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1 4441; AVX512VLDQ-NEXT: testb $8, %al 4442; AVX512VLDQ-NEXT: je LBB23_8 4443; AVX512VLDQ-NEXT: LBB23_7: ## %cond.load7 4444; 
AVX512VLDQ-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1 4445; AVX512VLDQ-NEXT: testb $16, %al 4446; AVX512VLDQ-NEXT: je LBB23_10 4447; AVX512VLDQ-NEXT: LBB23_9: ## %cond.load10 4448; AVX512VLDQ-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1 4449; AVX512VLDQ-NEXT: testb $32, %al 4450; AVX512VLDQ-NEXT: je LBB23_12 4451; AVX512VLDQ-NEXT: LBB23_11: ## %cond.load13 4452; AVX512VLDQ-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1 4453; AVX512VLDQ-NEXT: testb $64, %al 4454; AVX512VLDQ-NEXT: je LBB23_14 4455; AVX512VLDQ-NEXT: LBB23_13: ## %cond.load16 4456; AVX512VLDQ-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1 4457; AVX512VLDQ-NEXT: testb %al, %al 4458; AVX512VLDQ-NEXT: jns LBB23_16 4459; AVX512VLDQ-NEXT: LBB23_15: ## %cond.load19 4460; AVX512VLDQ-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1 4461; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100 4462; AVX512VLDQ-NEXT: je LBB23_18 4463; AVX512VLDQ-NEXT: LBB23_17: ## %cond.load22 4464; AVX512VLDQ-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1 4465; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200 4466; AVX512VLDQ-NEXT: je LBB23_20 4467; AVX512VLDQ-NEXT: LBB23_19: ## %cond.load25 4468; AVX512VLDQ-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1 4469; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400 4470; AVX512VLDQ-NEXT: je LBB23_22 4471; AVX512VLDQ-NEXT: LBB23_21: ## %cond.load28 4472; AVX512VLDQ-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1 4473; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800 4474; AVX512VLDQ-NEXT: je LBB23_24 4475; AVX512VLDQ-NEXT: LBB23_23: ## %cond.load31 4476; AVX512VLDQ-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1 4477; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000 4478; AVX512VLDQ-NEXT: je LBB23_26 4479; AVX512VLDQ-NEXT: LBB23_25: ## %cond.load34 4480; AVX512VLDQ-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1 4481; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000 4482; AVX512VLDQ-NEXT: je LBB23_28 4483; AVX512VLDQ-NEXT: LBB23_27: ## %cond.load37 4484; AVX512VLDQ-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1 4485; AVX512VLDQ-NEXT: testl $16384, %eax 
## imm = 0x4000 4486; AVX512VLDQ-NEXT: je LBB23_30 4487; AVX512VLDQ-NEXT: LBB23_29: ## %cond.load40 4488; AVX512VLDQ-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1 4489; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000 4490; AVX512VLDQ-NEXT: je LBB23_32 4491; AVX512VLDQ-NEXT: LBB23_31: ## %cond.load43 4492; AVX512VLDQ-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1 4493; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0 4494; AVX512VLDQ-NEXT: retq 4495; 4496; AVX512VLBW-LABEL: load_v16i8_v16i8: 4497; AVX512VLBW: ## %bb.0: 4498; AVX512VLBW-NEXT: vpmovb2m %xmm0, %k1 4499; AVX512VLBW-NEXT: vpblendmb (%rdi), %xmm1, %xmm0 {%k1} 4500; AVX512VLBW-NEXT: retq 4501; 4502; X86-AVX512-LABEL: load_v16i8_v16i8: 4503; X86-AVX512: ## %bb.0: 4504; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 4505; X86-AVX512-NEXT: vpmovb2m %xmm0, %k1 4506; X86-AVX512-NEXT: vpblendmb (%eax), %xmm1, %xmm0 {%k1} 4507; X86-AVX512-NEXT: retl 4508 %mask = icmp slt <16 x i8> %trigger, zeroinitializer 4509 %res = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x i8> %dst) 4510 ret <16 x i8> %res 4511} 4512 4513define <32 x i8> @load_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %dst) { 4514; SSE2-LABEL: load_v32i8_v32i8: 4515; SSE2: ## %bb.0: 4516; SSE2-NEXT: pmovmskb %xmm0, %ecx 4517; SSE2-NEXT: pmovmskb %xmm1, %eax 4518; SSE2-NEXT: shll $16, %eax 4519; SSE2-NEXT: orl %ecx, %eax 4520; SSE2-NEXT: testb $1, %al 4521; SSE2-NEXT: jne LBB24_1 4522; SSE2-NEXT: ## %bb.2: ## %else 4523; SSE2-NEXT: testb $2, %al 4524; SSE2-NEXT: jne LBB24_3 4525; SSE2-NEXT: LBB24_4: ## %else2 4526; SSE2-NEXT: testb $4, %al 4527; SSE2-NEXT: jne LBB24_5 4528; SSE2-NEXT: LBB24_6: ## %else5 4529; SSE2-NEXT: testb $8, %al 4530; SSE2-NEXT: jne LBB24_7 4531; SSE2-NEXT: LBB24_8: ## %else8 4532; SSE2-NEXT: testb $16, %al 4533; SSE2-NEXT: jne LBB24_9 4534; SSE2-NEXT: LBB24_10: ## %else11 4535; SSE2-NEXT: testb $32, %al 4536; SSE2-NEXT: jne LBB24_11 4537; SSE2-NEXT: LBB24_12: ## %else14 4538; SSE2-NEXT: testb $64, %al 
4539; SSE2-NEXT: jne LBB24_13 4540; SSE2-NEXT: LBB24_14: ## %else17 4541; SSE2-NEXT: testb %al, %al 4542; SSE2-NEXT: js LBB24_15 4543; SSE2-NEXT: LBB24_16: ## %else20 4544; SSE2-NEXT: testl $256, %eax ## imm = 0x100 4545; SSE2-NEXT: jne LBB24_17 4546; SSE2-NEXT: LBB24_18: ## %else23 4547; SSE2-NEXT: testl $512, %eax ## imm = 0x200 4548; SSE2-NEXT: jne LBB24_19 4549; SSE2-NEXT: LBB24_20: ## %else26 4550; SSE2-NEXT: testl $1024, %eax ## imm = 0x400 4551; SSE2-NEXT: jne LBB24_21 4552; SSE2-NEXT: LBB24_22: ## %else29 4553; SSE2-NEXT: testl $2048, %eax ## imm = 0x800 4554; SSE2-NEXT: jne LBB24_23 4555; SSE2-NEXT: LBB24_24: ## %else32 4556; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 4557; SSE2-NEXT: jne LBB24_25 4558; SSE2-NEXT: LBB24_26: ## %else35 4559; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000 4560; SSE2-NEXT: jne LBB24_27 4561; SSE2-NEXT: LBB24_28: ## %else38 4562; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000 4563; SSE2-NEXT: jne LBB24_29 4564; SSE2-NEXT: LBB24_30: ## %else41 4565; SSE2-NEXT: testw %ax, %ax 4566; SSE2-NEXT: js LBB24_31 4567; SSE2-NEXT: LBB24_32: ## %else44 4568; SSE2-NEXT: testl $65536, %eax ## imm = 0x10000 4569; SSE2-NEXT: jne LBB24_33 4570; SSE2-NEXT: LBB24_34: ## %else47 4571; SSE2-NEXT: testl $131072, %eax ## imm = 0x20000 4572; SSE2-NEXT: jne LBB24_35 4573; SSE2-NEXT: LBB24_36: ## %else50 4574; SSE2-NEXT: testl $262144, %eax ## imm = 0x40000 4575; SSE2-NEXT: jne LBB24_37 4576; SSE2-NEXT: LBB24_38: ## %else53 4577; SSE2-NEXT: testl $524288, %eax ## imm = 0x80000 4578; SSE2-NEXT: jne LBB24_39 4579; SSE2-NEXT: LBB24_40: ## %else56 4580; SSE2-NEXT: testl $1048576, %eax ## imm = 0x100000 4581; SSE2-NEXT: jne LBB24_41 4582; SSE2-NEXT: LBB24_42: ## %else59 4583; SSE2-NEXT: testl $2097152, %eax ## imm = 0x200000 4584; SSE2-NEXT: jne LBB24_43 4585; SSE2-NEXT: LBB24_44: ## %else62 4586; SSE2-NEXT: testl $4194304, %eax ## imm = 0x400000 4587; SSE2-NEXT: jne LBB24_45 4588; SSE2-NEXT: LBB24_46: ## %else65 4589; SSE2-NEXT: testl $8388608, %eax ## imm = 
0x800000 4590; SSE2-NEXT: jne LBB24_47 4591; SSE2-NEXT: LBB24_48: ## %else68 4592; SSE2-NEXT: testl $16777216, %eax ## imm = 0x1000000 4593; SSE2-NEXT: jne LBB24_49 4594; SSE2-NEXT: LBB24_50: ## %else71 4595; SSE2-NEXT: testl $33554432, %eax ## imm = 0x2000000 4596; SSE2-NEXT: jne LBB24_51 4597; SSE2-NEXT: LBB24_52: ## %else74 4598; SSE2-NEXT: testl $67108864, %eax ## imm = 0x4000000 4599; SSE2-NEXT: jne LBB24_53 4600; SSE2-NEXT: LBB24_54: ## %else77 4601; SSE2-NEXT: testl $134217728, %eax ## imm = 0x8000000 4602; SSE2-NEXT: jne LBB24_55 4603; SSE2-NEXT: LBB24_56: ## %else80 4604; SSE2-NEXT: testl $268435456, %eax ## imm = 0x10000000 4605; SSE2-NEXT: jne LBB24_57 4606; SSE2-NEXT: LBB24_58: ## %else83 4607; SSE2-NEXT: testl $536870912, %eax ## imm = 0x20000000 4608; SSE2-NEXT: jne LBB24_59 4609; SSE2-NEXT: LBB24_60: ## %else86 4610; SSE2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 4611; SSE2-NEXT: jne LBB24_61 4612; SSE2-NEXT: LBB24_62: ## %else89 4613; SSE2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 4614; SSE2-NEXT: je LBB24_64 4615; SSE2-NEXT: LBB24_63: ## %cond.load91 4616; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 4617; SSE2-NEXT: movzbl 31(%rdi), %eax 4618; SSE2-NEXT: movd %eax, %xmm0 4619; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] 4620; SSE2-NEXT: por %xmm0, %xmm3 4621; SSE2-NEXT: LBB24_64: ## %else92 4622; SSE2-NEXT: movdqa %xmm2, %xmm0 4623; SSE2-NEXT: movdqa %xmm3, %xmm1 4624; SSE2-NEXT: retq 4625; SSE2-NEXT: LBB24_1: ## %cond.load 4626; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 4627; SSE2-NEXT: pand %xmm0, %xmm2 4628; SSE2-NEXT: movzbl (%rdi), %ecx 4629; SSE2-NEXT: movd %ecx, %xmm1 4630; SSE2-NEXT: pandn %xmm1, %xmm0 4631; SSE2-NEXT: por %xmm0, %xmm2 4632; SSE2-NEXT: testb $2, %al 4633; SSE2-NEXT: je LBB24_4 4634; SSE2-NEXT: LBB24_3: ## %cond.load1 4635; SSE2-NEXT: movdqa {{.*#+}} xmm0 = 
[255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 4636; SSE2-NEXT: pand %xmm0, %xmm2 4637; SSE2-NEXT: movzbl 1(%rdi), %ecx 4638; SSE2-NEXT: movd %ecx, %xmm1 4639; SSE2-NEXT: psllw $8, %xmm1 4640; SSE2-NEXT: pandn %xmm1, %xmm0 4641; SSE2-NEXT: por %xmm0, %xmm2 4642; SSE2-NEXT: testb $4, %al 4643; SSE2-NEXT: je LBB24_6 4644; SSE2-NEXT: LBB24_5: ## %cond.load4 4645; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] 4646; SSE2-NEXT: pand %xmm0, %xmm2 4647; SSE2-NEXT: movzbl 2(%rdi), %ecx 4648; SSE2-NEXT: movd %ecx, %xmm1 4649; SSE2-NEXT: pslld $16, %xmm1 4650; SSE2-NEXT: pandn %xmm1, %xmm0 4651; SSE2-NEXT: por %xmm0, %xmm2 4652; SSE2-NEXT: testb $8, %al 4653; SSE2-NEXT: je LBB24_8 4654; SSE2-NEXT: LBB24_7: ## %cond.load7 4655; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] 4656; SSE2-NEXT: pand %xmm0, %xmm2 4657; SSE2-NEXT: movzbl 3(%rdi), %ecx 4658; SSE2-NEXT: movd %ecx, %xmm1 4659; SSE2-NEXT: pslld $24, %xmm1 4660; SSE2-NEXT: pandn %xmm1, %xmm0 4661; SSE2-NEXT: por %xmm0, %xmm2 4662; SSE2-NEXT: testb $16, %al 4663; SSE2-NEXT: je LBB24_10 4664; SSE2-NEXT: LBB24_9: ## %cond.load10 4665; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] 4666; SSE2-NEXT: pand %xmm0, %xmm2 4667; SSE2-NEXT: movzbl 4(%rdi), %ecx 4668; SSE2-NEXT: movd %ecx, %xmm1 4669; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 4670; SSE2-NEXT: pandn %xmm1, %xmm0 4671; SSE2-NEXT: por %xmm0, %xmm2 4672; SSE2-NEXT: testb $32, %al 4673; SSE2-NEXT: je LBB24_12 4674; SSE2-NEXT: LBB24_11: ## %cond.load13 4675; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] 4676; SSE2-NEXT: pand %xmm0, %xmm2 4677; SSE2-NEXT: movzbl 5(%rdi), %ecx 4678; SSE2-NEXT: movd %ecx, %xmm1 4679; SSE2-NEXT: psllq $40, %xmm1 4680; SSE2-NEXT: pandn %xmm1, %xmm0 4681; SSE2-NEXT: por %xmm0, %xmm2 4682; SSE2-NEXT: testb $64, %al 4683; 
SSE2-NEXT: je LBB24_14 4684; SSE2-NEXT: LBB24_13: ## %cond.load16 4685; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] 4686; SSE2-NEXT: pand %xmm0, %xmm2 4687; SSE2-NEXT: movzbl 6(%rdi), %ecx 4688; SSE2-NEXT: movd %ecx, %xmm1 4689; SSE2-NEXT: psllq $48, %xmm1 4690; SSE2-NEXT: pandn %xmm1, %xmm0 4691; SSE2-NEXT: por %xmm0, %xmm2 4692; SSE2-NEXT: testb %al, %al 4693; SSE2-NEXT: jns LBB24_16 4694; SSE2-NEXT: LBB24_15: ## %cond.load19 4695; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] 4696; SSE2-NEXT: pand %xmm0, %xmm2 4697; SSE2-NEXT: movzbl 7(%rdi), %ecx 4698; SSE2-NEXT: movd %ecx, %xmm1 4699; SSE2-NEXT: psllq $56, %xmm1 4700; SSE2-NEXT: pandn %xmm1, %xmm0 4701; SSE2-NEXT: por %xmm0, %xmm2 4702; SSE2-NEXT: testl $256, %eax ## imm = 0x100 4703; SSE2-NEXT: je LBB24_18 4704; SSE2-NEXT: LBB24_17: ## %cond.load22 4705; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 4706; SSE2-NEXT: pand %xmm0, %xmm2 4707; SSE2-NEXT: movzbl 8(%rdi), %ecx 4708; SSE2-NEXT: movd %ecx, %xmm1 4709; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 4710; SSE2-NEXT: pandn %xmm1, %xmm0 4711; SSE2-NEXT: por %xmm0, %xmm2 4712; SSE2-NEXT: testl $512, %eax ## imm = 0x200 4713; SSE2-NEXT: je LBB24_20 4714; SSE2-NEXT: LBB24_19: ## %cond.load25 4715; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] 4716; SSE2-NEXT: pand %xmm0, %xmm2 4717; SSE2-NEXT: movzbl 9(%rdi), %ecx 4718; SSE2-NEXT: movd %ecx, %xmm1 4719; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6] 4720; SSE2-NEXT: pandn %xmm1, %xmm0 4721; SSE2-NEXT: por %xmm0, %xmm2 4722; SSE2-NEXT: testl $1024, %eax ## imm = 0x400 4723; SSE2-NEXT: je LBB24_22 4724; SSE2-NEXT: LBB24_21: ## %cond.load28 4725; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] 4726; 
SSE2-NEXT: pand %xmm0, %xmm2 4727; SSE2-NEXT: movzbl 10(%rdi), %ecx 4728; SSE2-NEXT: movd %ecx, %xmm1 4729; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] 4730; SSE2-NEXT: pandn %xmm1, %xmm0 4731; SSE2-NEXT: por %xmm0, %xmm2 4732; SSE2-NEXT: testl $2048, %eax ## imm = 0x800 4733; SSE2-NEXT: je LBB24_24 4734; SSE2-NEXT: LBB24_23: ## %cond.load31 4735; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] 4736; SSE2-NEXT: pand %xmm0, %xmm2 4737; SSE2-NEXT: movzbl 11(%rdi), %ecx 4738; SSE2-NEXT: movd %ecx, %xmm1 4739; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] 4740; SSE2-NEXT: pandn %xmm1, %xmm0 4741; SSE2-NEXT: por %xmm0, %xmm2 4742; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 4743; SSE2-NEXT: je LBB24_26 4744; SSE2-NEXT: LBB24_25: ## %cond.load34 4745; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] 4746; SSE2-NEXT: pand %xmm0, %xmm2 4747; SSE2-NEXT: movzbl 12(%rdi), %ecx 4748; SSE2-NEXT: movd %ecx, %xmm1 4749; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 4750; SSE2-NEXT: pandn %xmm1, %xmm0 4751; SSE2-NEXT: por %xmm0, %xmm2 4752; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000 4753; SSE2-NEXT: je LBB24_28 4754; SSE2-NEXT: LBB24_27: ## %cond.load37 4755; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] 4756; SSE2-NEXT: pand %xmm0, %xmm2 4757; SSE2-NEXT: movzbl 13(%rdi), %ecx 4758; SSE2-NEXT: movd %ecx, %xmm1 4759; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2] 4760; SSE2-NEXT: pandn %xmm1, %xmm0 4761; SSE2-NEXT: por %xmm0, %xmm2 4762; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000 4763; SSE2-NEXT: je LBB24_30 4764; SSE2-NEXT: LBB24_29: ## %cond.load40 4765; SSE2-NEXT: movdqa {{.*#+}} xmm0 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] 4766; SSE2-NEXT: pand %xmm0, %xmm2 4767; SSE2-NEXT: movzbl 14(%rdi), %ecx 4768; SSE2-NEXT: movd %ecx, %xmm1 4769; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] 4770; SSE2-NEXT: pandn %xmm1, %xmm0 4771; SSE2-NEXT: por %xmm0, %xmm2 4772; SSE2-NEXT: testw %ax, %ax 4773; SSE2-NEXT: jns LBB24_32 4774; SSE2-NEXT: LBB24_31: ## %cond.load43 4775; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 4776; SSE2-NEXT: movzbl 15(%rdi), %ecx 4777; SSE2-NEXT: movd %ecx, %xmm0 4778; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] 4779; SSE2-NEXT: por %xmm0, %xmm2 4780; SSE2-NEXT: testl $65536, %eax ## imm = 0x10000 4781; SSE2-NEXT: je LBB24_34 4782; SSE2-NEXT: LBB24_33: ## %cond.load46 4783; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 4784; SSE2-NEXT: pand %xmm0, %xmm3 4785; SSE2-NEXT: movzbl 16(%rdi), %ecx 4786; SSE2-NEXT: movd %ecx, %xmm1 4787; SSE2-NEXT: pandn %xmm1, %xmm0 4788; SSE2-NEXT: por %xmm0, %xmm3 4789; SSE2-NEXT: testl $131072, %eax ## imm = 0x20000 4790; SSE2-NEXT: je LBB24_36 4791; SSE2-NEXT: LBB24_35: ## %cond.load49 4792; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 4793; SSE2-NEXT: pand %xmm0, %xmm3 4794; SSE2-NEXT: movzbl 17(%rdi), %ecx 4795; SSE2-NEXT: movd %ecx, %xmm1 4796; SSE2-NEXT: psllw $8, %xmm1 4797; SSE2-NEXT: pandn %xmm1, %xmm0 4798; SSE2-NEXT: por %xmm0, %xmm3 4799; SSE2-NEXT: testl $262144, %eax ## imm = 0x40000 4800; SSE2-NEXT: je LBB24_38 4801; SSE2-NEXT: LBB24_37: ## %cond.load52 4802; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] 4803; SSE2-NEXT: pand %xmm0, %xmm3 4804; SSE2-NEXT: movzbl 18(%rdi), %ecx 4805; SSE2-NEXT: movd %ecx, %xmm1 4806; SSE2-NEXT: pslld $16, %xmm1 4807; SSE2-NEXT: pandn 
%xmm1, %xmm0 4808; SSE2-NEXT: por %xmm0, %xmm3 4809; SSE2-NEXT: testl $524288, %eax ## imm = 0x80000 4810; SSE2-NEXT: je LBB24_40 4811; SSE2-NEXT: LBB24_39: ## %cond.load55 4812; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] 4813; SSE2-NEXT: pand %xmm0, %xmm3 4814; SSE2-NEXT: movzbl 19(%rdi), %ecx 4815; SSE2-NEXT: movd %ecx, %xmm1 4816; SSE2-NEXT: pslld $24, %xmm1 4817; SSE2-NEXT: pandn %xmm1, %xmm0 4818; SSE2-NEXT: por %xmm0, %xmm3 4819; SSE2-NEXT: testl $1048576, %eax ## imm = 0x100000 4820; SSE2-NEXT: je LBB24_42 4821; SSE2-NEXT: LBB24_41: ## %cond.load58 4822; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] 4823; SSE2-NEXT: pand %xmm0, %xmm3 4824; SSE2-NEXT: movzbl 20(%rdi), %ecx 4825; SSE2-NEXT: movd %ecx, %xmm1 4826; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 4827; SSE2-NEXT: pandn %xmm1, %xmm0 4828; SSE2-NEXT: por %xmm0, %xmm3 4829; SSE2-NEXT: testl $2097152, %eax ## imm = 0x200000 4830; SSE2-NEXT: je LBB24_44 4831; SSE2-NEXT: LBB24_43: ## %cond.load61 4832; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] 4833; SSE2-NEXT: pand %xmm0, %xmm3 4834; SSE2-NEXT: movzbl 21(%rdi), %ecx 4835; SSE2-NEXT: movd %ecx, %xmm1 4836; SSE2-NEXT: psllq $40, %xmm1 4837; SSE2-NEXT: pandn %xmm1, %xmm0 4838; SSE2-NEXT: por %xmm0, %xmm3 4839; SSE2-NEXT: testl $4194304, %eax ## imm = 0x400000 4840; SSE2-NEXT: je LBB24_46 4841; SSE2-NEXT: LBB24_45: ## %cond.load64 4842; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] 4843; SSE2-NEXT: pand %xmm0, %xmm3 4844; SSE2-NEXT: movzbl 22(%rdi), %ecx 4845; SSE2-NEXT: movd %ecx, %xmm1 4846; SSE2-NEXT: psllq $48, %xmm1 4847; SSE2-NEXT: pandn %xmm1, %xmm0 4848; SSE2-NEXT: por %xmm0, %xmm3 4849; SSE2-NEXT: testl $8388608, %eax ## imm = 0x800000 4850; SSE2-NEXT: je LBB24_48 4851; SSE2-NEXT: LBB24_47: ## %cond.load67 4852; SSE2-NEXT: movdqa 
{{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] 4853; SSE2-NEXT: pand %xmm0, %xmm3 4854; SSE2-NEXT: movzbl 23(%rdi), %ecx 4855; SSE2-NEXT: movd %ecx, %xmm1 4856; SSE2-NEXT: psllq $56, %xmm1 4857; SSE2-NEXT: pandn %xmm1, %xmm0 4858; SSE2-NEXT: por %xmm0, %xmm3 4859; SSE2-NEXT: testl $16777216, %eax ## imm = 0x1000000 4860; SSE2-NEXT: je LBB24_50 4861; SSE2-NEXT: LBB24_49: ## %cond.load70 4862; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 4863; SSE2-NEXT: pand %xmm0, %xmm3 4864; SSE2-NEXT: movzbl 24(%rdi), %ecx 4865; SSE2-NEXT: movd %ecx, %xmm1 4866; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 4867; SSE2-NEXT: pandn %xmm1, %xmm0 4868; SSE2-NEXT: por %xmm0, %xmm3 4869; SSE2-NEXT: testl $33554432, %eax ## imm = 0x2000000 4870; SSE2-NEXT: je LBB24_52 4871; SSE2-NEXT: LBB24_51: ## %cond.load73 4872; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] 4873; SSE2-NEXT: pand %xmm0, %xmm3 4874; SSE2-NEXT: movzbl 25(%rdi), %ecx 4875; SSE2-NEXT: movd %ecx, %xmm1 4876; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6] 4877; SSE2-NEXT: pandn %xmm1, %xmm0 4878; SSE2-NEXT: por %xmm0, %xmm3 4879; SSE2-NEXT: testl $67108864, %eax ## imm = 0x4000000 4880; SSE2-NEXT: je LBB24_54 4881; SSE2-NEXT: LBB24_53: ## %cond.load76 4882; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] 4883; SSE2-NEXT: pand %xmm0, %xmm3 4884; SSE2-NEXT: movzbl 26(%rdi), %ecx 4885; SSE2-NEXT: movd %ecx, %xmm1 4886; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] 4887; SSE2-NEXT: pandn %xmm1, %xmm0 4888; SSE2-NEXT: por %xmm0, %xmm3 4889; SSE2-NEXT: testl $134217728, %eax ## imm = 0x8000000 4890; SSE2-NEXT: je LBB24_56 4891; SSE2-NEXT: LBB24_55: ## %cond.load79 4892; SSE2-NEXT: movdqa {{.*#+}} xmm0 = 
[255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] 4893; SSE2-NEXT: pand %xmm0, %xmm3 4894; SSE2-NEXT: movzbl 27(%rdi), %ecx 4895; SSE2-NEXT: movd %ecx, %xmm1 4896; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] 4897; SSE2-NEXT: pandn %xmm1, %xmm0 4898; SSE2-NEXT: por %xmm0, %xmm3 4899; SSE2-NEXT: testl $268435456, %eax ## imm = 0x10000000 4900; SSE2-NEXT: je LBB24_58 4901; SSE2-NEXT: LBB24_57: ## %cond.load82 4902; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] 4903; SSE2-NEXT: pand %xmm0, %xmm3 4904; SSE2-NEXT: movzbl 28(%rdi), %ecx 4905; SSE2-NEXT: movd %ecx, %xmm1 4906; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 4907; SSE2-NEXT: pandn %xmm1, %xmm0 4908; SSE2-NEXT: por %xmm0, %xmm3 4909; SSE2-NEXT: testl $536870912, %eax ## imm = 0x20000000 4910; SSE2-NEXT: je LBB24_60 4911; SSE2-NEXT: LBB24_59: ## %cond.load85 4912; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] 4913; SSE2-NEXT: pand %xmm0, %xmm3 4914; SSE2-NEXT: movzbl 29(%rdi), %ecx 4915; SSE2-NEXT: movd %ecx, %xmm1 4916; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2] 4917; SSE2-NEXT: pandn %xmm1, %xmm0 4918; SSE2-NEXT: por %xmm0, %xmm3 4919; SSE2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 4920; SSE2-NEXT: je LBB24_62 4921; SSE2-NEXT: LBB24_61: ## %cond.load88 4922; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] 4923; SSE2-NEXT: pand %xmm0, %xmm3 4924; SSE2-NEXT: movzbl 30(%rdi), %ecx 4925; SSE2-NEXT: movd %ecx, %xmm1 4926; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] 4927; SSE2-NEXT: pandn %xmm1, %xmm0 4928; SSE2-NEXT: por %xmm0, %xmm3 4929; SSE2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 4930; SSE2-NEXT: jne LBB24_63 4931; SSE2-NEXT: jmp 
LBB24_64 4932; 4933; SSE42-LABEL: load_v32i8_v32i8: 4934; SSE42: ## %bb.0: 4935; SSE42-NEXT: pmovmskb %xmm0, %ecx 4936; SSE42-NEXT: pmovmskb %xmm1, %eax 4937; SSE42-NEXT: shll $16, %eax 4938; SSE42-NEXT: orl %ecx, %eax 4939; SSE42-NEXT: testb $1, %al 4940; SSE42-NEXT: jne LBB24_1 4941; SSE42-NEXT: ## %bb.2: ## %else 4942; SSE42-NEXT: testb $2, %al 4943; SSE42-NEXT: jne LBB24_3 4944; SSE42-NEXT: LBB24_4: ## %else2 4945; SSE42-NEXT: testb $4, %al 4946; SSE42-NEXT: jne LBB24_5 4947; SSE42-NEXT: LBB24_6: ## %else5 4948; SSE42-NEXT: testb $8, %al 4949; SSE42-NEXT: jne LBB24_7 4950; SSE42-NEXT: LBB24_8: ## %else8 4951; SSE42-NEXT: testb $16, %al 4952; SSE42-NEXT: jne LBB24_9 4953; SSE42-NEXT: LBB24_10: ## %else11 4954; SSE42-NEXT: testb $32, %al 4955; SSE42-NEXT: jne LBB24_11 4956; SSE42-NEXT: LBB24_12: ## %else14 4957; SSE42-NEXT: testb $64, %al 4958; SSE42-NEXT: jne LBB24_13 4959; SSE42-NEXT: LBB24_14: ## %else17 4960; SSE42-NEXT: testb %al, %al 4961; SSE42-NEXT: js LBB24_15 4962; SSE42-NEXT: LBB24_16: ## %else20 4963; SSE42-NEXT: testl $256, %eax ## imm = 0x100 4964; SSE42-NEXT: jne LBB24_17 4965; SSE42-NEXT: LBB24_18: ## %else23 4966; SSE42-NEXT: testl $512, %eax ## imm = 0x200 4967; SSE42-NEXT: jne LBB24_19 4968; SSE42-NEXT: LBB24_20: ## %else26 4969; SSE42-NEXT: testl $1024, %eax ## imm = 0x400 4970; SSE42-NEXT: jne LBB24_21 4971; SSE42-NEXT: LBB24_22: ## %else29 4972; SSE42-NEXT: testl $2048, %eax ## imm = 0x800 4973; SSE42-NEXT: jne LBB24_23 4974; SSE42-NEXT: LBB24_24: ## %else32 4975; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000 4976; SSE42-NEXT: jne LBB24_25 4977; SSE42-NEXT: LBB24_26: ## %else35 4978; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000 4979; SSE42-NEXT: jne LBB24_27 4980; SSE42-NEXT: LBB24_28: ## %else38 4981; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000 4982; SSE42-NEXT: jne LBB24_29 4983; SSE42-NEXT: LBB24_30: ## %else41 4984; SSE42-NEXT: testw %ax, %ax 4985; SSE42-NEXT: js LBB24_31 4986; SSE42-NEXT: LBB24_32: ## %else44 4987; SSE42-NEXT: testl 
$65536, %eax ## imm = 0x10000 4988; SSE42-NEXT: jne LBB24_33 4989; SSE42-NEXT: LBB24_34: ## %else47 4990; SSE42-NEXT: testl $131072, %eax ## imm = 0x20000 4991; SSE42-NEXT: jne LBB24_35 4992; SSE42-NEXT: LBB24_36: ## %else50 4993; SSE42-NEXT: testl $262144, %eax ## imm = 0x40000 4994; SSE42-NEXT: jne LBB24_37 4995; SSE42-NEXT: LBB24_38: ## %else53 4996; SSE42-NEXT: testl $524288, %eax ## imm = 0x80000 4997; SSE42-NEXT: jne LBB24_39 4998; SSE42-NEXT: LBB24_40: ## %else56 4999; SSE42-NEXT: testl $1048576, %eax ## imm = 0x100000 5000; SSE42-NEXT: jne LBB24_41 5001; SSE42-NEXT: LBB24_42: ## %else59 5002; SSE42-NEXT: testl $2097152, %eax ## imm = 0x200000 5003; SSE42-NEXT: jne LBB24_43 5004; SSE42-NEXT: LBB24_44: ## %else62 5005; SSE42-NEXT: testl $4194304, %eax ## imm = 0x400000 5006; SSE42-NEXT: jne LBB24_45 5007; SSE42-NEXT: LBB24_46: ## %else65 5008; SSE42-NEXT: testl $8388608, %eax ## imm = 0x800000 5009; SSE42-NEXT: jne LBB24_47 5010; SSE42-NEXT: LBB24_48: ## %else68 5011; SSE42-NEXT: testl $16777216, %eax ## imm = 0x1000000 5012; SSE42-NEXT: jne LBB24_49 5013; SSE42-NEXT: LBB24_50: ## %else71 5014; SSE42-NEXT: testl $33554432, %eax ## imm = 0x2000000 5015; SSE42-NEXT: jne LBB24_51 5016; SSE42-NEXT: LBB24_52: ## %else74 5017; SSE42-NEXT: testl $67108864, %eax ## imm = 0x4000000 5018; SSE42-NEXT: jne LBB24_53 5019; SSE42-NEXT: LBB24_54: ## %else77 5020; SSE42-NEXT: testl $134217728, %eax ## imm = 0x8000000 5021; SSE42-NEXT: jne LBB24_55 5022; SSE42-NEXT: LBB24_56: ## %else80 5023; SSE42-NEXT: testl $268435456, %eax ## imm = 0x10000000 5024; SSE42-NEXT: jne LBB24_57 5025; SSE42-NEXT: LBB24_58: ## %else83 5026; SSE42-NEXT: testl $536870912, %eax ## imm = 0x20000000 5027; SSE42-NEXT: jne LBB24_59 5028; SSE42-NEXT: LBB24_60: ## %else86 5029; SSE42-NEXT: testl $1073741824, %eax ## imm = 0x40000000 5030; SSE42-NEXT: jne LBB24_61 5031; SSE42-NEXT: LBB24_62: ## %else89 5032; SSE42-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 5033; SSE42-NEXT: je LBB24_64 5034; 
SSE42-NEXT: LBB24_63: ## %cond.load91 5035; SSE42-NEXT: pinsrb $15, 31(%rdi), %xmm3 5036; SSE42-NEXT: LBB24_64: ## %else92 5037; SSE42-NEXT: movdqa %xmm2, %xmm0 5038; SSE42-NEXT: movdqa %xmm3, %xmm1 5039; SSE42-NEXT: retq 5040; SSE42-NEXT: LBB24_1: ## %cond.load 5041; SSE42-NEXT: pinsrb $0, (%rdi), %xmm2 5042; SSE42-NEXT: testb $2, %al 5043; SSE42-NEXT: je LBB24_4 5044; SSE42-NEXT: LBB24_3: ## %cond.load1 5045; SSE42-NEXT: pinsrb $1, 1(%rdi), %xmm2 5046; SSE42-NEXT: testb $4, %al 5047; SSE42-NEXT: je LBB24_6 5048; SSE42-NEXT: LBB24_5: ## %cond.load4 5049; SSE42-NEXT: pinsrb $2, 2(%rdi), %xmm2 5050; SSE42-NEXT: testb $8, %al 5051; SSE42-NEXT: je LBB24_8 5052; SSE42-NEXT: LBB24_7: ## %cond.load7 5053; SSE42-NEXT: pinsrb $3, 3(%rdi), %xmm2 5054; SSE42-NEXT: testb $16, %al 5055; SSE42-NEXT: je LBB24_10 5056; SSE42-NEXT: LBB24_9: ## %cond.load10 5057; SSE42-NEXT: pinsrb $4, 4(%rdi), %xmm2 5058; SSE42-NEXT: testb $32, %al 5059; SSE42-NEXT: je LBB24_12 5060; SSE42-NEXT: LBB24_11: ## %cond.load13 5061; SSE42-NEXT: pinsrb $5, 5(%rdi), %xmm2 5062; SSE42-NEXT: testb $64, %al 5063; SSE42-NEXT: je LBB24_14 5064; SSE42-NEXT: LBB24_13: ## %cond.load16 5065; SSE42-NEXT: pinsrb $6, 6(%rdi), %xmm2 5066; SSE42-NEXT: testb %al, %al 5067; SSE42-NEXT: jns LBB24_16 5068; SSE42-NEXT: LBB24_15: ## %cond.load19 5069; SSE42-NEXT: pinsrb $7, 7(%rdi), %xmm2 5070; SSE42-NEXT: testl $256, %eax ## imm = 0x100 5071; SSE42-NEXT: je LBB24_18 5072; SSE42-NEXT: LBB24_17: ## %cond.load22 5073; SSE42-NEXT: pinsrb $8, 8(%rdi), %xmm2 5074; SSE42-NEXT: testl $512, %eax ## imm = 0x200 5075; SSE42-NEXT: je LBB24_20 5076; SSE42-NEXT: LBB24_19: ## %cond.load25 5077; SSE42-NEXT: pinsrb $9, 9(%rdi), %xmm2 5078; SSE42-NEXT: testl $1024, %eax ## imm = 0x400 5079; SSE42-NEXT: je LBB24_22 5080; SSE42-NEXT: LBB24_21: ## %cond.load28 5081; SSE42-NEXT: pinsrb $10, 10(%rdi), %xmm2 5082; SSE42-NEXT: testl $2048, %eax ## imm = 0x800 5083; SSE42-NEXT: je LBB24_24 5084; SSE42-NEXT: LBB24_23: ## %cond.load31 5085; 
SSE42-NEXT: pinsrb $11, 11(%rdi), %xmm2 5086; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000 5087; SSE42-NEXT: je LBB24_26 5088; SSE42-NEXT: LBB24_25: ## %cond.load34 5089; SSE42-NEXT: pinsrb $12, 12(%rdi), %xmm2 5090; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000 5091; SSE42-NEXT: je LBB24_28 5092; SSE42-NEXT: LBB24_27: ## %cond.load37 5093; SSE42-NEXT: pinsrb $13, 13(%rdi), %xmm2 5094; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000 5095; SSE42-NEXT: je LBB24_30 5096; SSE42-NEXT: LBB24_29: ## %cond.load40 5097; SSE42-NEXT: pinsrb $14, 14(%rdi), %xmm2 5098; SSE42-NEXT: testw %ax, %ax 5099; SSE42-NEXT: jns LBB24_32 5100; SSE42-NEXT: LBB24_31: ## %cond.load43 5101; SSE42-NEXT: pinsrb $15, 15(%rdi), %xmm2 5102; SSE42-NEXT: testl $65536, %eax ## imm = 0x10000 5103; SSE42-NEXT: je LBB24_34 5104; SSE42-NEXT: LBB24_33: ## %cond.load46 5105; SSE42-NEXT: pinsrb $0, 16(%rdi), %xmm3 5106; SSE42-NEXT: testl $131072, %eax ## imm = 0x20000 5107; SSE42-NEXT: je LBB24_36 5108; SSE42-NEXT: LBB24_35: ## %cond.load49 5109; SSE42-NEXT: pinsrb $1, 17(%rdi), %xmm3 5110; SSE42-NEXT: testl $262144, %eax ## imm = 0x40000 5111; SSE42-NEXT: je LBB24_38 5112; SSE42-NEXT: LBB24_37: ## %cond.load52 5113; SSE42-NEXT: pinsrb $2, 18(%rdi), %xmm3 5114; SSE42-NEXT: testl $524288, %eax ## imm = 0x80000 5115; SSE42-NEXT: je LBB24_40 5116; SSE42-NEXT: LBB24_39: ## %cond.load55 5117; SSE42-NEXT: pinsrb $3, 19(%rdi), %xmm3 5118; SSE42-NEXT: testl $1048576, %eax ## imm = 0x100000 5119; SSE42-NEXT: je LBB24_42 5120; SSE42-NEXT: LBB24_41: ## %cond.load58 5121; SSE42-NEXT: pinsrb $4, 20(%rdi), %xmm3 5122; SSE42-NEXT: testl $2097152, %eax ## imm = 0x200000 5123; SSE42-NEXT: je LBB24_44 5124; SSE42-NEXT: LBB24_43: ## %cond.load61 5125; SSE42-NEXT: pinsrb $5, 21(%rdi), %xmm3 5126; SSE42-NEXT: testl $4194304, %eax ## imm = 0x400000 5127; SSE42-NEXT: je LBB24_46 5128; SSE42-NEXT: LBB24_45: ## %cond.load64 5129; SSE42-NEXT: pinsrb $6, 22(%rdi), %xmm3 5130; SSE42-NEXT: testl $8388608, %eax ## imm = 0x800000 5131; 
SSE42-NEXT: je LBB24_48 5132; SSE42-NEXT: LBB24_47: ## %cond.load67 5133; SSE42-NEXT: pinsrb $7, 23(%rdi), %xmm3 5134; SSE42-NEXT: testl $16777216, %eax ## imm = 0x1000000 5135; SSE42-NEXT: je LBB24_50 5136; SSE42-NEXT: LBB24_49: ## %cond.load70 5137; SSE42-NEXT: pinsrb $8, 24(%rdi), %xmm3 5138; SSE42-NEXT: testl $33554432, %eax ## imm = 0x2000000 5139; SSE42-NEXT: je LBB24_52 5140; SSE42-NEXT: LBB24_51: ## %cond.load73 5141; SSE42-NEXT: pinsrb $9, 25(%rdi), %xmm3 5142; SSE42-NEXT: testl $67108864, %eax ## imm = 0x4000000 5143; SSE42-NEXT: je LBB24_54 5144; SSE42-NEXT: LBB24_53: ## %cond.load76 5145; SSE42-NEXT: pinsrb $10, 26(%rdi), %xmm3 5146; SSE42-NEXT: testl $134217728, %eax ## imm = 0x8000000 5147; SSE42-NEXT: je LBB24_56 5148; SSE42-NEXT: LBB24_55: ## %cond.load79 5149; SSE42-NEXT: pinsrb $11, 27(%rdi), %xmm3 5150; SSE42-NEXT: testl $268435456, %eax ## imm = 0x10000000 5151; SSE42-NEXT: je LBB24_58 5152; SSE42-NEXT: LBB24_57: ## %cond.load82 5153; SSE42-NEXT: pinsrb $12, 28(%rdi), %xmm3 5154; SSE42-NEXT: testl $536870912, %eax ## imm = 0x20000000 5155; SSE42-NEXT: je LBB24_60 5156; SSE42-NEXT: LBB24_59: ## %cond.load85 5157; SSE42-NEXT: pinsrb $13, 29(%rdi), %xmm3 5158; SSE42-NEXT: testl $1073741824, %eax ## imm = 0x40000000 5159; SSE42-NEXT: je LBB24_62 5160; SSE42-NEXT: LBB24_61: ## %cond.load88 5161; SSE42-NEXT: pinsrb $14, 30(%rdi), %xmm3 5162; SSE42-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 5163; SSE42-NEXT: jne LBB24_63 5164; SSE42-NEXT: jmp LBB24_64 5165; 5166; AVX1-LABEL: load_v32i8_v32i8: 5167; AVX1: ## %bb.0: 5168; AVX1-NEXT: vpmovmskb %xmm0, %ecx 5169; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 5170; AVX1-NEXT: vpmovmskb %xmm0, %eax 5171; AVX1-NEXT: shll $16, %eax 5172; AVX1-NEXT: orl %ecx, %eax 5173; AVX1-NEXT: testb $1, %al 5174; AVX1-NEXT: jne LBB24_1 5175; AVX1-NEXT: ## %bb.2: ## %else 5176; AVX1-NEXT: testb $2, %al 5177; AVX1-NEXT: jne LBB24_3 5178; AVX1-NEXT: LBB24_4: ## %else2 5179; AVX1-NEXT: testb $4, %al 5180; AVX1-NEXT: jne 
LBB24_5 5181; AVX1-NEXT: LBB24_6: ## %else5 5182; AVX1-NEXT: testb $8, %al 5183; AVX1-NEXT: jne LBB24_7 5184; AVX1-NEXT: LBB24_8: ## %else8 5185; AVX1-NEXT: testb $16, %al 5186; AVX1-NEXT: jne LBB24_9 5187; AVX1-NEXT: LBB24_10: ## %else11 5188; AVX1-NEXT: testb $32, %al 5189; AVX1-NEXT: jne LBB24_11 5190; AVX1-NEXT: LBB24_12: ## %else14 5191; AVX1-NEXT: testb $64, %al 5192; AVX1-NEXT: jne LBB24_13 5193; AVX1-NEXT: LBB24_14: ## %else17 5194; AVX1-NEXT: testb %al, %al 5195; AVX1-NEXT: js LBB24_15 5196; AVX1-NEXT: LBB24_16: ## %else20 5197; AVX1-NEXT: testl $256, %eax ## imm = 0x100 5198; AVX1-NEXT: jne LBB24_17 5199; AVX1-NEXT: LBB24_18: ## %else23 5200; AVX1-NEXT: testl $512, %eax ## imm = 0x200 5201; AVX1-NEXT: jne LBB24_19 5202; AVX1-NEXT: LBB24_20: ## %else26 5203; AVX1-NEXT: testl $1024, %eax ## imm = 0x400 5204; AVX1-NEXT: jne LBB24_21 5205; AVX1-NEXT: LBB24_22: ## %else29 5206; AVX1-NEXT: testl $2048, %eax ## imm = 0x800 5207; AVX1-NEXT: jne LBB24_23 5208; AVX1-NEXT: LBB24_24: ## %else32 5209; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 5210; AVX1-NEXT: jne LBB24_25 5211; AVX1-NEXT: LBB24_26: ## %else35 5212; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 5213; AVX1-NEXT: jne LBB24_27 5214; AVX1-NEXT: LBB24_28: ## %else38 5215; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 5216; AVX1-NEXT: jne LBB24_29 5217; AVX1-NEXT: LBB24_30: ## %else41 5218; AVX1-NEXT: testw %ax, %ax 5219; AVX1-NEXT: js LBB24_31 5220; AVX1-NEXT: LBB24_32: ## %else44 5221; AVX1-NEXT: testl $65536, %eax ## imm = 0x10000 5222; AVX1-NEXT: jne LBB24_33 5223; AVX1-NEXT: LBB24_34: ## %else47 5224; AVX1-NEXT: testl $131072, %eax ## imm = 0x20000 5225; AVX1-NEXT: jne LBB24_35 5226; AVX1-NEXT: LBB24_36: ## %else50 5227; AVX1-NEXT: testl $262144, %eax ## imm = 0x40000 5228; AVX1-NEXT: jne LBB24_37 5229; AVX1-NEXT: LBB24_38: ## %else53 5230; AVX1-NEXT: testl $524288, %eax ## imm = 0x80000 5231; AVX1-NEXT: jne LBB24_39 5232; AVX1-NEXT: LBB24_40: ## %else56 5233; AVX1-NEXT: testl $1048576, %eax ## imm = 
0x100000 5234; AVX1-NEXT: jne LBB24_41 5235; AVX1-NEXT: LBB24_42: ## %else59 5236; AVX1-NEXT: testl $2097152, %eax ## imm = 0x200000 5237; AVX1-NEXT: jne LBB24_43 5238; AVX1-NEXT: LBB24_44: ## %else62 5239; AVX1-NEXT: testl $4194304, %eax ## imm = 0x400000 5240; AVX1-NEXT: jne LBB24_45 5241; AVX1-NEXT: LBB24_46: ## %else65 5242; AVX1-NEXT: testl $8388608, %eax ## imm = 0x800000 5243; AVX1-NEXT: jne LBB24_47 5244; AVX1-NEXT: LBB24_48: ## %else68 5245; AVX1-NEXT: testl $16777216, %eax ## imm = 0x1000000 5246; AVX1-NEXT: jne LBB24_49 5247; AVX1-NEXT: LBB24_50: ## %else71 5248; AVX1-NEXT: testl $33554432, %eax ## imm = 0x2000000 5249; AVX1-NEXT: jne LBB24_51 5250; AVX1-NEXT: LBB24_52: ## %else74 5251; AVX1-NEXT: testl $67108864, %eax ## imm = 0x4000000 5252; AVX1-NEXT: jne LBB24_53 5253; AVX1-NEXT: LBB24_54: ## %else77 5254; AVX1-NEXT: testl $134217728, %eax ## imm = 0x8000000 5255; AVX1-NEXT: jne LBB24_55 5256; AVX1-NEXT: LBB24_56: ## %else80 5257; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000 5258; AVX1-NEXT: jne LBB24_57 5259; AVX1-NEXT: LBB24_58: ## %else83 5260; AVX1-NEXT: testl $536870912, %eax ## imm = 0x20000000 5261; AVX1-NEXT: jne LBB24_59 5262; AVX1-NEXT: LBB24_60: ## %else86 5263; AVX1-NEXT: testl $1073741824, %eax ## imm = 0x40000000 5264; AVX1-NEXT: jne LBB24_61 5265; AVX1-NEXT: LBB24_62: ## %else89 5266; AVX1-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 5267; AVX1-NEXT: jne LBB24_63 5268; AVX1-NEXT: LBB24_64: ## %else92 5269; AVX1-NEXT: vmovaps %ymm1, %ymm0 5270; AVX1-NEXT: retq 5271; AVX1-NEXT: LBB24_1: ## %cond.load 5272; AVX1-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0 5273; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5274; AVX1-NEXT: testb $2, %al 5275; AVX1-NEXT: je LBB24_4 5276; AVX1-NEXT: LBB24_3: ## %cond.load1 5277; AVX1-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0 5278; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5279; AVX1-NEXT: testb $4, %al 5280; AVX1-NEXT: je LBB24_6 5281; AVX1-NEXT: LBB24_5: ## 
%cond.load4 5282; AVX1-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0 5283; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5284; AVX1-NEXT: testb $8, %al 5285; AVX1-NEXT: je LBB24_8 5286; AVX1-NEXT: LBB24_7: ## %cond.load7 5287; AVX1-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0 5288; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5289; AVX1-NEXT: testb $16, %al 5290; AVX1-NEXT: je LBB24_10 5291; AVX1-NEXT: LBB24_9: ## %cond.load10 5292; AVX1-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0 5293; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5294; AVX1-NEXT: testb $32, %al 5295; AVX1-NEXT: je LBB24_12 5296; AVX1-NEXT: LBB24_11: ## %cond.load13 5297; AVX1-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0 5298; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5299; AVX1-NEXT: testb $64, %al 5300; AVX1-NEXT: je LBB24_14 5301; AVX1-NEXT: LBB24_13: ## %cond.load16 5302; AVX1-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0 5303; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5304; AVX1-NEXT: testb %al, %al 5305; AVX1-NEXT: jns LBB24_16 5306; AVX1-NEXT: LBB24_15: ## %cond.load19 5307; AVX1-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0 5308; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5309; AVX1-NEXT: testl $256, %eax ## imm = 0x100 5310; AVX1-NEXT: je LBB24_18 5311; AVX1-NEXT: LBB24_17: ## %cond.load22 5312; AVX1-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0 5313; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5314; AVX1-NEXT: testl $512, %eax ## imm = 0x200 5315; AVX1-NEXT: je LBB24_20 5316; AVX1-NEXT: LBB24_19: ## %cond.load25 5317; AVX1-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0 5318; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5319; AVX1-NEXT: testl $1024, %eax ## imm = 0x400 5320; AVX1-NEXT: je LBB24_22 5321; AVX1-NEXT: LBB24_21: ## %cond.load28 5322; AVX1-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0 5323; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5324; AVX1-NEXT: 
testl $2048, %eax ## imm = 0x800 5325; AVX1-NEXT: je LBB24_24 5326; AVX1-NEXT: LBB24_23: ## %cond.load31 5327; AVX1-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0 5328; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5329; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 5330; AVX1-NEXT: je LBB24_26 5331; AVX1-NEXT: LBB24_25: ## %cond.load34 5332; AVX1-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0 5333; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5334; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 5335; AVX1-NEXT: je LBB24_28 5336; AVX1-NEXT: LBB24_27: ## %cond.load37 5337; AVX1-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0 5338; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5339; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 5340; AVX1-NEXT: je LBB24_30 5341; AVX1-NEXT: LBB24_29: ## %cond.load40 5342; AVX1-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0 5343; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5344; AVX1-NEXT: testw %ax, %ax 5345; AVX1-NEXT: jns LBB24_32 5346; AVX1-NEXT: LBB24_31: ## %cond.load43 5347; AVX1-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0 5348; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5349; AVX1-NEXT: testl $65536, %eax ## imm = 0x10000 5350; AVX1-NEXT: je LBB24_34 5351; AVX1-NEXT: LBB24_33: ## %cond.load46 5352; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 5353; AVX1-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0 5354; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 5355; AVX1-NEXT: testl $131072, %eax ## imm = 0x20000 5356; AVX1-NEXT: je LBB24_36 5357; AVX1-NEXT: LBB24_35: ## %cond.load49 5358; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 5359; AVX1-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0 5360; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 5361; AVX1-NEXT: testl $262144, %eax ## imm = 0x40000 5362; AVX1-NEXT: je LBB24_38 5363; AVX1-NEXT: LBB24_37: ## %cond.load52 5364; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 5365; AVX1-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0 5366; AVX1-NEXT: 
vinsertf128 $1, %xmm0, %ymm1, %ymm1 5367; AVX1-NEXT: testl $524288, %eax ## imm = 0x80000 5368; AVX1-NEXT: je LBB24_40 5369; AVX1-NEXT: LBB24_39: ## %cond.load55 5370; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 5371; AVX1-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0 5372; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 5373; AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000 5374; AVX1-NEXT: je LBB24_42 5375; AVX1-NEXT: LBB24_41: ## %cond.load58 5376; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 5377; AVX1-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0 5378; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 5379; AVX1-NEXT: testl $2097152, %eax ## imm = 0x200000 5380; AVX1-NEXT: je LBB24_44 5381; AVX1-NEXT: LBB24_43: ## %cond.load61 5382; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 5383; AVX1-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0 5384; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 5385; AVX1-NEXT: testl $4194304, %eax ## imm = 0x400000 5386; AVX1-NEXT: je LBB24_46 5387; AVX1-NEXT: LBB24_45: ## %cond.load64 5388; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 5389; AVX1-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0 5390; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 5391; AVX1-NEXT: testl $8388608, %eax ## imm = 0x800000 5392; AVX1-NEXT: je LBB24_48 5393; AVX1-NEXT: LBB24_47: ## %cond.load67 5394; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 5395; AVX1-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0 5396; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 5397; AVX1-NEXT: testl $16777216, %eax ## imm = 0x1000000 5398; AVX1-NEXT: je LBB24_50 5399; AVX1-NEXT: LBB24_49: ## %cond.load70 5400; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 5401; AVX1-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0 5402; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 5403; AVX1-NEXT: testl $33554432, %eax ## imm = 0x2000000 5404; AVX1-NEXT: je LBB24_52 5405; AVX1-NEXT: LBB24_51: ## %cond.load73 5406; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 5407; AVX1-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0 5408; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 
5409; AVX1-NEXT: testl $67108864, %eax ## imm = 0x4000000 5410; AVX1-NEXT: je LBB24_54 5411; AVX1-NEXT: LBB24_53: ## %cond.load76 5412; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 5413; AVX1-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0 5414; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 5415; AVX1-NEXT: testl $134217728, %eax ## imm = 0x8000000 5416; AVX1-NEXT: je LBB24_56 5417; AVX1-NEXT: LBB24_55: ## %cond.load79 5418; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 5419; AVX1-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0 5420; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 5421; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000 5422; AVX1-NEXT: je LBB24_58 5423; AVX1-NEXT: LBB24_57: ## %cond.load82 5424; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 5425; AVX1-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0 5426; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 5427; AVX1-NEXT: testl $536870912, %eax ## imm = 0x20000000 5428; AVX1-NEXT: je LBB24_60 5429; AVX1-NEXT: LBB24_59: ## %cond.load85 5430; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 5431; AVX1-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0 5432; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 5433; AVX1-NEXT: testl $1073741824, %eax ## imm = 0x40000000 5434; AVX1-NEXT: je LBB24_62 5435; AVX1-NEXT: LBB24_61: ## %cond.load88 5436; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 5437; AVX1-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0 5438; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 5439; AVX1-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 5440; AVX1-NEXT: je LBB24_64 5441; AVX1-NEXT: LBB24_63: ## %cond.load91 5442; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 5443; AVX1-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0 5444; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 5445; AVX1-NEXT: vmovaps %ymm1, %ymm0 5446; AVX1-NEXT: retq 5447; 5448; AVX2-LABEL: load_v32i8_v32i8: 5449; AVX2: ## %bb.0: 5450; AVX2-NEXT: vpmovmskb %ymm0, %eax 5451; AVX2-NEXT: testb $1, %al 5452; AVX2-NEXT: jne LBB24_1 5453; AVX2-NEXT: ## %bb.2: ## %else 5454; AVX2-NEXT: testb $2, %al 
5455; AVX2-NEXT: jne LBB24_3 5456; AVX2-NEXT: LBB24_4: ## %else2 5457; AVX2-NEXT: testb $4, %al 5458; AVX2-NEXT: jne LBB24_5 5459; AVX2-NEXT: LBB24_6: ## %else5 5460; AVX2-NEXT: testb $8, %al 5461; AVX2-NEXT: jne LBB24_7 5462; AVX2-NEXT: LBB24_8: ## %else8 5463; AVX2-NEXT: testb $16, %al 5464; AVX2-NEXT: jne LBB24_9 5465; AVX2-NEXT: LBB24_10: ## %else11 5466; AVX2-NEXT: testb $32, %al 5467; AVX2-NEXT: jne LBB24_11 5468; AVX2-NEXT: LBB24_12: ## %else14 5469; AVX2-NEXT: testb $64, %al 5470; AVX2-NEXT: jne LBB24_13 5471; AVX2-NEXT: LBB24_14: ## %else17 5472; AVX2-NEXT: testb %al, %al 5473; AVX2-NEXT: js LBB24_15 5474; AVX2-NEXT: LBB24_16: ## %else20 5475; AVX2-NEXT: testl $256, %eax ## imm = 0x100 5476; AVX2-NEXT: jne LBB24_17 5477; AVX2-NEXT: LBB24_18: ## %else23 5478; AVX2-NEXT: testl $512, %eax ## imm = 0x200 5479; AVX2-NEXT: jne LBB24_19 5480; AVX2-NEXT: LBB24_20: ## %else26 5481; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 5482; AVX2-NEXT: jne LBB24_21 5483; AVX2-NEXT: LBB24_22: ## %else29 5484; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 5485; AVX2-NEXT: jne LBB24_23 5486; AVX2-NEXT: LBB24_24: ## %else32 5487; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 5488; AVX2-NEXT: jne LBB24_25 5489; AVX2-NEXT: LBB24_26: ## %else35 5490; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 5491; AVX2-NEXT: jne LBB24_27 5492; AVX2-NEXT: LBB24_28: ## %else38 5493; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 5494; AVX2-NEXT: jne LBB24_29 5495; AVX2-NEXT: LBB24_30: ## %else41 5496; AVX2-NEXT: testw %ax, %ax 5497; AVX2-NEXT: js LBB24_31 5498; AVX2-NEXT: LBB24_32: ## %else44 5499; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000 5500; AVX2-NEXT: jne LBB24_33 5501; AVX2-NEXT: LBB24_34: ## %else47 5502; AVX2-NEXT: testl $131072, %eax ## imm = 0x20000 5503; AVX2-NEXT: jne LBB24_35 5504; AVX2-NEXT: LBB24_36: ## %else50 5505; AVX2-NEXT: testl $262144, %eax ## imm = 0x40000 5506; AVX2-NEXT: jne LBB24_37 5507; AVX2-NEXT: LBB24_38: ## %else53 5508; AVX2-NEXT: testl $524288, %eax ## imm = 
0x80000 5509; AVX2-NEXT: jne LBB24_39 5510; AVX2-NEXT: LBB24_40: ## %else56 5511; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000 5512; AVX2-NEXT: jne LBB24_41 5513; AVX2-NEXT: LBB24_42: ## %else59 5514; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000 5515; AVX2-NEXT: jne LBB24_43 5516; AVX2-NEXT: LBB24_44: ## %else62 5517; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000 5518; AVX2-NEXT: jne LBB24_45 5519; AVX2-NEXT: LBB24_46: ## %else65 5520; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000 5521; AVX2-NEXT: jne LBB24_47 5522; AVX2-NEXT: LBB24_48: ## %else68 5523; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000 5524; AVX2-NEXT: jne LBB24_49 5525; AVX2-NEXT: LBB24_50: ## %else71 5526; AVX2-NEXT: testl $33554432, %eax ## imm = 0x2000000 5527; AVX2-NEXT: jne LBB24_51 5528; AVX2-NEXT: LBB24_52: ## %else74 5529; AVX2-NEXT: testl $67108864, %eax ## imm = 0x4000000 5530; AVX2-NEXT: jne LBB24_53 5531; AVX2-NEXT: LBB24_54: ## %else77 5532; AVX2-NEXT: testl $134217728, %eax ## imm = 0x8000000 5533; AVX2-NEXT: jne LBB24_55 5534; AVX2-NEXT: LBB24_56: ## %else80 5535; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000 5536; AVX2-NEXT: jne LBB24_57 5537; AVX2-NEXT: LBB24_58: ## %else83 5538; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000 5539; AVX2-NEXT: jne LBB24_59 5540; AVX2-NEXT: LBB24_60: ## %else86 5541; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 5542; AVX2-NEXT: jne LBB24_61 5543; AVX2-NEXT: LBB24_62: ## %else89 5544; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 5545; AVX2-NEXT: jne LBB24_63 5546; AVX2-NEXT: LBB24_64: ## %else92 5547; AVX2-NEXT: vmovdqa %ymm1, %ymm0 5548; AVX2-NEXT: retq 5549; AVX2-NEXT: LBB24_1: ## %cond.load 5550; AVX2-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0 5551; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5552; AVX2-NEXT: testb $2, %al 5553; AVX2-NEXT: je LBB24_4 5554; AVX2-NEXT: LBB24_3: ## %cond.load1 5555; AVX2-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0 5556; AVX2-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5557; AVX2-NEXT: testb $4, %al 5558; AVX2-NEXT: je LBB24_6 5559; AVX2-NEXT: LBB24_5: ## %cond.load4 5560; AVX2-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0 5561; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5562; AVX2-NEXT: testb $8, %al 5563; AVX2-NEXT: je LBB24_8 5564; AVX2-NEXT: LBB24_7: ## %cond.load7 5565; AVX2-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0 5566; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5567; AVX2-NEXT: testb $16, %al 5568; AVX2-NEXT: je LBB24_10 5569; AVX2-NEXT: LBB24_9: ## %cond.load10 5570; AVX2-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0 5571; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5572; AVX2-NEXT: testb $32, %al 5573; AVX2-NEXT: je LBB24_12 5574; AVX2-NEXT: LBB24_11: ## %cond.load13 5575; AVX2-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0 5576; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5577; AVX2-NEXT: testb $64, %al 5578; AVX2-NEXT: je LBB24_14 5579; AVX2-NEXT: LBB24_13: ## %cond.load16 5580; AVX2-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0 5581; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5582; AVX2-NEXT: testb %al, %al 5583; AVX2-NEXT: jns LBB24_16 5584; AVX2-NEXT: LBB24_15: ## %cond.load19 5585; AVX2-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0 5586; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5587; AVX2-NEXT: testl $256, %eax ## imm = 0x100 5588; AVX2-NEXT: je LBB24_18 5589; AVX2-NEXT: LBB24_17: ## %cond.load22 5590; AVX2-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0 5591; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5592; AVX2-NEXT: testl $512, %eax ## imm = 0x200 5593; AVX2-NEXT: je LBB24_20 5594; AVX2-NEXT: LBB24_19: ## %cond.load25 5595; AVX2-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0 5596; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5597; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 5598; AVX2-NEXT: je LBB24_22 5599; AVX2-NEXT: LBB24_21: ## %cond.load28 5600; AVX2-NEXT: 
vpinsrb $10, 10(%rdi), %xmm1, %xmm0 5601; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5602; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 5603; AVX2-NEXT: je LBB24_24 5604; AVX2-NEXT: LBB24_23: ## %cond.load31 5605; AVX2-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0 5606; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5607; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 5608; AVX2-NEXT: je LBB24_26 5609; AVX2-NEXT: LBB24_25: ## %cond.load34 5610; AVX2-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0 5611; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5612; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 5613; AVX2-NEXT: je LBB24_28 5614; AVX2-NEXT: LBB24_27: ## %cond.load37 5615; AVX2-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0 5616; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5617; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 5618; AVX2-NEXT: je LBB24_30 5619; AVX2-NEXT: LBB24_29: ## %cond.load40 5620; AVX2-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0 5621; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5622; AVX2-NEXT: testw %ax, %ax 5623; AVX2-NEXT: jns LBB24_32 5624; AVX2-NEXT: LBB24_31: ## %cond.load43 5625; AVX2-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0 5626; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5627; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000 5628; AVX2-NEXT: je LBB24_34 5629; AVX2-NEXT: LBB24_33: ## %cond.load46 5630; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 5631; AVX2-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0 5632; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5633; AVX2-NEXT: testl $131072, %eax ## imm = 0x20000 5634; AVX2-NEXT: je LBB24_36 5635; AVX2-NEXT: LBB24_35: ## %cond.load49 5636; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 5637; AVX2-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0 5638; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5639; AVX2-NEXT: testl $262144, %eax ## imm = 0x40000 5640; AVX2-NEXT: je LBB24_38 5641; AVX2-NEXT: LBB24_37: ## %cond.load52 
5642; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 5643; AVX2-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0 5644; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5645; AVX2-NEXT: testl $524288, %eax ## imm = 0x80000 5646; AVX2-NEXT: je LBB24_40 5647; AVX2-NEXT: LBB24_39: ## %cond.load55 5648; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 5649; AVX2-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0 5650; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5651; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000 5652; AVX2-NEXT: je LBB24_42 5653; AVX2-NEXT: LBB24_41: ## %cond.load58 5654; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 5655; AVX2-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0 5656; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5657; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000 5658; AVX2-NEXT: je LBB24_44 5659; AVX2-NEXT: LBB24_43: ## %cond.load61 5660; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 5661; AVX2-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0 5662; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5663; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000 5664; AVX2-NEXT: je LBB24_46 5665; AVX2-NEXT: LBB24_45: ## %cond.load64 5666; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 5667; AVX2-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0 5668; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5669; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000 5670; AVX2-NEXT: je LBB24_48 5671; AVX2-NEXT: LBB24_47: ## %cond.load67 5672; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 5673; AVX2-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0 5674; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5675; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000 5676; AVX2-NEXT: je LBB24_50 5677; AVX2-NEXT: LBB24_49: ## %cond.load70 5678; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 5679; AVX2-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0 5680; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5681; AVX2-NEXT: testl $33554432, %eax ## imm = 0x2000000 5682; AVX2-NEXT: je LBB24_52 5683; AVX2-NEXT: LBB24_51: ## %cond.load73 5684; AVX2-NEXT: vextracti128 $1, 
%ymm1, %xmm0 5685; AVX2-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0 5686; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5687; AVX2-NEXT: testl $67108864, %eax ## imm = 0x4000000 5688; AVX2-NEXT: je LBB24_54 5689; AVX2-NEXT: LBB24_53: ## %cond.load76 5690; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 5691; AVX2-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0 5692; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5693; AVX2-NEXT: testl $134217728, %eax ## imm = 0x8000000 5694; AVX2-NEXT: je LBB24_56 5695; AVX2-NEXT: LBB24_55: ## %cond.load79 5696; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 5697; AVX2-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0 5698; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5699; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000 5700; AVX2-NEXT: je LBB24_58 5701; AVX2-NEXT: LBB24_57: ## %cond.load82 5702; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 5703; AVX2-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0 5704; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5705; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000 5706; AVX2-NEXT: je LBB24_60 5707; AVX2-NEXT: LBB24_59: ## %cond.load85 5708; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 5709; AVX2-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0 5710; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5711; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 5712; AVX2-NEXT: je LBB24_62 5713; AVX2-NEXT: LBB24_61: ## %cond.load88 5714; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 5715; AVX2-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0 5716; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5717; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 5718; AVX2-NEXT: je LBB24_64 5719; AVX2-NEXT: LBB24_63: ## %cond.load91 5720; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 5721; AVX2-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0 5722; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5723; AVX2-NEXT: vmovdqa %ymm1, %ymm0 5724; AVX2-NEXT: retq 5725; 5726; AVX512F-LABEL: load_v32i8_v32i8: 5727; AVX512F: ## %bb.0: 5728; AVX512F-NEXT: vpmovmskb %ymm0, %eax 
5729; AVX512F-NEXT: testb $1, %al 5730; AVX512F-NEXT: jne LBB24_1 5731; AVX512F-NEXT: ## %bb.2: ## %else 5732; AVX512F-NEXT: testb $2, %al 5733; AVX512F-NEXT: jne LBB24_3 5734; AVX512F-NEXT: LBB24_4: ## %else2 5735; AVX512F-NEXT: testb $4, %al 5736; AVX512F-NEXT: jne LBB24_5 5737; AVX512F-NEXT: LBB24_6: ## %else5 5738; AVX512F-NEXT: testb $8, %al 5739; AVX512F-NEXT: jne LBB24_7 5740; AVX512F-NEXT: LBB24_8: ## %else8 5741; AVX512F-NEXT: testb $16, %al 5742; AVX512F-NEXT: jne LBB24_9 5743; AVX512F-NEXT: LBB24_10: ## %else11 5744; AVX512F-NEXT: testb $32, %al 5745; AVX512F-NEXT: jne LBB24_11 5746; AVX512F-NEXT: LBB24_12: ## %else14 5747; AVX512F-NEXT: testb $64, %al 5748; AVX512F-NEXT: jne LBB24_13 5749; AVX512F-NEXT: LBB24_14: ## %else17 5750; AVX512F-NEXT: testb %al, %al 5751; AVX512F-NEXT: js LBB24_15 5752; AVX512F-NEXT: LBB24_16: ## %else20 5753; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 5754; AVX512F-NEXT: jne LBB24_17 5755; AVX512F-NEXT: LBB24_18: ## %else23 5756; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 5757; AVX512F-NEXT: jne LBB24_19 5758; AVX512F-NEXT: LBB24_20: ## %else26 5759; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 5760; AVX512F-NEXT: jne LBB24_21 5761; AVX512F-NEXT: LBB24_22: ## %else29 5762; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800 5763; AVX512F-NEXT: jne LBB24_23 5764; AVX512F-NEXT: LBB24_24: ## %else32 5765; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 5766; AVX512F-NEXT: jne LBB24_25 5767; AVX512F-NEXT: LBB24_26: ## %else35 5768; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 5769; AVX512F-NEXT: jne LBB24_27 5770; AVX512F-NEXT: LBB24_28: ## %else38 5771; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 5772; AVX512F-NEXT: jne LBB24_29 5773; AVX512F-NEXT: LBB24_30: ## %else41 5774; AVX512F-NEXT: testw %ax, %ax 5775; AVX512F-NEXT: js LBB24_31 5776; AVX512F-NEXT: LBB24_32: ## %else44 5777; AVX512F-NEXT: testl $65536, %eax ## imm = 0x10000 5778; AVX512F-NEXT: jne LBB24_33 5779; AVX512F-NEXT: LBB24_34: ## %else47 5780; 
AVX512F-NEXT: testl $131072, %eax ## imm = 0x20000 5781; AVX512F-NEXT: jne LBB24_35 5782; AVX512F-NEXT: LBB24_36: ## %else50 5783; AVX512F-NEXT: testl $262144, %eax ## imm = 0x40000 5784; AVX512F-NEXT: jne LBB24_37 5785; AVX512F-NEXT: LBB24_38: ## %else53 5786; AVX512F-NEXT: testl $524288, %eax ## imm = 0x80000 5787; AVX512F-NEXT: jne LBB24_39 5788; AVX512F-NEXT: LBB24_40: ## %else56 5789; AVX512F-NEXT: testl $1048576, %eax ## imm = 0x100000 5790; AVX512F-NEXT: jne LBB24_41 5791; AVX512F-NEXT: LBB24_42: ## %else59 5792; AVX512F-NEXT: testl $2097152, %eax ## imm = 0x200000 5793; AVX512F-NEXT: jne LBB24_43 5794; AVX512F-NEXT: LBB24_44: ## %else62 5795; AVX512F-NEXT: testl $4194304, %eax ## imm = 0x400000 5796; AVX512F-NEXT: jne LBB24_45 5797; AVX512F-NEXT: LBB24_46: ## %else65 5798; AVX512F-NEXT: testl $8388608, %eax ## imm = 0x800000 5799; AVX512F-NEXT: jne LBB24_47 5800; AVX512F-NEXT: LBB24_48: ## %else68 5801; AVX512F-NEXT: testl $16777216, %eax ## imm = 0x1000000 5802; AVX512F-NEXT: jne LBB24_49 5803; AVX512F-NEXT: LBB24_50: ## %else71 5804; AVX512F-NEXT: testl $33554432, %eax ## imm = 0x2000000 5805; AVX512F-NEXT: jne LBB24_51 5806; AVX512F-NEXT: LBB24_52: ## %else74 5807; AVX512F-NEXT: testl $67108864, %eax ## imm = 0x4000000 5808; AVX512F-NEXT: jne LBB24_53 5809; AVX512F-NEXT: LBB24_54: ## %else77 5810; AVX512F-NEXT: testl $134217728, %eax ## imm = 0x8000000 5811; AVX512F-NEXT: jne LBB24_55 5812; AVX512F-NEXT: LBB24_56: ## %else80 5813; AVX512F-NEXT: testl $268435456, %eax ## imm = 0x10000000 5814; AVX512F-NEXT: jne LBB24_57 5815; AVX512F-NEXT: LBB24_58: ## %else83 5816; AVX512F-NEXT: testl $536870912, %eax ## imm = 0x20000000 5817; AVX512F-NEXT: jne LBB24_59 5818; AVX512F-NEXT: LBB24_60: ## %else86 5819; AVX512F-NEXT: testl $1073741824, %eax ## imm = 0x40000000 5820; AVX512F-NEXT: jne LBB24_61 5821; AVX512F-NEXT: LBB24_62: ## %else89 5822; AVX512F-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 5823; AVX512F-NEXT: jne LBB24_63 5824; AVX512F-NEXT: LBB24_64: 
## %else92 5825; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 5826; AVX512F-NEXT: retq 5827; AVX512F-NEXT: LBB24_1: ## %cond.load 5828; AVX512F-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0 5829; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5830; AVX512F-NEXT: testb $2, %al 5831; AVX512F-NEXT: je LBB24_4 5832; AVX512F-NEXT: LBB24_3: ## %cond.load1 5833; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0 5834; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5835; AVX512F-NEXT: testb $4, %al 5836; AVX512F-NEXT: je LBB24_6 5837; AVX512F-NEXT: LBB24_5: ## %cond.load4 5838; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0 5839; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5840; AVX512F-NEXT: testb $8, %al 5841; AVX512F-NEXT: je LBB24_8 5842; AVX512F-NEXT: LBB24_7: ## %cond.load7 5843; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0 5844; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5845; AVX512F-NEXT: testb $16, %al 5846; AVX512F-NEXT: je LBB24_10 5847; AVX512F-NEXT: LBB24_9: ## %cond.load10 5848; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0 5849; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5850; AVX512F-NEXT: testb $32, %al 5851; AVX512F-NEXT: je LBB24_12 5852; AVX512F-NEXT: LBB24_11: ## %cond.load13 5853; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0 5854; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5855; AVX512F-NEXT: testb $64, %al 5856; AVX512F-NEXT: je LBB24_14 5857; AVX512F-NEXT: LBB24_13: ## %cond.load16 5858; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0 5859; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5860; AVX512F-NEXT: testb %al, %al 5861; AVX512F-NEXT: jns LBB24_16 5862; AVX512F-NEXT: LBB24_15: ## %cond.load19 5863; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0 5864; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5865; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 5866; AVX512F-NEXT: je LBB24_18 
5867; AVX512F-NEXT: LBB24_17: ## %cond.load22 5868; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0 5869; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5870; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 5871; AVX512F-NEXT: je LBB24_20 5872; AVX512F-NEXT: LBB24_19: ## %cond.load25 5873; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0 5874; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5875; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 5876; AVX512F-NEXT: je LBB24_22 5877; AVX512F-NEXT: LBB24_21: ## %cond.load28 5878; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0 5879; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5880; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800 5881; AVX512F-NEXT: je LBB24_24 5882; AVX512F-NEXT: LBB24_23: ## %cond.load31 5883; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0 5884; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5885; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 5886; AVX512F-NEXT: je LBB24_26 5887; AVX512F-NEXT: LBB24_25: ## %cond.load34 5888; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0 5889; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5890; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 5891; AVX512F-NEXT: je LBB24_28 5892; AVX512F-NEXT: LBB24_27: ## %cond.load37 5893; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0 5894; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5895; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 5896; AVX512F-NEXT: je LBB24_30 5897; AVX512F-NEXT: LBB24_29: ## %cond.load40 5898; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0 5899; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5900; AVX512F-NEXT: testw %ax, %ax 5901; AVX512F-NEXT: jns LBB24_32 5902; AVX512F-NEXT: LBB24_31: ## %cond.load43 5903; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0 5904; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5905; AVX512F-NEXT: 
testl $65536, %eax ## imm = 0x10000 5906; AVX512F-NEXT: je LBB24_34 5907; AVX512F-NEXT: LBB24_33: ## %cond.load46 5908; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 5909; AVX512F-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0 5910; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5911; AVX512F-NEXT: testl $131072, %eax ## imm = 0x20000 5912; AVX512F-NEXT: je LBB24_36 5913; AVX512F-NEXT: LBB24_35: ## %cond.load49 5914; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 5915; AVX512F-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0 5916; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5917; AVX512F-NEXT: testl $262144, %eax ## imm = 0x40000 5918; AVX512F-NEXT: je LBB24_38 5919; AVX512F-NEXT: LBB24_37: ## %cond.load52 5920; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 5921; AVX512F-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0 5922; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5923; AVX512F-NEXT: testl $524288, %eax ## imm = 0x80000 5924; AVX512F-NEXT: je LBB24_40 5925; AVX512F-NEXT: LBB24_39: ## %cond.load55 5926; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 5927; AVX512F-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0 5928; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5929; AVX512F-NEXT: testl $1048576, %eax ## imm = 0x100000 5930; AVX512F-NEXT: je LBB24_42 5931; AVX512F-NEXT: LBB24_41: ## %cond.load58 5932; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 5933; AVX512F-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0 5934; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5935; AVX512F-NEXT: testl $2097152, %eax ## imm = 0x200000 5936; AVX512F-NEXT: je LBB24_44 5937; AVX512F-NEXT: LBB24_43: ## %cond.load61 5938; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 5939; AVX512F-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0 5940; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5941; AVX512F-NEXT: testl $4194304, %eax ## imm = 0x400000 5942; AVX512F-NEXT: je LBB24_46 5943; AVX512F-NEXT: LBB24_45: ## %cond.load64 5944; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 5945; AVX512F-NEXT: vpinsrb $6, 22(%rdi), %xmm0, 
%xmm0 5946; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5947; AVX512F-NEXT: testl $8388608, %eax ## imm = 0x800000 5948; AVX512F-NEXT: je LBB24_48 5949; AVX512F-NEXT: LBB24_47: ## %cond.load67 5950; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 5951; AVX512F-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0 5952; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5953; AVX512F-NEXT: testl $16777216, %eax ## imm = 0x1000000 5954; AVX512F-NEXT: je LBB24_50 5955; AVX512F-NEXT: LBB24_49: ## %cond.load70 5956; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 5957; AVX512F-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0 5958; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5959; AVX512F-NEXT: testl $33554432, %eax ## imm = 0x2000000 5960; AVX512F-NEXT: je LBB24_52 5961; AVX512F-NEXT: LBB24_51: ## %cond.load73 5962; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 5963; AVX512F-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0 5964; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5965; AVX512F-NEXT: testl $67108864, %eax ## imm = 0x4000000 5966; AVX512F-NEXT: je LBB24_54 5967; AVX512F-NEXT: LBB24_53: ## %cond.load76 5968; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 5969; AVX512F-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0 5970; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5971; AVX512F-NEXT: testl $134217728, %eax ## imm = 0x8000000 5972; AVX512F-NEXT: je LBB24_56 5973; AVX512F-NEXT: LBB24_55: ## %cond.load79 5974; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 5975; AVX512F-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0 5976; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5977; AVX512F-NEXT: testl $268435456, %eax ## imm = 0x10000000 5978; AVX512F-NEXT: je LBB24_58 5979; AVX512F-NEXT: LBB24_57: ## %cond.load82 5980; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 5981; AVX512F-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0 5982; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5983; AVX512F-NEXT: testl $536870912, %eax ## imm = 0x20000000 5984; AVX512F-NEXT: je LBB24_60 5985; AVX512F-NEXT: LBB24_59: ## 
%cond.load85 5986; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 5987; AVX512F-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0 5988; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5989; AVX512F-NEXT: testl $1073741824, %eax ## imm = 0x40000000 5990; AVX512F-NEXT: je LBB24_62 5991; AVX512F-NEXT: LBB24_61: ## %cond.load88 5992; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 5993; AVX512F-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0 5994; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 5995; AVX512F-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 5996; AVX512F-NEXT: je LBB24_64 5997; AVX512F-NEXT: LBB24_63: ## %cond.load91 5998; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 5999; AVX512F-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0 6000; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 6001; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 6002; AVX512F-NEXT: retq 6003; 6004; AVX512VLDQ-LABEL: load_v32i8_v32i8: 6005; AVX512VLDQ: ## %bb.0: 6006; AVX512VLDQ-NEXT: vpmovmskb %ymm0, %eax 6007; AVX512VLDQ-NEXT: testb $1, %al 6008; AVX512VLDQ-NEXT: jne LBB24_1 6009; AVX512VLDQ-NEXT: ## %bb.2: ## %else 6010; AVX512VLDQ-NEXT: testb $2, %al 6011; AVX512VLDQ-NEXT: jne LBB24_3 6012; AVX512VLDQ-NEXT: LBB24_4: ## %else2 6013; AVX512VLDQ-NEXT: testb $4, %al 6014; AVX512VLDQ-NEXT: jne LBB24_5 6015; AVX512VLDQ-NEXT: LBB24_6: ## %else5 6016; AVX512VLDQ-NEXT: testb $8, %al 6017; AVX512VLDQ-NEXT: jne LBB24_7 6018; AVX512VLDQ-NEXT: LBB24_8: ## %else8 6019; AVX512VLDQ-NEXT: testb $16, %al 6020; AVX512VLDQ-NEXT: jne LBB24_9 6021; AVX512VLDQ-NEXT: LBB24_10: ## %else11 6022; AVX512VLDQ-NEXT: testb $32, %al 6023; AVX512VLDQ-NEXT: jne LBB24_11 6024; AVX512VLDQ-NEXT: LBB24_12: ## %else14 6025; AVX512VLDQ-NEXT: testb $64, %al 6026; AVX512VLDQ-NEXT: jne LBB24_13 6027; AVX512VLDQ-NEXT: LBB24_14: ## %else17 6028; AVX512VLDQ-NEXT: testb %al, %al 6029; AVX512VLDQ-NEXT: js LBB24_15 6030; AVX512VLDQ-NEXT: LBB24_16: ## %else20 6031; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100 6032; AVX512VLDQ-NEXT: jne LBB24_17 6033; 
AVX512VLDQ-NEXT: LBB24_18: ## %else23 6034; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200 6035; AVX512VLDQ-NEXT: jne LBB24_19 6036; AVX512VLDQ-NEXT: LBB24_20: ## %else26 6037; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400 6038; AVX512VLDQ-NEXT: jne LBB24_21 6039; AVX512VLDQ-NEXT: LBB24_22: ## %else29 6040; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800 6041; AVX512VLDQ-NEXT: jne LBB24_23 6042; AVX512VLDQ-NEXT: LBB24_24: ## %else32 6043; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000 6044; AVX512VLDQ-NEXT: jne LBB24_25 6045; AVX512VLDQ-NEXT: LBB24_26: ## %else35 6046; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000 6047; AVX512VLDQ-NEXT: jne LBB24_27 6048; AVX512VLDQ-NEXT: LBB24_28: ## %else38 6049; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000 6050; AVX512VLDQ-NEXT: jne LBB24_29 6051; AVX512VLDQ-NEXT: LBB24_30: ## %else41 6052; AVX512VLDQ-NEXT: testw %ax, %ax 6053; AVX512VLDQ-NEXT: js LBB24_31 6054; AVX512VLDQ-NEXT: LBB24_32: ## %else44 6055; AVX512VLDQ-NEXT: testl $65536, %eax ## imm = 0x10000 6056; AVX512VLDQ-NEXT: jne LBB24_33 6057; AVX512VLDQ-NEXT: LBB24_34: ## %else47 6058; AVX512VLDQ-NEXT: testl $131072, %eax ## imm = 0x20000 6059; AVX512VLDQ-NEXT: jne LBB24_35 6060; AVX512VLDQ-NEXT: LBB24_36: ## %else50 6061; AVX512VLDQ-NEXT: testl $262144, %eax ## imm = 0x40000 6062; AVX512VLDQ-NEXT: jne LBB24_37 6063; AVX512VLDQ-NEXT: LBB24_38: ## %else53 6064; AVX512VLDQ-NEXT: testl $524288, %eax ## imm = 0x80000 6065; AVX512VLDQ-NEXT: jne LBB24_39 6066; AVX512VLDQ-NEXT: LBB24_40: ## %else56 6067; AVX512VLDQ-NEXT: testl $1048576, %eax ## imm = 0x100000 6068; AVX512VLDQ-NEXT: jne LBB24_41 6069; AVX512VLDQ-NEXT: LBB24_42: ## %else59 6070; AVX512VLDQ-NEXT: testl $2097152, %eax ## imm = 0x200000 6071; AVX512VLDQ-NEXT: jne LBB24_43 6072; AVX512VLDQ-NEXT: LBB24_44: ## %else62 6073; AVX512VLDQ-NEXT: testl $4194304, %eax ## imm = 0x400000 6074; AVX512VLDQ-NEXT: jne LBB24_45 6075; AVX512VLDQ-NEXT: LBB24_46: ## %else65 6076; AVX512VLDQ-NEXT: testl $8388608, 
%eax ## imm = 0x800000 6077; AVX512VLDQ-NEXT: jne LBB24_47 6078; AVX512VLDQ-NEXT: LBB24_48: ## %else68 6079; AVX512VLDQ-NEXT: testl $16777216, %eax ## imm = 0x1000000 6080; AVX512VLDQ-NEXT: jne LBB24_49 6081; AVX512VLDQ-NEXT: LBB24_50: ## %else71 6082; AVX512VLDQ-NEXT: testl $33554432, %eax ## imm = 0x2000000 6083; AVX512VLDQ-NEXT: jne LBB24_51 6084; AVX512VLDQ-NEXT: LBB24_52: ## %else74 6085; AVX512VLDQ-NEXT: testl $67108864, %eax ## imm = 0x4000000 6086; AVX512VLDQ-NEXT: jne LBB24_53 6087; AVX512VLDQ-NEXT: LBB24_54: ## %else77 6088; AVX512VLDQ-NEXT: testl $134217728, %eax ## imm = 0x8000000 6089; AVX512VLDQ-NEXT: jne LBB24_55 6090; AVX512VLDQ-NEXT: LBB24_56: ## %else80 6091; AVX512VLDQ-NEXT: testl $268435456, %eax ## imm = 0x10000000 6092; AVX512VLDQ-NEXT: jne LBB24_57 6093; AVX512VLDQ-NEXT: LBB24_58: ## %else83 6094; AVX512VLDQ-NEXT: testl $536870912, %eax ## imm = 0x20000000 6095; AVX512VLDQ-NEXT: jne LBB24_59 6096; AVX512VLDQ-NEXT: LBB24_60: ## %else86 6097; AVX512VLDQ-NEXT: testl $1073741824, %eax ## imm = 0x40000000 6098; AVX512VLDQ-NEXT: jne LBB24_61 6099; AVX512VLDQ-NEXT: LBB24_62: ## %else89 6100; AVX512VLDQ-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 6101; AVX512VLDQ-NEXT: jne LBB24_63 6102; AVX512VLDQ-NEXT: LBB24_64: ## %else92 6103; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0 6104; AVX512VLDQ-NEXT: retq 6105; AVX512VLDQ-NEXT: LBB24_1: ## %cond.load 6106; AVX512VLDQ-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0 6107; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6108; AVX512VLDQ-NEXT: testb $2, %al 6109; AVX512VLDQ-NEXT: je LBB24_4 6110; AVX512VLDQ-NEXT: LBB24_3: ## %cond.load1 6111; AVX512VLDQ-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0 6112; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6113; AVX512VLDQ-NEXT: testb $4, %al 6114; AVX512VLDQ-NEXT: je LBB24_6 6115; AVX512VLDQ-NEXT: LBB24_5: ## %cond.load4 6116; AVX512VLDQ-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0 6117; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] 6118; AVX512VLDQ-NEXT: testb $8, %al 6119; AVX512VLDQ-NEXT: je LBB24_8 6120; AVX512VLDQ-NEXT: LBB24_7: ## %cond.load7 6121; AVX512VLDQ-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0 6122; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6123; AVX512VLDQ-NEXT: testb $16, %al 6124; AVX512VLDQ-NEXT: je LBB24_10 6125; AVX512VLDQ-NEXT: LBB24_9: ## %cond.load10 6126; AVX512VLDQ-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0 6127; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6128; AVX512VLDQ-NEXT: testb $32, %al 6129; AVX512VLDQ-NEXT: je LBB24_12 6130; AVX512VLDQ-NEXT: LBB24_11: ## %cond.load13 6131; AVX512VLDQ-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0 6132; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6133; AVX512VLDQ-NEXT: testb $64, %al 6134; AVX512VLDQ-NEXT: je LBB24_14 6135; AVX512VLDQ-NEXT: LBB24_13: ## %cond.load16 6136; AVX512VLDQ-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0 6137; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6138; AVX512VLDQ-NEXT: testb %al, %al 6139; AVX512VLDQ-NEXT: jns LBB24_16 6140; AVX512VLDQ-NEXT: LBB24_15: ## %cond.load19 6141; AVX512VLDQ-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0 6142; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6143; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100 6144; AVX512VLDQ-NEXT: je LBB24_18 6145; AVX512VLDQ-NEXT: LBB24_17: ## %cond.load22 6146; AVX512VLDQ-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0 6147; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6148; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200 6149; AVX512VLDQ-NEXT: je LBB24_20 6150; AVX512VLDQ-NEXT: LBB24_19: ## %cond.load25 6151; AVX512VLDQ-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0 6152; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6153; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400 6154; AVX512VLDQ-NEXT: je LBB24_22 6155; AVX512VLDQ-NEXT: LBB24_21: ## %cond.load28 6156; 
AVX512VLDQ-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0 6157; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6158; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800 6159; AVX512VLDQ-NEXT: je LBB24_24 6160; AVX512VLDQ-NEXT: LBB24_23: ## %cond.load31 6161; AVX512VLDQ-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0 6162; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6163; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000 6164; AVX512VLDQ-NEXT: je LBB24_26 6165; AVX512VLDQ-NEXT: LBB24_25: ## %cond.load34 6166; AVX512VLDQ-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0 6167; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6168; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000 6169; AVX512VLDQ-NEXT: je LBB24_28 6170; AVX512VLDQ-NEXT: LBB24_27: ## %cond.load37 6171; AVX512VLDQ-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0 6172; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6173; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000 6174; AVX512VLDQ-NEXT: je LBB24_30 6175; AVX512VLDQ-NEXT: LBB24_29: ## %cond.load40 6176; AVX512VLDQ-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0 6177; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6178; AVX512VLDQ-NEXT: testw %ax, %ax 6179; AVX512VLDQ-NEXT: jns LBB24_32 6180; AVX512VLDQ-NEXT: LBB24_31: ## %cond.load43 6181; AVX512VLDQ-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0 6182; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6183; AVX512VLDQ-NEXT: testl $65536, %eax ## imm = 0x10000 6184; AVX512VLDQ-NEXT: je LBB24_34 6185; AVX512VLDQ-NEXT: LBB24_33: ## %cond.load46 6186; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 6187; AVX512VLDQ-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0 6188; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 6189; AVX512VLDQ-NEXT: testl $131072, %eax ## imm = 0x20000 6190; AVX512VLDQ-NEXT: je LBB24_36 6191; AVX512VLDQ-NEXT: LBB24_35: ## %cond.load49 6192; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, 
%xmm0 6193; AVX512VLDQ-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0 6194; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 6195; AVX512VLDQ-NEXT: testl $262144, %eax ## imm = 0x40000 6196; AVX512VLDQ-NEXT: je LBB24_38 6197; AVX512VLDQ-NEXT: LBB24_37: ## %cond.load52 6198; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 6199; AVX512VLDQ-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0 6200; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 6201; AVX512VLDQ-NEXT: testl $524288, %eax ## imm = 0x80000 6202; AVX512VLDQ-NEXT: je LBB24_40 6203; AVX512VLDQ-NEXT: LBB24_39: ## %cond.load55 6204; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 6205; AVX512VLDQ-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0 6206; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 6207; AVX512VLDQ-NEXT: testl $1048576, %eax ## imm = 0x100000 6208; AVX512VLDQ-NEXT: je LBB24_42 6209; AVX512VLDQ-NEXT: LBB24_41: ## %cond.load58 6210; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 6211; AVX512VLDQ-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0 6212; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 6213; AVX512VLDQ-NEXT: testl $2097152, %eax ## imm = 0x200000 6214; AVX512VLDQ-NEXT: je LBB24_44 6215; AVX512VLDQ-NEXT: LBB24_43: ## %cond.load61 6216; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 6217; AVX512VLDQ-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0 6218; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 6219; AVX512VLDQ-NEXT: testl $4194304, %eax ## imm = 0x400000 6220; AVX512VLDQ-NEXT: je LBB24_46 6221; AVX512VLDQ-NEXT: LBB24_45: ## %cond.load64 6222; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 6223; AVX512VLDQ-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0 6224; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 6225; AVX512VLDQ-NEXT: testl $8388608, %eax ## imm = 0x800000 6226; AVX512VLDQ-NEXT: je LBB24_48 6227; AVX512VLDQ-NEXT: LBB24_47: ## %cond.load67 6228; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 6229; AVX512VLDQ-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0 6230; AVX512VLDQ-NEXT: vinserti128 $1, 
%xmm0, %ymm1, %ymm1 6231; AVX512VLDQ-NEXT: testl $16777216, %eax ## imm = 0x1000000 6232; AVX512VLDQ-NEXT: je LBB24_50 6233; AVX512VLDQ-NEXT: LBB24_49: ## %cond.load70 6234; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 6235; AVX512VLDQ-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0 6236; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 6237; AVX512VLDQ-NEXT: testl $33554432, %eax ## imm = 0x2000000 6238; AVX512VLDQ-NEXT: je LBB24_52 6239; AVX512VLDQ-NEXT: LBB24_51: ## %cond.load73 6240; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 6241; AVX512VLDQ-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0 6242; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 6243; AVX512VLDQ-NEXT: testl $67108864, %eax ## imm = 0x4000000 6244; AVX512VLDQ-NEXT: je LBB24_54 6245; AVX512VLDQ-NEXT: LBB24_53: ## %cond.load76 6246; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 6247; AVX512VLDQ-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0 6248; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 6249; AVX512VLDQ-NEXT: testl $134217728, %eax ## imm = 0x8000000 6250; AVX512VLDQ-NEXT: je LBB24_56 6251; AVX512VLDQ-NEXT: LBB24_55: ## %cond.load79 6252; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 6253; AVX512VLDQ-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0 6254; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 6255; AVX512VLDQ-NEXT: testl $268435456, %eax ## imm = 0x10000000 6256; AVX512VLDQ-NEXT: je LBB24_58 6257; AVX512VLDQ-NEXT: LBB24_57: ## %cond.load82 6258; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 6259; AVX512VLDQ-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0 6260; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 6261; AVX512VLDQ-NEXT: testl $536870912, %eax ## imm = 0x20000000 6262; AVX512VLDQ-NEXT: je LBB24_60 6263; AVX512VLDQ-NEXT: LBB24_59: ## %cond.load85 6264; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 6265; AVX512VLDQ-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0 6266; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 6267; AVX512VLDQ-NEXT: testl $1073741824, %eax ## imm = 
0x40000000 6268; AVX512VLDQ-NEXT: je LBB24_62 6269; AVX512VLDQ-NEXT: LBB24_61: ## %cond.load88 6270; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 6271; AVX512VLDQ-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0 6272; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 6273; AVX512VLDQ-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 6274; AVX512VLDQ-NEXT: je LBB24_64 6275; AVX512VLDQ-NEXT: LBB24_63: ## %cond.load91 6276; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 6277; AVX512VLDQ-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0 6278; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 6279; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0 6280; AVX512VLDQ-NEXT: retq 6281; 6282; AVX512VLBW-LABEL: load_v32i8_v32i8: 6283; AVX512VLBW: ## %bb.0: 6284; AVX512VLBW-NEXT: vpmovb2m %ymm0, %k1 6285; AVX512VLBW-NEXT: vpblendmb (%rdi), %ymm1, %ymm0 {%k1} 6286; AVX512VLBW-NEXT: retq 6287; 6288; X86-AVX512-LABEL: load_v32i8_v32i8: 6289; X86-AVX512: ## %bb.0: 6290; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 6291; X86-AVX512-NEXT: vpmovb2m %ymm0, %k1 6292; X86-AVX512-NEXT: vpblendmb (%eax), %ymm1, %ymm0 {%k1} 6293; X86-AVX512-NEXT: retl 6294 %mask = icmp slt <32 x i8> %trigger, zeroinitializer 6295 %res = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x i8> %dst) 6296 ret <32 x i8> %res 6297} 6298 6299;;; Loads with Constant Masks - these should be optimized to use something other than a variable blend. 6300 6301; 128-bit FP vectors are supported with AVX. 
6302 6303define <4 x float> @mload_constmask_v4f32(ptr %addr, <4 x float> %dst) { 6304; SSE2-LABEL: mload_constmask_v4f32: 6305; SSE2: ## %bb.0: 6306; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 6307; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 6308; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero 6309; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] 6310; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 6311; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] 6312; SSE2-NEXT: retq 6313; 6314; SSE42-LABEL: mload_constmask_v4f32: 6315; SSE42: ## %bb.0: 6316; SSE42-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 6317; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 6318; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] 6319; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] 6320; SSE42-NEXT: retq 6321; 6322; AVX1OR2-LABEL: mload_constmask_v4f32: 6323; AVX1OR2: ## %bb.0: 6324; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3] 6325; AVX1OR2-NEXT: retq 6326; 6327; AVX512F-LABEL: mload_constmask_v4f32: 6328; AVX512F: ## %bb.0: 6329; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 6330; AVX512F-NEXT: movw $13, %ax 6331; AVX512F-NEXT: kmovw %eax, %k1 6332; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} 6333; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 6334; AVX512F-NEXT: vzeroupper 6335; AVX512F-NEXT: retq 6336; 6337; AVX512VLDQ-LABEL: mload_constmask_v4f32: 6338; AVX512VLDQ: ## %bb.0: 6339; AVX512VLDQ-NEXT: movb $13, %al 6340; AVX512VLDQ-NEXT: kmovw %eax, %k1 6341; AVX512VLDQ-NEXT: vmovups (%rdi), %xmm0 {%k1} 6342; AVX512VLDQ-NEXT: retq 6343; 6344; AVX512VLBW-LABEL: mload_constmask_v4f32: 6345; AVX512VLBW: ## %bb.0: 6346; AVX512VLBW-NEXT: movb $13, %al 6347; AVX512VLBW-NEXT: kmovd %eax, %k1 6348; AVX512VLBW-NEXT: vmovups (%rdi), %xmm0 {%k1} 6349; AVX512VLBW-NEXT: retq 6350; 6351; X86-AVX512-LABEL: mload_constmask_v4f32: 6352; X86-AVX512: ## %bb.0: 6353; 
X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 6354; X86-AVX512-NEXT: movb $13, %cl 6355; X86-AVX512-NEXT: kmovd %ecx, %k1 6356; X86-AVX512-NEXT: vmovups (%eax), %xmm0 {%k1} 6357; X86-AVX512-NEXT: retl 6358 %res = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %dst) 6359 ret <4 x float> %res 6360} 6361 6362define <4 x float> @mload_constmask_v4f32_all(ptr %addr) { 6363; SSE-LABEL: mload_constmask_v4f32_all: 6364; SSE: ## %bb.0: 6365; SSE-NEXT: movups (%rdi), %xmm0 6366; SSE-NEXT: retq 6367; 6368; AVX-LABEL: mload_constmask_v4f32_all: 6369; AVX: ## %bb.0: 6370; AVX-NEXT: vmovups (%rdi), %xmm0 6371; AVX-NEXT: retq 6372; 6373; X86-AVX512-LABEL: mload_constmask_v4f32_all: 6374; X86-AVX512: ## %bb.0: 6375; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 6376; X86-AVX512-NEXT: vmovups (%eax), %xmm0 6377; X86-AVX512-NEXT: retl 6378 %res = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float>undef) 6379 ret <4 x float> %res 6380} 6381 6382define <2 x double> @mload_constmask_v2f64(ptr %addr, <2 x double> %dst) { 6383; SSE-LABEL: mload_constmask_v2f64: 6384; SSE: ## %bb.0: 6385; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] 6386; SSE-NEXT: retq 6387; 6388; AVX-LABEL: mload_constmask_v2f64: 6389; AVX: ## %bb.0: 6390; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] 6391; AVX-NEXT: retq 6392; 6393; X86-AVX512-LABEL: mload_constmask_v2f64: 6394; X86-AVX512: ## %bb.0: 6395; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 6396; X86-AVX512-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] 6397; X86-AVX512-NEXT: retl 6398 %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x double> %dst) 6399 ret <2 x double> %res 6400} 6401 6402; 128-bit integer vectors are supported with AVX2. 
6403 6404define <4 x i32> @mload_constmask_v4i32(ptr %addr, <4 x i32> %dst) { 6405; SSE2-LABEL: mload_constmask_v4i32: 6406; SSE2: ## %bb.0: 6407; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 6408; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 6409; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 6410; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 6411; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 6412; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0] 6413; SSE2-NEXT: movaps %xmm1, %xmm0 6414; SSE2-NEXT: retq 6415; 6416; SSE42-LABEL: mload_constmask_v4i32: 6417; SSE42: ## %bb.0: 6418; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm0 6419; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm0 6420; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm0 6421; SSE42-NEXT: retq 6422; 6423; AVX1-LABEL: mload_constmask_v4i32: 6424; AVX1: ## %bb.0: 6425; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295] 6426; AVX1-NEXT: vmaskmovps (%rdi), %xmm1, %xmm1 6427; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 6428; AVX1-NEXT: retq 6429; 6430; AVX2-LABEL: mload_constmask_v4i32: 6431; AVX2: ## %bb.0: 6432; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295] 6433; AVX2-NEXT: vpmaskmovd (%rdi), %xmm1, %xmm1 6434; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 6435; AVX2-NEXT: retq 6436; 6437; AVX512F-LABEL: mload_constmask_v4i32: 6438; AVX512F: ## %bb.0: 6439; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 6440; AVX512F-NEXT: movw $14, %ax 6441; AVX512F-NEXT: kmovw %eax, %k1 6442; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} 6443; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 6444; AVX512F-NEXT: vzeroupper 6445; AVX512F-NEXT: retq 6446; 6447; AVX512VLDQ-LABEL: mload_constmask_v4i32: 6448; AVX512VLDQ: ## %bb.0: 6449; AVX512VLDQ-NEXT: movb $14, %al 6450; AVX512VLDQ-NEXT: kmovw %eax, %k1 6451; AVX512VLDQ-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} 6452; AVX512VLDQ-NEXT: retq 6453; 6454; AVX512VLBW-LABEL: 
mload_constmask_v4i32: 6455; AVX512VLBW: ## %bb.0: 6456; AVX512VLBW-NEXT: movb $14, %al 6457; AVX512VLBW-NEXT: kmovd %eax, %k1 6458; AVX512VLBW-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} 6459; AVX512VLBW-NEXT: retq 6460; 6461; X86-AVX512-LABEL: mload_constmask_v4i32: 6462; X86-AVX512: ## %bb.0: 6463; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 6464; X86-AVX512-NEXT: movb $14, %cl 6465; X86-AVX512-NEXT: kmovd %ecx, %k1 6466; X86-AVX512-NEXT: vmovdqu32 (%eax), %xmm0 {%k1} 6467; X86-AVX512-NEXT: retl 6468 %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %dst) 6469 ret <4 x i32> %res 6470} 6471 6472define <2 x i64> @mload_constmask_v2i64(ptr %addr, <2 x i64> %dst) { 6473; SSE2-LABEL: mload_constmask_v2i64: 6474; SSE2: ## %bb.0: 6475; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero 6476; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 6477; SSE2-NEXT: retq 6478; 6479; SSE42-LABEL: mload_constmask_v2i64: 6480; SSE42: ## %bb.0: 6481; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm0 6482; SSE42-NEXT: retq 6483; 6484; AVX-LABEL: mload_constmask_v2i64: 6485; AVX: ## %bb.0: 6486; AVX-NEXT: vpinsrq $1, 8(%rdi), %xmm0, %xmm0 6487; AVX-NEXT: retq 6488; 6489; X86-AVX512-LABEL: mload_constmask_v2i64: 6490; X86-AVX512: ## %bb.0: 6491; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 6492; X86-AVX512-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] 6493; X86-AVX512-NEXT: retl 6494 %res = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x i64> %dst) 6495 ret <2 x i64> %res 6496} 6497 6498; 256-bit FP vectors are supported with AVX. 

; Constant mask <1,1,1,0,0,0,0,0>: the three low lanes come from memory, the
; rest pass through from %dst. AVX512 encodes this as mask immediate 7 (0b111).
define <8 x float> @mload_constmask_v8f32(ptr %addr, <8 x float> %dst) {
; SSE2-LABEL: mload_constmask_v8f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: mload_constmask_v8f32:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE42-NEXT:    retq
;
; AVX1OR2-LABEL: mload_constmask_v8f32:
; AVX1OR2:       ## %bb.0:
; AVX1OR2-NEXT:    vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,0]
; AVX1OR2-NEXT:    vmaskmovps (%rdi), %ymm1, %ymm1
; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v8f32:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    movw $7, %ax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: mload_constmask_v8f32:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    movb $7, %al
; AVX512VLDQ-NEXT:    kmovw %eax, %k1
; AVX512VLDQ-NEXT:    vmovups (%rdi), %ymm0 {%k1}
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: mload_constmask_v8f32:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    movb $7, %al
; AVX512VLBW-NEXT:    kmovd %eax, %k1
; AVX512VLBW-NEXT:    vmovups (%rdi), %ymm0 {%k1}
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: mload_constmask_v8f32:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movb $7, %cl
; X86-AVX512-NEXT:    kmovd %ecx, %k1
; X86-AVX512-NEXT:    vmovups (%eax), %ymm0 {%k1}
; X86-AVX512-NEXT:    retl
  %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %dst)
  ret <8 x float> %res
}

; Same constant mask as above but with a zeroinitializer pass-through, so the
; AVX512 targets can use the zero-masking form {z} and no blend is needed.
define <8 x float> @mload_constmask_v8f32_zero(ptr %addr, <8 x float> %dst) {
; SSE2-LABEL: mload_constmask_v8f32_zero:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    retq
;
; SSE42-LABEL: mload_constmask_v8f32_zero:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],zero
; SSE42-NEXT:    xorps %xmm1, %xmm1
; SSE42-NEXT:    retq
;
; AVX1OR2-LABEL: mload_constmask_v8f32_zero:
; AVX1OR2:       ## %bb.0:
; AVX1OR2-NEXT:    vmovaps {{.*#+}} ymm0 = [4294967295,4294967295,4294967295,0,0,0,0,0]
; AVX1OR2-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v8f32_zero:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    movw $7, %ax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: mload_constmask_v8f32_zero:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    movb $7, %al
; AVX512VLDQ-NEXT:    kmovw %eax, %k1
; AVX512VLDQ-NEXT:    vmovups (%rdi), %ymm0 {%k1} {z}
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: mload_constmask_v8f32_zero:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    movb $7, %al
; AVX512VLBW-NEXT:    kmovd %eax, %k1
; AVX512VLBW-NEXT:    vmovups (%rdi), %ymm0 {%k1} {z}
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: mload_constmask_v8f32_zero:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movb $7, %cl
; X86-AVX512-NEXT:    kmovd %ecx, %k1
; X86-AVX512-NEXT:    vmovups (%eax), %ymm0 {%k1} {z}
; X86-AVX512-NEXT:    retl
  %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> zeroinitializer)
  ret <8 x float> %res
}

; Constant mask <1,1,1,0> on v4f64 (mask immediate 7).
define <4 x double> @mload_constmask_v4f64(ptr %addr, <4 x double> %dst) {
; SSE-LABEL: mload_constmask_v4f64:
; SSE:       ## %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE-NEXT:    retq
;
; AVX1OR2-LABEL: mload_constmask_v4f64:
; AVX1OR2:       ## %bb.0:
; AVX1OR2-NEXT:    vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
; AVX1OR2-NEXT:    vmaskmovpd (%rdi), %ymm1, %ymm1
; AVX1OR2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v4f64:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    movb $7, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: mload_constmask_v4f64:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    movb $7, %al
; AVX512VLDQ-NEXT:    kmovw %eax, %k1
; AVX512VLDQ-NEXT:    vmovupd (%rdi), %ymm0 {%k1}
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: mload_constmask_v4f64:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    movb $7, %al
; AVX512VLBW-NEXT:    kmovd %eax, %k1
; AVX512VLBW-NEXT:    vmovupd (%rdi), %ymm0 {%k1}
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: mload_constmask_v4f64:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movb $7, %cl
; X86-AVX512-NEXT:    kmovd %ecx, %k1
; X86-AVX512-NEXT:    vmovupd (%eax), %ymm0 {%k1}
; X86-AVX512-NEXT:    retl
  %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %dst)
  ret <4 x double> %res
}

; 256-bit integer vectors are supported with AVX2.

; Constant mask <1,1,1,0,0,0,0,1>: 135 == 0b10000111 (and -121 is the same
; bit pattern as a signed byte).
define <8 x i32> @mload_constmask_v8i32(ptr %addr, <8 x i32> %dst) {
; SSE2-LABEL: mload_constmask_v8i32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2]
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: mload_constmask_v8i32:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    pinsrd $0, (%rdi), %xmm0
; SSE42-NEXT:    pinsrd $1, 4(%rdi), %xmm0
; SSE42-NEXT:    pinsrd $2, 8(%rdi), %xmm0
; SSE42-NEXT:    pinsrd $3, 28(%rdi), %xmm1
; SSE42-NEXT:    retq
;
; AVX1OR2-LABEL: mload_constmask_v8i32:
; AVX1OR2:       ## %bb.0:
; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v8i32:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    movw $135, %ax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: mload_constmask_v8i32:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    movb $-121, %al
; AVX512VLDQ-NEXT:    kmovw %eax, %k1
; AVX512VLDQ-NEXT:    vmovdqu32 (%rdi), %ymm0 {%k1}
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: mload_constmask_v8i32:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    movb $-121, %al
; AVX512VLBW-NEXT:    kmovd %eax, %k1
; AVX512VLBW-NEXT:    vmovdqu32 (%rdi), %ymm0 {%k1}
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: mload_constmask_v8i32:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movb $-121, %cl
; X86-AVX512-NEXT:    kmovd %ecx, %k1
; X86-AVX512-NEXT:    vmovdqu32 (%eax), %ymm0 {%k1}
; X86-AVX512-NEXT:    retl
  %res = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %dst)
  ret <8 x i32> %res
}

; Constant mask <1,0,0,1>: 9 == 0b1001.
define <4 x i64> @mload_constmask_v4i64(ptr %addr, <4 x i64> %dst) {
; SSE2-LABEL: mload_constmask_v4i64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT:    retq
;
; SSE42-LABEL: mload_constmask_v4i64:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    pinsrq $0, (%rdi), %xmm0
; SSE42-NEXT:    pinsrq $1, 24(%rdi), %xmm1
; SSE42-NEXT:    retq
;
; AVX1OR2-LABEL: mload_constmask_v4i64:
; AVX1OR2:       ## %bb.0:
; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7]
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v4i64:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    movb $9, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: mload_constmask_v4i64:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    movb $9, %al
; AVX512VLDQ-NEXT:    kmovw %eax, %k1
; AVX512VLDQ-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1}
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: mload_constmask_v4i64:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    movb $9, %al
; AVX512VLBW-NEXT:    kmovd %eax, %k1
; AVX512VLBW-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1}
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: mload_constmask_v4i64:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movb $9, %cl
; X86-AVX512-NEXT:    kmovd %ecx, %k1
; X86-AVX512-NEXT:    vmovdqu64 (%eax), %ymm0 {%k1}
; X86-AVX512-NEXT:    retl
  %res = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %dst)
  ret <4 x i64> %res
}

; 512-bit FP vectors are supported with AVX512.

; Constant mask <1,1,1,0,0,0,0,1> on a full 512-bit v8f64: AVX512 does a
; single masked vmovupd with mask -121 (0b10000111 as a signed byte).
define <8 x double> @mload_constmask_v8f64(ptr %addr, <8 x double> %dst) {
; SSE-LABEL: mload_constmask_v8f64:
; SSE:       ## %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE-NEXT:    movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
; SSE-NEXT:    retq
;
; AVX1OR2-LABEL: mload_constmask_v8f64:
; AVX1OR2:       ## %bb.0:
; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v8f64:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    movb $-121, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: mload_constmask_v8f64:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    movb $-121, %al
; AVX512VLDQ-NEXT:    kmovw %eax, %k1
; AVX512VLDQ-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: mload_constmask_v8f64:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    movb $-121, %al
; AVX512VLBW-NEXT:    kmovd %eax, %k1
; AVX512VLBW-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: mload_constmask_v8f64:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movb $-121, %cl
; X86-AVX512-NEXT:    kmovd %ecx, %k1
; X86-AVX512-NEXT:    vmovupd (%eax), %zmm0 {%k1}
; X86-AVX512-NEXT:    retl
  %res = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x double> %dst)
  ret <8 x double> %res
}

; Make sure we detect the mask is all ones after type
; legalization to use an unmasked load for some of the avx512 instructions.
; After splitting the v16f64 into two zmm halves, the low half's mask is all
; ones (plain vmovups); the high half keeps mask 85 == 0b01010101 (even lanes).
define <16 x double> @mload_constmask_v16f64_allones_split(ptr %addr, <16 x double> %dst) {
; SSE-LABEL: mload_constmask_v16f64_allones_split:
; SSE:       ## %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movups (%rsi), %xmm0
; SSE-NEXT:    movups 16(%rsi), %xmm1
; SSE-NEXT:    movups 32(%rsi), %xmm2
; SSE-NEXT:    movups 48(%rsi), %xmm3
; SSE-NEXT:    movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
; SSE-NEXT:    movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
; SSE-NEXT:    movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
; SSE-NEXT:    movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3]
; SSE-NEXT:    movaps %xmm7, 112(%rdi)
; SSE-NEXT:    movaps %xmm6, 96(%rdi)
; SSE-NEXT:    movaps %xmm5, 80(%rdi)
; SSE-NEXT:    movaps %xmm4, 64(%rdi)
; SSE-NEXT:    movaps %xmm3, 48(%rdi)
; SSE-NEXT:    movaps %xmm2, 32(%rdi)
; SSE-NEXT:    movaps %xmm1, 16(%rdi)
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1OR2-LABEL: mload_constmask_v16f64_allones_split:
; AVX1OR2:       ## %bb.0:
; AVX1OR2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
; AVX1OR2-NEXT:    ## ymm0 = mem[0,1,0,1]
; AVX1OR2-NEXT:    vmaskmovpd 64(%rdi), %ymm0, %ymm1
; AVX1OR2-NEXT:    vblendpd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
; AVX1OR2-NEXT:    vmaskmovpd 96(%rdi), %ymm0, %ymm0
; AVX1OR2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
; AVX1OR2-NEXT:    vmovups (%rdi), %ymm0
; AVX1OR2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v16f64_allones_split:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    movb $85, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovupd 64(%rdi), %zmm1 {%k1}
; AVX512F-NEXT:    vmovups (%rdi), %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: mload_constmask_v16f64_allones_split:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    movb $85, %al
; AVX512VLDQ-NEXT:    kmovw %eax, %k1
; AVX512VLDQ-NEXT:    vmovupd 64(%rdi), %zmm1 {%k1}
; AVX512VLDQ-NEXT:    vmovups (%rdi), %zmm0
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: mload_constmask_v16f64_allones_split:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    movb $85, %al
; AVX512VLBW-NEXT:    kmovd %eax, %k1
; AVX512VLBW-NEXT:    vmovupd 64(%rdi), %zmm1 {%k1}
; AVX512VLBW-NEXT:    vmovups (%rdi), %zmm0
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: mload_constmask_v16f64_allones_split:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movb $85, %cl
; X86-AVX512-NEXT:    kmovd %ecx, %k1
; X86-AVX512-NEXT:    vmovupd 64(%eax), %zmm1 {%k1}
; X86-AVX512-NEXT:    vmovups (%eax), %zmm0
; X86-AVX512-NEXT:    retl
  %res = call <16 x double> @llvm.masked.load.v16f64.p0(ptr %addr, i32 4, <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <16 x double> %dst)
  ret <16 x double> %res
}

; If the pass-through operand is undef, no blend is needed.

; Mask <1,1,1,0> with an undef pass-through: the raw masked load (or its
; zero-masking {z} AVX512 form) is emitted with no trailing blend.
define <4 x double> @mload_constmask_v4f64_undef_passthrough(ptr %addr) {
; SSE-LABEL: mload_constmask_v4f64_undef_passthrough:
; SSE:       ## %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX1OR2-LABEL: mload_constmask_v4f64_undef_passthrough:
; AVX1OR2:       ## %bb.0:
; AVX1OR2-NEXT:    vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
; AVX1OR2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v4f64_undef_passthrough:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    movb $7, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: mload_constmask_v4f64_undef_passthrough:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    movb $7, %al
; AVX512VLDQ-NEXT:    kmovw %eax, %k1
; AVX512VLDQ-NEXT:    vmovupd (%rdi), %ymm0 {%k1} {z}
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: mload_constmask_v4f64_undef_passthrough:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    movb $7, %al
; AVX512VLBW-NEXT:    kmovd %eax, %k1
; AVX512VLBW-NEXT:    vmovupd (%rdi), %ymm0 {%k1} {z}
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: mload_constmask_v4f64_undef_passthrough:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movb $7, %cl
; X86-AVX512-NEXT:    kmovd %ecx, %k1
; X86-AVX512-NEXT:    vmovupd (%eax), %ymm0 {%k1} {z}
; X86-AVX512-NEXT:    retl
  %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> undef)
  ret <4 x double> %res
}

; Integer variant: mask <0,1,1,0> (immediate 6) with an undef pass-through.
define <4 x i64> @mload_constmask_v4i64_undef_passthrough(ptr %addr) {
; SSE-LABEL: mload_constmask_v4i64_undef_passthrough:
; SSE:       ## %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vmovapd {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
; AVX2-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    movb $6, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    movb $6, %al
; AVX512VLDQ-NEXT:    kmovw %eax, %k1
; AVX512VLDQ-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1} {z}
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    movb $6, %al
; AVX512VLBW-NEXT:    kmovd %eax, %k1
; AVX512VLBW-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1} {z}
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: mload_constmask_v4i64_undef_passthrough:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movb $6, %cl
; X86-AVX512-NEXT:    kmovd %ecx, %k1
; X86-AVX512-NEXT:    vmovdqu64 (%eax), %ymm0 {%k1} {z}
; X86-AVX512-NEXT:    retl
  %res = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i64> undef)
  ret <4 x i64> %res
}

; When only one element of the mask is set, reduce to a scalar load.

define <4 x i32> @load_one_mask_bit_set1(ptr %addr, <4 x i32> %val) {
; SSE2-LABEL: load_one_mask_bit_set1:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    retq
;
; SSE42-LABEL: load_one_mask_bit_set1:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    pinsrd $0, (%rdi), %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: load_one_mask_bit_set1:
; AVX:       ## %bb.0:
; AVX-NEXT:    vpinsrd $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; X86-AVX512-LABEL: load_one_mask_bit_set1:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vpinsrd $0, (%eax), %xmm0, %xmm0
; X86-AVX512-NEXT:    retl
  %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %val)
  ret <4 x i32> %res
}

; Choose a different element to show that the correct address offset is produced.

; Single mask bit at element 2 of v4f32: the scalar load uses the element's
; byte offset (inserted via insertps from memory).
define <4 x float> @load_one_mask_bit_set2(ptr %addr, <4 x float> %val) {
; SSE2-LABEL: load_one_mask_bit_set2:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSE42-LABEL: load_one_mask_bit_set2:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE42-NEXT:    retq
;
; AVX-LABEL: load_one_mask_bit_set2:
; AVX:       ## %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT:    retq
;
; X86-AVX512-LABEL: load_one_mask_bit_set2:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X86-AVX512-NEXT:    retl
  %res = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x float> %val)
  ret <4 x float> %res
}

; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.

; Single mask bit at element 2 of v4i64: the scalar load lands in the high
; 128-bit half, at byte offset 16.
define <4 x i64> @load_one_mask_bit_set3(ptr %addr, <4 x i64> %val) {
; SSE2-LABEL: load_one_mask_bit_set3:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSE42-LABEL: load_one_mask_bit_set3:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    pinsrq $0, 16(%rdi), %xmm1
; SSE42-NEXT:    retq
;
; AVX-LABEL: load_one_mask_bit_set3:
; AVX:       ## %bb.0:
; AVX-NEXT:    vbroadcastsd 16(%rdi), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT:    retq
;
; X86-AVX512-LABEL: load_one_mask_bit_set3:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastsd 16(%eax), %ymm1
; X86-AVX512-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; X86-AVX512-NEXT:    retl
  %res = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val)
  ret <4 x i64> %res
}

; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.

; Single mask bit at element 3 of v4f64 (byte offset 24).
define <4 x double> @load_one_mask_bit_set4(ptr %addr, <4 x double> %val) {
; SSE-LABEL: load_one_mask_bit_set4:
; SSE:       ## %bb.0:
; SSE-NEXT:    movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: load_one_mask_bit_set4:
; AVX:       ## %bb.0:
; AVX-NEXT:    vbroadcastsd 24(%rdi), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT:    retq
;
; X86-AVX512-LABEL: load_one_mask_bit_set4:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastsd 24(%eax), %ymm1
; X86-AVX512-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; X86-AVX512-NEXT:    retl
  %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x double> %val)
  ret <4 x double> %res
}

; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.

; Single mask bit at element 7 of v8f64: AVX512 uses a masked broadcast with
; mask -128 (bit 7 only); pre-AVX512 only the last ymm half is touched.
define <8 x double> @load_one_mask_bit_set5(ptr %addr, <8 x double> %val) {
; SSE-LABEL: load_one_mask_bit_set5:
; SSE:       ## %bb.0:
; SSE-NEXT:    movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
; SSE-NEXT:    retq
;
; AVX1OR2-LABEL: load_one_mask_bit_set5:
; AVX1OR2:       ## %bb.0:
; AVX1OR2-NEXT:    vbroadcastsd 56(%rdi), %ymm2
; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: load_one_mask_bit_set5:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    movb $-128, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vbroadcastsd 56(%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: load_one_mask_bit_set5:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    movb $-128, %al
; AVX512VLDQ-NEXT:    kmovw %eax, %k1
; AVX512VLDQ-NEXT:    vbroadcastsd 56(%rdi), %zmm0 {%k1}
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: load_one_mask_bit_set5:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    movb $-128, %al
; AVX512VLBW-NEXT:    kmovd %eax, %k1
; AVX512VLBW-NEXT:    vbroadcastsd 56(%rdi), %zmm0 {%k1}
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: load_one_mask_bit_set5:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movb $-128, %cl
; X86-AVX512-NEXT:    kmovd %ecx, %k1
; X86-AVX512-NEXT:    vbroadcastsd 56(%eax), %zmm0 {%k1}
; X86-AVX512-NEXT:    retl
  %res = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %addr, i32 4, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
  ret <8 x double> %res
}

; v16i64 is returned indirectly (sret pointer in %rdi, data pointer in %rsi on
; x86-64). Mask bits 2, 10 and 13: 4 == bit 2 and 36 == bits 2 and 5 of the
; second zmm half.
define <16 x i64> @load_one_mask_bit_set6(ptr %addr, <16 x i64> %val) {
; SSE2-LABEL: load_one_mask_bit_set6:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT:    movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
; SSE2-NEXT:    movsd {{.*#+}} xmm8 = mem[0],zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0]
; SSE2-NEXT:    movaps %xmm7, 112(%rdi)
; SSE2-NEXT:    movaps %xmm5, 80(%rdi)
; SSE2-NEXT:    movaps %xmm4, 64(%rdi)
; SSE2-NEXT:    movaps %xmm3, 48(%rdi)
; SSE2-NEXT:    movaps %xmm2, 32(%rdi)
; SSE2-NEXT:    movaps %xmm1, 16(%rdi)
; SSE2-NEXT:    movaps %xmm0, (%rdi)
; SSE2-NEXT:    movaps %xmm6, 96(%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: load_one_mask_bit_set6:
; SSE42:       ## %bb.0:
; SSE42-NEXT:    movq %rdi, %rax
; SSE42-NEXT:    pinsrq $0, 16(%rsi), %xmm1
; SSE42-NEXT:    pinsrq $0, 80(%rsi), %xmm5
; SSE42-NEXT:    pinsrq $1, 104(%rsi), %xmm6
; SSE42-NEXT:    movaps %xmm7, 112(%rdi)
; SSE42-NEXT:    movdqa %xmm6, 96(%rdi)
; SSE42-NEXT:    movdqa %xmm5, 80(%rdi)
; SSE42-NEXT:    movaps %xmm4, 64(%rdi)
; SSE42-NEXT:    movaps %xmm3, 48(%rdi)
; SSE42-NEXT:    movaps %xmm2, 32(%rdi)
; SSE42-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE42-NEXT:    movaps %xmm0, (%rdi)
; SSE42-NEXT:    retq
;
; AVX1-LABEL: load_one_mask_bit_set6:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vmovapd {{.*#+}} ymm4 = [0,0,18446744073709551615,0]
; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm4, %ymm5
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3]
; AVX1-NEXT:    vmaskmovpd 64(%rdi), %ymm4, %ymm4
; AVX1-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3]
; AVX1-NEXT:    vmovapd {{.*#+}} ymm4 = [0,18446744073709551615,0,0]
; AVX1-NEXT:    vmaskmovpd 96(%rdi), %ymm4, %ymm4
; AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_one_mask_bit_set6:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [0,0,18446744073709551615,0]
; AVX2-NEXT:    vpmaskmovq (%rdi), %ymm4, %ymm5
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7]
; AVX2-NEXT:    vpmaskmovq 64(%rdi), %ymm4, %ymm4
; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
; AVX2-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [0,18446744073709551615,0,0]
; AVX2-NEXT:    vpmaskmovq 96(%rdi), %ymm4, %ymm4
; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: load_one_mask_bit_set6:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    movb $4, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vpbroadcastq 16(%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    movb $36, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1 {%k1}
; AVX512F-NEXT:    retq
;
; AVX512VLDQ-LABEL: load_one_mask_bit_set6:
; AVX512VLDQ:       ## %bb.0:
; AVX512VLDQ-NEXT:    movb $4, %al
; AVX512VLDQ-NEXT:    kmovw %eax, %k1
; AVX512VLDQ-NEXT:    vpbroadcastq 16(%rdi), %zmm0 {%k1}
; AVX512VLDQ-NEXT:    movb $36, %al
; AVX512VLDQ-NEXT:    kmovw %eax, %k1
; AVX512VLDQ-NEXT:    vmovdqu64 64(%rdi), %zmm1 {%k1}
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: load_one_mask_bit_set6:
; AVX512VLBW:       ## %bb.0:
; AVX512VLBW-NEXT:    movb $4, %al
; AVX512VLBW-NEXT:    kmovd %eax, %k1
; AVX512VLBW-NEXT:    vpbroadcastq 16(%rdi), %zmm0 {%k1}
; AVX512VLBW-NEXT:    movb $36, %al
; AVX512VLBW-NEXT:    kmovd %eax, %k1
; AVX512VLBW-NEXT:    vmovdqu64 64(%rdi), %zmm1 {%k1}
; AVX512VLBW-NEXT:    retq
;
; X86-AVX512-LABEL: load_one_mask_bit_set6:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movb $4, %cl
; X86-AVX512-NEXT:    kmovd %ecx, %k1
; X86-AVX512-NEXT:    vbroadcastsd 16(%eax), %zmm0 {%k1}
; X86-AVX512-NEXT:    movb $36, %cl
; X86-AVX512-NEXT:    kmovd %ecx, %k1
; X86-AVX512-NEXT:    vmovdqu64 64(%eax), %zmm1 {%k1}
; X86-AVX512-NEXT:    retl
  %res = call <16 x i64> @llvm.masked.load.v16i64.p0(ptr %addr, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false>, <16 x i64> %val)
  ret <16 x i64> %res
}

; v1i32 masked load with a variable mask: lowers to a branch around a scalar
; load (the cond.load / else blocks below).
define i32 @pr38986(i1 %c, ptr %p) {
; SSE-LABEL: pr38986:
; SSE:       ## %bb.0:
; SSE-NEXT:    testb $1, %dil
; SSE-NEXT:    ## implicit-def: $eax
; SSE-NEXT:    je LBB45_2
; SSE-NEXT:    ## %bb.1: ## %cond.load
; SSE-NEXT:    movl (%rsi), %eax
; SSE-NEXT:    LBB45_2: ## %else
; SSE-NEXT:    retq
;
; AVX-LABEL: pr38986:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    ## implicit-def: $eax
; AVX-NEXT:    je LBB45_2
; AVX-NEXT:    ## %bb.1: ## %cond.load
; AVX-NEXT:    movl (%rsi), %eax
; AVX-NEXT:    LBB45_2: ## %else
; AVX-NEXT:    retq
;
; X86-AVX512-LABEL: pr38986:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-AVX512-NEXT:    ## implicit-def: $eax
; X86-AVX512-NEXT:    je LBB45_2
; X86-AVX512-NEXT:    ## %bb.1: ## %cond.load
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl (%eax), %eax
; X86-AVX512-NEXT:    LBB45_2: ## %else
; X86-AVX512-NEXT:    retl
  %vc = insertelement <1 x i1> undef, i1 %c, i32 0
  %L = call <1 x i32> @llvm.masked.load.v1i32.p0 (ptr %p, i32 4, <1 x i1> %vc, <1 x i32> %vc, <1 x i32> undef)
  %ret = bitcast <1 x i32> %L to i32
  ret i32 %ret
}

; An all-zero mask folds away completely: no load is emitted and %dst is
; returned unchanged.
define <2 x double> @zero_mask(ptr %addr, <2 x double> %dst) {
; SSE-LABEL: zero_mask:
; SSE:       ## %bb.0:
; SSE-NEXT:    retq
;
; AVX-LABEL: zero_mask:
; AVX:       ## %bb.0:
; AVX-NEXT:    retq
;
; X86-AVX512-LABEL: zero_mask:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    retl
  %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> zeroinitializer, <2 x double> %dst)
  ret <2 x double> %res
}

declare <16 x double> @llvm.masked.load.v16f64.p0(ptr, i32, <16 x i1>, <16 x double>)
declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>)
declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>)
declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>)
declare <1 x double> @llvm.masked.load.v1f64.p0(ptr, i32, <1 x i1>, <1 x double>)

declare <16 x float> @llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>)
declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>)
declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)
declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>)

declare <16 x i64> @llvm.masked.load.v16i64.p0(ptr, i32, <16 x i1>, <16 x i64>)
declare <8 x i64> @llvm.masked.load.v8i64.p0(ptr, i32, <8 x i1>, <8 x i64>)
declare <4 x i64> @llvm.masked.load.v4i64.p0(ptr, i32, <4 x i1>, <4 x i64>)
declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32, <2 x i1>, <2 x i64>)
declare <1 x i64> @llvm.masked.load.v1i64.p0(ptr, i32, <1 x i1>, <1 x i64>)

declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>)
declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>)
declare <1 x i32> @llvm.masked.load.v1i32.p0(ptr, i32, <1 x i1>, <1 x i32>)

declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32, <32 x i1>, <32 x i16>)
declare <16 x i16> @llvm.masked.load.v16i16.p0(ptr, i32, <16 x i1>, <16 x i16>)
declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>)
declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>)

declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32, <64 x i1>, <64 x i8>)
declare <32 x i8> @llvm.masked.load.v32i8.p0(ptr, i32, <32 x i1>, <32 x i8>)
declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>)
declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>)