1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-sse2 | FileCheck %s --check-prefixes=ALL,SCALAR 3; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSE2-ONLY 4; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse3 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSE3 5; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSSE3-ONLY 6; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSE41 7; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSE42 8; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX1 9; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX2-ONLY 10; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX512,AVX512F 11; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX512,AVX512BW 12 13define void @vec32_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 14; SCALAR-LABEL: vec32_v2i8: 15; SCALAR: # %bb.0: 16; SCALAR-NEXT: movzbl (%rdi), %eax 17; SCALAR-NEXT: movzbl 1(%rdi), %ecx 18; SCALAR-NEXT: notb %al 19; SCALAR-NEXT: notb %cl 20; SCALAR-NEXT: movb %cl, 1(%rsi) 21; SCALAR-NEXT: movb %al, (%rsi) 22; SCALAR-NEXT: movb %cl, 1(%rdx) 23; SCALAR-NEXT: movb %al, (%rdx) 24; SCALAR-NEXT: movb %cl, 3(%rdx) 25; SCALAR-NEXT: movb %al, 2(%rdx) 26; SCALAR-NEXT: retq 27; 28; SSE-LABEL: vec32_v2i8: 29; SSE: # %bb.0: 30; SSE-NEXT: movl (%rdi), %eax 31; SSE-NEXT: notl %eax 32; SSE-NEXT: movw %ax, (%rsi) 33; SSE-NEXT: movw %ax, (%rdx) 34; SSE-NEXT: movw %ax, 2(%rdx) 35; SSE-NEXT: retq 36 %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64 37 %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1> 38 store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 39 %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0 40 store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 41 %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1 42 store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 43 ret void 44} 45 46define void @vec64_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 47; SCALAR-LABEL: vec64_v2i8: 48; SCALAR: # %bb.0: 49; SCALAR-NEXT: movzbl (%rdi), %eax 50; SCALAR-NEXT: movzbl 1(%rdi), %ecx 51; SCALAR-NEXT: notb %al 52; SCALAR-NEXT: notb %cl 53; SCALAR-NEXT: movb %cl, 1(%rsi) 54; SCALAR-NEXT: movb %al, (%rsi) 55; SCALAR-NEXT: movb %cl, 1(%rdx) 56; SCALAR-NEXT: movb %al, (%rdx) 57; SCALAR-NEXT: movb %cl, 3(%rdx) 58; SCALAR-NEXT: movb %al, 2(%rdx) 59; SCALAR-NEXT: movb %cl, 5(%rdx) 60; SCALAR-NEXT: movb %al, 4(%rdx) 61; SCALAR-NEXT: movb %cl, 7(%rdx) 62; SCALAR-NEXT: movb %al, 6(%rdx) 63; SCALAR-NEXT: retq 64; 65; SSE-LABEL: vec64_v2i8: 66; SSE: # %bb.0: 67; SSE-NEXT: movl (%rdi), %eax 68; SSE-NEXT: notl %eax 69; SSE-NEXT: movw %ax, (%rsi) 70; SSE-NEXT: movw %ax, (%rdx) 71; SSE-NEXT: movw %ax, 2(%rdx) 72; SSE-NEXT: movw %ax, 4(%rdx) 73; SSE-NEXT: movw %ax, 6(%rdx) 74; SSE-NEXT: retq 75 %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64 76 %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1> 77 store <2 
x i8> %in.subvec, ptr %out.subvec.ptr, align 64 78 %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0 79 store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 80 %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1 81 store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 82 %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2 83 store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4 84 %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3 85 store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2 86 ret void 87} 88 89define void @vec64_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 90; SCALAR-LABEL: vec64_v2i16: 91; SCALAR: # %bb.0: 92; SCALAR-NEXT: movzwl 2(%rdi), %eax 93; SCALAR-NEXT: movl (%rdi), %ecx 94; SCALAR-NEXT: notl %ecx 95; SCALAR-NEXT: notl %eax 96; SCALAR-NEXT: movw %ax, 2(%rsi) 97; SCALAR-NEXT: movw %cx, (%rsi) 98; SCALAR-NEXT: movw %ax, 2(%rdx) 99; SCALAR-NEXT: movw %cx, (%rdx) 100; SCALAR-NEXT: movw %ax, 6(%rdx) 101; SCALAR-NEXT: movw %cx, 4(%rdx) 102; SCALAR-NEXT: retq 103; 104; SSE-LABEL: vec64_v2i16: 105; SSE: # %bb.0: 106; SSE-NEXT: movl (%rdi), %eax 107; SSE-NEXT: notl %eax 108; SSE-NEXT: movl %eax, (%rsi) 109; SSE-NEXT: movl %eax, (%rdx) 110; SSE-NEXT: movl %eax, 4(%rdx) 111; SSE-NEXT: retq 112 %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64 113 %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1> 114 store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 115 %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0 116 store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 117 %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1 118 store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4 119 ret void 120} 121 122define void @vec64_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 123; SCALAR-LABEL: vec64_v4i8: 124; SCALAR: # %bb.0: 125; SCALAR-NEXT: movzbl 3(%rdi), %eax 126; SCALAR-NEXT: movzbl 2(%rdi), %ecx 127; SCALAR-NEXT: movzbl (%rdi), %r8d 128; SCALAR-NEXT: movzbl 1(%rdi), %edi 129; SCALAR-NEXT: notb %r8b 130; SCALAR-NEXT: notb %dil 131; SCALAR-NEXT: notb %cl 132; SCALAR-NEXT: notb %al 133; SCALAR-NEXT: movb %al, 3(%rsi) 134; SCALAR-NEXT: movb %cl, 2(%rsi) 135; SCALAR-NEXT: movb %dil, 1(%rsi) 136; SCALAR-NEXT: movb %r8b, (%rsi) 137; SCALAR-NEXT: movb %al, 3(%rdx) 138; SCALAR-NEXT: movb %cl, 2(%rdx) 139; SCALAR-NEXT: movb %dil, 1(%rdx) 140; SCALAR-NEXT: movb %r8b, (%rdx) 141; SCALAR-NEXT: movb %al, 7(%rdx) 142; SCALAR-NEXT: movb %cl, 6(%rdx) 143; SCALAR-NEXT: movb %dil, 5(%rdx) 144; SCALAR-NEXT: movb %r8b, 4(%rdx) 145; SCALAR-NEXT: retq 146; 147; SSE-LABEL: vec64_v4i8: 148; SSE: # %bb.0: 149; SSE-NEXT: movl (%rdi), %eax 150; SSE-NEXT: notl %eax 151; SSE-NEXT: movl %eax, (%rsi) 152; SSE-NEXT: movl %eax, (%rdx) 153; SSE-NEXT: movl %eax, 4(%rdx) 154; SSE-NEXT: retq 155 %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64 156 %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1> 157 store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 158 %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0 159 store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 160 %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1 161 store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4 162 ret void 163} 164 165define void @vec128_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 166; SCALAR-LABEL: vec128_v2i8: 167; SCALAR: # %bb.0: 
168; SCALAR-NEXT: movzbl (%rdi), %eax 169; SCALAR-NEXT: movzbl 1(%rdi), %ecx 170; SCALAR-NEXT: notb %al 171; SCALAR-NEXT: notb %cl 172; SCALAR-NEXT: movb %cl, 1(%rsi) 173; SCALAR-NEXT: movb %al, (%rsi) 174; SCALAR-NEXT: movb %cl, 1(%rdx) 175; SCALAR-NEXT: movb %al, (%rdx) 176; SCALAR-NEXT: movb %cl, 3(%rdx) 177; SCALAR-NEXT: movb %al, 2(%rdx) 178; SCALAR-NEXT: movb %cl, 5(%rdx) 179; SCALAR-NEXT: movb %al, 4(%rdx) 180; SCALAR-NEXT: movb %cl, 7(%rdx) 181; SCALAR-NEXT: movb %al, 6(%rdx) 182; SCALAR-NEXT: movb %cl, 9(%rdx) 183; SCALAR-NEXT: movb %al, 8(%rdx) 184; SCALAR-NEXT: movb %cl, 11(%rdx) 185; SCALAR-NEXT: movb %al, 10(%rdx) 186; SCALAR-NEXT: movb %cl, 13(%rdx) 187; SCALAR-NEXT: movb %al, 12(%rdx) 188; SCALAR-NEXT: movb %cl, 15(%rdx) 189; SCALAR-NEXT: movb %al, 14(%rdx) 190; SCALAR-NEXT: retq 191; 192; SSE2-ONLY-LABEL: vec128_v2i8: 193; SSE2-ONLY: # %bb.0: 194; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 195; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0 196; SSE2-ONLY-NEXT: movd %xmm0, %eax 197; SSE2-ONLY-NEXT: movw %ax, (%rsi) 198; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 199; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 200; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx) 201; SSE2-ONLY-NEXT: retq 202; 203; SSE3-LABEL: vec128_v2i8: 204; SSE3: # %bb.0: 205; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 206; SSE3-NEXT: pxor (%rdi), %xmm0 207; SSE3-NEXT: movd %xmm0, %eax 208; SSE3-NEXT: movw %ax, (%rsi) 209; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 210; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 211; SSE3-NEXT: movdqa %xmm0, (%rdx) 212; SSE3-NEXT: retq 213; 214; SSSE3-ONLY-LABEL: vec128_v2i8: 215; SSSE3-ONLY: # %bb.0: 216; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 217; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0 218; SSSE3-ONLY-NEXT: movd %xmm0, %eax 219; SSSE3-ONLY-NEXT: movw %ax, (%rsi) 220; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 221; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 222; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx) 223; SSSE3-ONLY-NEXT: retq 224; 225; SSE41-LABEL: vec128_v2i8: 226; SSE41: # %bb.0: 227; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 228; SSE41-NEXT: pxor (%rdi), %xmm0 229; SSE41-NEXT: pextrw $0, %xmm0, (%rsi) 230; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 231; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 232; SSE41-NEXT: movdqa %xmm0, (%rdx) 233; SSE41-NEXT: retq 234; 235; SSE42-LABEL: vec128_v2i8: 236; SSE42: # %bb.0: 237; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 238; SSE42-NEXT: pxor (%rdi), %xmm0 239; SSE42-NEXT: pextrw $0, %xmm0, (%rsi) 240; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 241; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 242; SSE42-NEXT: movdqa %xmm0, (%rdx) 243; SSE42-NEXT: retq 244; 245; AVX1-LABEL: vec128_v2i8: 246; AVX1: # %bb.0: 247; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 248; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 249; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi) 250; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 251; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 252; AVX1-NEXT: vmovdqa %xmm0, (%rdx) 253; AVX1-NEXT: retq 254; 255; AVX2-LABEL: vec128_v2i8: 256; AVX2: # %bb.0: 257; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 258; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 259; AVX2-NEXT: vpextrw $0, %xmm0, (%rsi) 260; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 261; AVX2-NEXT: vmovdqa %xmm0, (%rdx) 262; AVX2-NEXT: retq 263 %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64 264 %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1> 265 store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 266 
%out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0 267 store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 268 %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1 269 store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 270 %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2 271 store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4 272 %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3 273 store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2 274 %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4 275 store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8 276 %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5 277 store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2 278 %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6 279 store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4 280 %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7 281 store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2 282 ret void 283} 284 285define void @vec128_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 286; SCALAR-LABEL: vec128_v2i16: 287; SCALAR: # %bb.0: 288; SCALAR-NEXT: movzwl 2(%rdi), %eax 289; SCALAR-NEXT: movl (%rdi), %ecx 290; SCALAR-NEXT: notl %ecx 291; SCALAR-NEXT: notl %eax 292; SCALAR-NEXT: movw %ax, 2(%rsi) 293; SCALAR-NEXT: movw %cx, (%rsi) 294; SCALAR-NEXT: movw %ax, 2(%rdx) 295; SCALAR-NEXT: movw %cx, (%rdx) 296; SCALAR-NEXT: movw %ax, 6(%rdx) 297; SCALAR-NEXT: movw %cx, 4(%rdx) 298; SCALAR-NEXT: movw %ax, 10(%rdx) 299; SCALAR-NEXT: movw %cx, 8(%rdx) 300; SCALAR-NEXT: movw %ax, 14(%rdx) 301; SCALAR-NEXT: movw %cx, 12(%rdx) 302; SCALAR-NEXT: retq 303; 304; SSE2-LABEL: vec128_v2i16: 305; SSE2: # %bb.0: 306; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 307; SSE2-NEXT: pxor (%rdi), %xmm0 308; SSE2-NEXT: movd %xmm0, (%rsi) 309; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 310; SSE2-NEXT: movdqa %xmm0, (%rdx) 311; SSE2-NEXT: retq 312; 313; AVX1-LABEL: vec128_v2i16: 314; AVX1: # %bb.0: 315; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 316; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 317; AVX1-NEXT: vmovd %xmm0, (%rsi) 318; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 319; AVX1-NEXT: vmovdqa %xmm0, (%rdx) 320; AVX1-NEXT: retq 321; 322; AVX2-LABEL: vec128_v2i16: 323; AVX2: # %bb.0: 324; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 325; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 326; AVX2-NEXT: vmovd %xmm0, (%rsi) 327; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 328; AVX2-NEXT: vmovdqa %xmm0, (%rdx) 329; AVX2-NEXT: retq 330 %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64 331 %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1> 332 store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 333 %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0 334 store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 335 %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1 336 store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4 337 %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2 338 store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8 339 %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3 340 store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4 341 ret void 342} 343 344define void @vec128_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 345; SCALAR-LABEL: vec128_v2i32: 346; SCALAR: # %bb.0: 347; SCALAR-NEXT: movl (%rdi), %eax 348; SCALAR-NEXT: 
movl 4(%rdi), %ecx 349; SCALAR-NEXT: notl %eax 350; SCALAR-NEXT: notl %ecx 351; SCALAR-NEXT: movl %ecx, 4(%rsi) 352; SCALAR-NEXT: movl %eax, (%rsi) 353; SCALAR-NEXT: movl %ecx, 4(%rdx) 354; SCALAR-NEXT: movl %eax, (%rdx) 355; SCALAR-NEXT: movl %ecx, 12(%rdx) 356; SCALAR-NEXT: movl %eax, 8(%rdx) 357; SCALAR-NEXT: retq 358; 359; SSE2-LABEL: vec128_v2i32: 360; SSE2: # %bb.0: 361; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 362; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 363; SSE2-NEXT: pxor %xmm0, %xmm1 364; SSE2-NEXT: movq %xmm1, (%rsi) 365; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] 366; SSE2-NEXT: movdqa %xmm0, (%rdx) 367; SSE2-NEXT: retq 368; 369; AVX1-LABEL: vec128_v2i32: 370; AVX1: # %bb.0: 371; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 372; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 373; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 374; AVX1-NEXT: vmovq %xmm0, (%rsi) 375; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 376; AVX1-NEXT: vmovdqa %xmm0, (%rdx) 377; AVX1-NEXT: retq 378; 379; AVX2-ONLY-LABEL: vec128_v2i32: 380; AVX2-ONLY: # %bb.0: 381; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 382; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 383; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 384; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) 385; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0 386; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx) 387; AVX2-ONLY-NEXT: retq 388; 389; AVX512-LABEL: vec128_v2i32: 390; AVX512: # %bb.0: 391; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 392; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 393; AVX512-NEXT: vmovq %xmm0, (%rsi) 394; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 395; AVX512-NEXT: vmovdqa %xmm0, (%rdx) 396; AVX512-NEXT: retq 397 %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 398 %in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1> 399 store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 400 %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0 401 store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 402 %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1 403 store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8 404 ret void 405} 406 407define void @vec128_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 408; SCALAR-LABEL: vec128_v2f32: 409; SCALAR: # %bb.0: 410; SCALAR-NEXT: movl (%rdi), %eax 411; SCALAR-NEXT: movl 4(%rdi), %ecx 412; SCALAR-NEXT: notl %eax 413; SCALAR-NEXT: notl %ecx 414; SCALAR-NEXT: movl %ecx, 4(%rsi) 415; SCALAR-NEXT: movl %eax, (%rsi) 416; SCALAR-NEXT: movl %ecx, 4(%rdx) 417; SCALAR-NEXT: movl %eax, (%rdx) 418; SCALAR-NEXT: movl %ecx, 12(%rdx) 419; SCALAR-NEXT: movl %eax, 8(%rdx) 420; SCALAR-NEXT: retq 421; 422; SSE2-LABEL: vec128_v2f32: 423; SSE2: # %bb.0: 424; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 425; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 426; SSE2-NEXT: pxor %xmm0, %xmm1 427; SSE2-NEXT: movq %xmm1, (%rsi) 428; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] 429; SSE2-NEXT: movdqa %xmm0, (%rdx) 430; SSE2-NEXT: retq 431; 432; AVX1-LABEL: vec128_v2f32: 433; AVX1: # %bb.0: 434; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 435; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 436; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 437; AVX1-NEXT: vmovq %xmm0, (%rsi) 438; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 439; AVX1-NEXT: vmovdqa %xmm0, (%rdx) 440; AVX1-NEXT: retq 441; 442; AVX2-ONLY-LABEL: vec128_v2f32: 443; AVX2-ONLY: # %bb.0: 444; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 445; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 446; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, 
%xmm0 447; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) 448; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0 449; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx) 450; AVX2-ONLY-NEXT: retq 451; 452; AVX512-LABEL: vec128_v2f32: 453; AVX512: # %bb.0: 454; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 455; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 456; AVX512-NEXT: vmovq %xmm0, (%rsi) 457; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 458; AVX512-NEXT: vmovdqa %xmm0, (%rdx) 459; AVX512-NEXT: retq 460 %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 461 %in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1> 462 %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> 463 store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64 464 %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0 465 store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 466 %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1 467 store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8 468 ret void 469} 470 471define void @vec128_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 472; SCALAR-LABEL: vec128_v4i8: 473; SCALAR: # %bb.0: 474; SCALAR-NEXT: movzbl 3(%rdi), %eax 475; SCALAR-NEXT: movzbl 2(%rdi), %ecx 476; SCALAR-NEXT: movzbl (%rdi), %r8d 477; SCALAR-NEXT: movzbl 1(%rdi), %edi 478; SCALAR-NEXT: notb %r8b 479; SCALAR-NEXT: notb %dil 480; SCALAR-NEXT: notb %cl 481; SCALAR-NEXT: notb %al 482; SCALAR-NEXT: movb %al, 3(%rsi) 483; SCALAR-NEXT: movb %cl, 2(%rsi) 484; SCALAR-NEXT: movb %dil, 1(%rsi) 485; SCALAR-NEXT: movb %r8b, (%rsi) 486; SCALAR-NEXT: movb %al, 3(%rdx) 487; SCALAR-NEXT: movb %cl, 2(%rdx) 488; SCALAR-NEXT: movb %dil, 1(%rdx) 489; SCALAR-NEXT: movb %r8b, (%rdx) 490; SCALAR-NEXT: movb %al, 7(%rdx) 491; SCALAR-NEXT: movb %cl, 6(%rdx) 492; SCALAR-NEXT: movb %dil, 5(%rdx) 493; SCALAR-NEXT: movb %r8b, 4(%rdx) 494; SCALAR-NEXT: movb %al, 11(%rdx) 495; SCALAR-NEXT: movb %cl, 10(%rdx) 496; SCALAR-NEXT: movb %dil, 9(%rdx) 497; SCALAR-NEXT: movb %r8b, 8(%rdx) 498; SCALAR-NEXT: movb %al, 15(%rdx) 499; SCALAR-NEXT: movb %cl, 14(%rdx) 500; SCALAR-NEXT: movb %dil, 13(%rdx) 501; SCALAR-NEXT: movb %r8b, 12(%rdx) 502; SCALAR-NEXT: retq 503; 504; SSE2-LABEL: vec128_v4i8: 505; SSE2: # %bb.0: 506; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 507; SSE2-NEXT: pxor (%rdi), %xmm0 508; SSE2-NEXT: movd %xmm0, (%rsi) 509; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 510; SSE2-NEXT: movdqa %xmm0, (%rdx) 511; SSE2-NEXT: retq 512; 513; AVX1-LABEL: vec128_v4i8: 514; AVX1: # %bb.0: 515; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 516; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 517; AVX1-NEXT: vmovd %xmm0, (%rsi) 518; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 519; AVX1-NEXT: vmovdqa %xmm0, (%rdx) 520; AVX1-NEXT: retq 521; 522; AVX2-LABEL: vec128_v4i8: 523; AVX2: # %bb.0: 524; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 525; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 526; AVX2-NEXT: vmovd %xmm0, (%rsi) 527; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 528; AVX2-NEXT: vmovdqa %xmm0, (%rdx) 529; AVX2-NEXT: retq 530 %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64 531 %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1> 532 store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 533 %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0 534 store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 535 %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1 536 store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4 537 %out.subvec2.ptr = getelementptr 
<4 x i8>, ptr %out.vec.ptr, i64 2 538 store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8 539 %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3 540 store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4 541 ret void 542} 543 544define void @vec128_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 545; SCALAR-LABEL: vec128_v4i16: 546; SCALAR: # %bb.0: 547; SCALAR-NEXT: movzwl 6(%rdi), %eax 548; SCALAR-NEXT: movzwl 2(%rdi), %ecx 549; SCALAR-NEXT: movl (%rdi), %r8d 550; SCALAR-NEXT: movl 4(%rdi), %edi 551; SCALAR-NEXT: notl %r8d 552; SCALAR-NEXT: notl %ecx 553; SCALAR-NEXT: notl %edi 554; SCALAR-NEXT: notl %eax 555; SCALAR-NEXT: movw %ax, 6(%rsi) 556; SCALAR-NEXT: movw %di, 4(%rsi) 557; SCALAR-NEXT: movw %cx, 2(%rsi) 558; SCALAR-NEXT: movw %r8w, (%rsi) 559; SCALAR-NEXT: movw %ax, 6(%rdx) 560; SCALAR-NEXT: movw %di, 4(%rdx) 561; SCALAR-NEXT: movw %cx, 2(%rdx) 562; SCALAR-NEXT: movw %r8w, (%rdx) 563; SCALAR-NEXT: movw %ax, 14(%rdx) 564; SCALAR-NEXT: movw %di, 12(%rdx) 565; SCALAR-NEXT: movw %cx, 10(%rdx) 566; SCALAR-NEXT: movw %r8w, 8(%rdx) 567; SCALAR-NEXT: retq 568; 569; SSE2-LABEL: vec128_v4i16: 570; SSE2: # %bb.0: 571; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 572; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 573; SSE2-NEXT: pxor %xmm0, %xmm1 574; SSE2-NEXT: movq %xmm1, (%rsi) 575; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] 576; SSE2-NEXT: movdqa %xmm0, (%rdx) 577; SSE2-NEXT: retq 578; 579; AVX1-LABEL: vec128_v4i16: 580; AVX1: # %bb.0: 581; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 582; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 583; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 584; AVX1-NEXT: vmovq %xmm0, (%rsi) 585; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 586; AVX1-NEXT: vmovdqa %xmm0, (%rdx) 587; AVX1-NEXT: retq 588; 589; AVX2-ONLY-LABEL: vec128_v4i16: 590; AVX2-ONLY: # %bb.0: 591; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 592; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 593; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 594; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) 595; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0 596; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx) 597; AVX2-ONLY-NEXT: retq 598; 599; AVX512-LABEL: vec128_v4i16: 600; AVX512: # %bb.0: 601; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 602; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 603; AVX512-NEXT: vmovq %xmm0, (%rsi) 604; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 605; AVX512-NEXT: vmovdqa %xmm0, (%rdx) 606; AVX512-NEXT: retq 607 %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64 608 %in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1> 609 store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 610 %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0 611 store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 612 %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1 613 store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8 614 ret void 615} 616 617define void @vec128_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 618; SCALAR-LABEL: vec128_v8i8: 619; SCALAR: # %bb.0: 620; SCALAR-NEXT: pushq %rbx 621; SCALAR-NEXT: movzbl 7(%rdi), %ebx 622; SCALAR-NEXT: movzbl 6(%rdi), %r11d 623; SCALAR-NEXT: movzbl 5(%rdi), %r10d 624; SCALAR-NEXT: movzbl 4(%rdi), %r9d 625; SCALAR-NEXT: movzbl 3(%rdi), %r8d 626; SCALAR-NEXT: movzbl 2(%rdi), %ecx 627; SCALAR-NEXT: movzbl (%rdi), %eax 628; SCALAR-NEXT: movzbl 1(%rdi), %edi 629; SCALAR-NEXT: notb %al 630; SCALAR-NEXT: notb %dil 631; SCALAR-NEXT: notb %cl 632; 
SCALAR-NEXT: notb %r8b 633; SCALAR-NEXT: notb %r9b 634; SCALAR-NEXT: notb %r10b 635; SCALAR-NEXT: notb %r11b 636; SCALAR-NEXT: notb %bl 637; SCALAR-NEXT: movb %bl, 7(%rsi) 638; SCALAR-NEXT: movb %r11b, 6(%rsi) 639; SCALAR-NEXT: movb %r10b, 5(%rsi) 640; SCALAR-NEXT: movb %r9b, 4(%rsi) 641; SCALAR-NEXT: movb %r8b, 3(%rsi) 642; SCALAR-NEXT: movb %cl, 2(%rsi) 643; SCALAR-NEXT: movb %dil, 1(%rsi) 644; SCALAR-NEXT: movb %al, (%rsi) 645; SCALAR-NEXT: movb %bl, 7(%rdx) 646; SCALAR-NEXT: movb %r11b, 6(%rdx) 647; SCALAR-NEXT: movb %r10b, 5(%rdx) 648; SCALAR-NEXT: movb %r9b, 4(%rdx) 649; SCALAR-NEXT: movb %r8b, 3(%rdx) 650; SCALAR-NEXT: movb %cl, 2(%rdx) 651; SCALAR-NEXT: movb %dil, 1(%rdx) 652; SCALAR-NEXT: movb %al, (%rdx) 653; SCALAR-NEXT: movb %bl, 15(%rdx) 654; SCALAR-NEXT: movb %r11b, 14(%rdx) 655; SCALAR-NEXT: movb %r10b, 13(%rdx) 656; SCALAR-NEXT: movb %r9b, 12(%rdx) 657; SCALAR-NEXT: movb %r8b, 11(%rdx) 658; SCALAR-NEXT: movb %cl, 10(%rdx) 659; SCALAR-NEXT: movb %dil, 9(%rdx) 660; SCALAR-NEXT: movb %al, 8(%rdx) 661; SCALAR-NEXT: popq %rbx 662; SCALAR-NEXT: retq 663; 664; SSE2-LABEL: vec128_v8i8: 665; SSE2: # %bb.0: 666; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 667; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 668; SSE2-NEXT: pxor %xmm0, %xmm1 669; SSE2-NEXT: movq %xmm1, (%rsi) 670; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] 671; SSE2-NEXT: movdqa %xmm0, (%rdx) 672; SSE2-NEXT: retq 673; 674; AVX1-LABEL: vec128_v8i8: 675; AVX1: # %bb.0: 676; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 677; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 678; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 679; AVX1-NEXT: vmovq %xmm0, (%rsi) 680; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 681; AVX1-NEXT: vmovdqa %xmm0, (%rdx) 682; AVX1-NEXT: retq 683; 684; AVX2-ONLY-LABEL: vec128_v8i8: 685; AVX2-ONLY: # %bb.0: 686; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 687; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 688; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 689; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) 690; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0 691; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx) 692; AVX2-ONLY-NEXT: retq 693; 694; AVX512-LABEL: vec128_v8i8: 695; AVX512: # %bb.0: 696; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 697; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 698; AVX512-NEXT: vmovq %xmm0, (%rsi) 699; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 700; AVX512-NEXT: vmovdqa %xmm0, (%rdx) 701; AVX512-NEXT: retq 702 %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64 703 %in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> 704 store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 705 %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0 706 store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 707 %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1 708 store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8 709 ret void 710} 711 712define void @vec256_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 713; SCALAR-LABEL: vec256_v2i8: 714; SCALAR: # %bb.0: 715; SCALAR-NEXT: movzbl (%rdi), %eax 716; SCALAR-NEXT: movzbl 1(%rdi), %ecx 717; SCALAR-NEXT: notb %al 718; SCALAR-NEXT: notb %cl 719; SCALAR-NEXT: movb %cl, 1(%rsi) 720; SCALAR-NEXT: movb %al, (%rsi) 721; SCALAR-NEXT: movb %cl, 1(%rdx) 722; SCALAR-NEXT: movb %al, (%rdx) 723; SCALAR-NEXT: movb %cl, 3(%rdx) 724; SCALAR-NEXT: movb %al, 2(%rdx) 725; SCALAR-NEXT: movb %cl, 5(%rdx) 726; SCALAR-NEXT: movb %al, 4(%rdx) 727; SCALAR-NEXT: movb %cl, 7(%rdx) 728; 
SCALAR-NEXT: movb %al, 6(%rdx) 729; SCALAR-NEXT: movb %cl, 9(%rdx) 730; SCALAR-NEXT: movb %al, 8(%rdx) 731; SCALAR-NEXT: movb %cl, 11(%rdx) 732; SCALAR-NEXT: movb %al, 10(%rdx) 733; SCALAR-NEXT: movb %cl, 13(%rdx) 734; SCALAR-NEXT: movb %al, 12(%rdx) 735; SCALAR-NEXT: movb %cl, 15(%rdx) 736; SCALAR-NEXT: movb %al, 14(%rdx) 737; SCALAR-NEXT: movb %cl, 17(%rdx) 738; SCALAR-NEXT: movb %al, 16(%rdx) 739; SCALAR-NEXT: movb %cl, 19(%rdx) 740; SCALAR-NEXT: movb %al, 18(%rdx) 741; SCALAR-NEXT: movb %cl, 21(%rdx) 742; SCALAR-NEXT: movb %al, 20(%rdx) 743; SCALAR-NEXT: movb %cl, 23(%rdx) 744; SCALAR-NEXT: movb %al, 22(%rdx) 745; SCALAR-NEXT: movb %cl, 25(%rdx) 746; SCALAR-NEXT: movb %al, 24(%rdx) 747; SCALAR-NEXT: movb %cl, 27(%rdx) 748; SCALAR-NEXT: movb %al, 26(%rdx) 749; SCALAR-NEXT: movb %cl, 29(%rdx) 750; SCALAR-NEXT: movb %al, 28(%rdx) 751; SCALAR-NEXT: movb %cl, 31(%rdx) 752; SCALAR-NEXT: movb %al, 30(%rdx) 753; SCALAR-NEXT: retq 754; 755; SSE2-ONLY-LABEL: vec256_v2i8: 756; SSE2-ONLY: # %bb.0: 757; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 758; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0 759; SSE2-ONLY-NEXT: movd %xmm0, %eax 760; SSE2-ONLY-NEXT: movw %ax, (%rsi) 761; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 762; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 763; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx) 764; SSE2-ONLY-NEXT: movdqa %xmm0, 16(%rdx) 765; SSE2-ONLY-NEXT: retq 766; 767; SSE3-LABEL: vec256_v2i8: 768; SSE3: # %bb.0: 769; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 770; SSE3-NEXT: pxor (%rdi), %xmm0 771; SSE3-NEXT: movd %xmm0, %eax 772; SSE3-NEXT: movw %ax, (%rsi) 773; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 774; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 775; SSE3-NEXT: movdqa %xmm0, (%rdx) 776; SSE3-NEXT: movdqa %xmm0, 16(%rdx) 777; SSE3-NEXT: retq 778; 779; SSSE3-ONLY-LABEL: vec256_v2i8: 780; SSSE3-ONLY: # %bb.0: 781; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 782; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0 783; SSSE3-ONLY-NEXT: movd %xmm0, %eax 784; SSSE3-ONLY-NEXT: movw %ax, (%rsi) 785; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 786; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 787; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx) 788; SSSE3-ONLY-NEXT: movdqa %xmm0, 16(%rdx) 789; SSSE3-ONLY-NEXT: retq 790; 791; SSE41-LABEL: vec256_v2i8: 792; SSE41: # %bb.0: 793; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 794; SSE41-NEXT: pxor (%rdi), %xmm0 795; SSE41-NEXT: pextrw $0, %xmm0, (%rsi) 796; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 797; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 798; SSE41-NEXT: movdqa %xmm0, (%rdx) 799; SSE41-NEXT: movdqa %xmm0, 16(%rdx) 800; SSE41-NEXT: retq 801; 802; SSE42-LABEL: vec256_v2i8: 803; SSE42: # %bb.0: 804; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 805; SSE42-NEXT: pxor (%rdi), %xmm0 806; SSE42-NEXT: pextrw $0, %xmm0, (%rsi) 807; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 808; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 809; SSE42-NEXT: movdqa %xmm0, (%rdx) 810; SSE42-NEXT: movdqa %xmm0, 16(%rdx) 811; SSE42-NEXT: retq 812; 813; AVX1-LABEL: vec256_v2i8: 814; AVX1: # %bb.0: 815; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 816; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 817; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi) 818; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 819; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 820; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 821; AVX1-NEXT: vmovaps %ymm0, (%rdx) 822; AVX1-NEXT: vzeroupper 823; AVX1-NEXT: retq 824; 825; AVX2-LABEL: vec256_v2i8: 826; AVX2: # %bb.0: 
827; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 828; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 829; AVX2-NEXT: vpextrw $0, %xmm0, (%rsi) 830; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 831; AVX2-NEXT: vmovdqa %ymm0, (%rdx) 832; AVX2-NEXT: vzeroupper 833; AVX2-NEXT: retq 834 %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64 835 %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1> 836 store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 837 %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0 838 store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 839 %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1 840 store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 841 %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2 842 store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4 843 %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3 844 store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2 845 %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4 846 store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8 847 %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5 848 store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2 849 %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6 850 store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4 851 %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7 852 store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2 853 %out.subvec8.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 8 854 store <2 x i8> %in.subvec, ptr %out.subvec8.ptr, align 16 855 %out.subvec9.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 9 856 store <2 x i8> %in.subvec, ptr %out.subvec9.ptr, align 2 857 %out.subvec10.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 10 858 store <2 x i8> %in.subvec, ptr %out.subvec10.ptr, align 4 859 %out.subvec11.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 11 860 store <2 x i8> %in.subvec, ptr %out.subvec11.ptr, align 2 861 %out.subvec12.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 12 862 store <2 x i8> %in.subvec, ptr %out.subvec12.ptr, align 8 863 %out.subvec13.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 13 864 store <2 x i8> %in.subvec, ptr %out.subvec13.ptr, align 2 865 %out.subvec14.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 14 866 store <2 x i8> %in.subvec, ptr %out.subvec14.ptr, align 4 867 %out.subvec15.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 15 868 store <2 x i8> %in.subvec, ptr %out.subvec15.ptr, align 2 869 ret void 870} 871 872define void @vec256_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 873; SCALAR-LABEL: vec256_v2i16: 874; SCALAR: # %bb.0: 875; SCALAR-NEXT: movzwl 2(%rdi), %eax 876; SCALAR-NEXT: movl (%rdi), %ecx 877; SCALAR-NEXT: notl %ecx 878; SCALAR-NEXT: notl %eax 879; SCALAR-NEXT: movw %ax, 2(%rsi) 880; SCALAR-NEXT: movw %cx, (%rsi) 881; SCALAR-NEXT: movw %ax, 2(%rdx) 882; SCALAR-NEXT: movw %cx, (%rdx) 883; SCALAR-NEXT: movw %ax, 6(%rdx) 884; SCALAR-NEXT: movw %cx, 4(%rdx) 885; SCALAR-NEXT: movw %ax, 10(%rdx) 886; SCALAR-NEXT: movw %cx, 8(%rdx) 887; SCALAR-NEXT: movw %ax, 14(%rdx) 888; SCALAR-NEXT: movw %cx, 12(%rdx) 889; SCALAR-NEXT: movw %ax, 18(%rdx) 890; SCALAR-NEXT: movw %cx, 16(%rdx) 891; SCALAR-NEXT: movw %ax, 22(%rdx) 892; SCALAR-NEXT: movw %cx, 20(%rdx) 893; SCALAR-NEXT: movw %ax, 26(%rdx) 894; SCALAR-NEXT: movw %cx, 24(%rdx) 895; SCALAR-NEXT: movw %ax, 30(%rdx) 896; SCALAR-NEXT: movw %cx, 28(%rdx) 897; 
SCALAR-NEXT: retq 898; 899; SSE2-LABEL: vec256_v2i16: 900; SSE2: # %bb.0: 901; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 902; SSE2-NEXT: pxor (%rdi), %xmm0 903; SSE2-NEXT: movd %xmm0, (%rsi) 904; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 905; SSE2-NEXT: movdqa %xmm0, (%rdx) 906; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 907; SSE2-NEXT: retq 908; 909; AVX1-LABEL: vec256_v2i16: 910; AVX1: # %bb.0: 911; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 912; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 913; AVX1-NEXT: vmovd %xmm0, (%rsi) 914; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 915; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx) 916; AVX1-NEXT: vmovdqa %xmm0, (%rdx) 917; AVX1-NEXT: retq 918; 919; AVX2-LABEL: vec256_v2i16: 920; AVX2: # %bb.0: 921; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 922; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 923; AVX2-NEXT: vmovd %xmm0, (%rsi) 924; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 925; AVX2-NEXT: vmovdqa %ymm0, (%rdx) 926; AVX2-NEXT: vzeroupper 927; AVX2-NEXT: retq 928 %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64 929 %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1> 930 store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 931 %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0 932 store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 933 %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1 934 store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4 935 %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2 936 store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8 937 %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3 938 store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4 939 %out.subvec4.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 4 940 store <2 x i16> %in.subvec, ptr %out.subvec4.ptr, align 16 941 %out.subvec5.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 5 942 store <2 x i16> %in.subvec, ptr %out.subvec5.ptr, align 4 943 %out.subvec6.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 6 944 store <2 x i16> %in.subvec, ptr %out.subvec6.ptr, align 8 945 %out.subvec7.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 7 946 store <2 x i16> %in.subvec, ptr %out.subvec7.ptr, align 4 947 ret void 948} 949 950define void @vec256_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 951; SCALAR-LABEL: vec256_v2i32: 952; SCALAR: # %bb.0: 953; SCALAR-NEXT: movl (%rdi), %eax 954; SCALAR-NEXT: movl 4(%rdi), %ecx 955; SCALAR-NEXT: notl %eax 956; SCALAR-NEXT: notl %ecx 957; SCALAR-NEXT: movl %ecx, 4(%rsi) 958; SCALAR-NEXT: movl %eax, (%rsi) 959; SCALAR-NEXT: movl %ecx, 4(%rdx) 960; SCALAR-NEXT: movl %eax, (%rdx) 961; SCALAR-NEXT: movl %ecx, 12(%rdx) 962; SCALAR-NEXT: movl %eax, 8(%rdx) 963; SCALAR-NEXT: movl %ecx, 20(%rdx) 964; SCALAR-NEXT: movl %eax, 16(%rdx) 965; SCALAR-NEXT: movl %ecx, 28(%rdx) 966; SCALAR-NEXT: movl %eax, 24(%rdx) 967; SCALAR-NEXT: retq 968; 969; SSE2-LABEL: vec256_v2i32: 970; SSE2: # %bb.0: 971; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 972; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 973; SSE2-NEXT: pxor %xmm0, %xmm1 974; SSE2-NEXT: movq %xmm1, (%rsi) 975; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] 976; SSE2-NEXT: movdqa %xmm0, (%rdx) 977; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 978; SSE2-NEXT: retq 979; 980; AVX1-LABEL: vec256_v2i32: 981; AVX1: # %bb.0: 982; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 983; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 984; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 985; AVX1-NEXT: vmovq %xmm0, (%rsi) 986; AVX1-NEXT: vpshufd 
{{.*#+}} xmm0 = xmm0[0,1,0,1] 987; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 988; AVX1-NEXT: vmovaps %ymm0, (%rdx) 989; AVX1-NEXT: vzeroupper 990; AVX1-NEXT: retq 991; 992; AVX2-ONLY-LABEL: vec256_v2i32: 993; AVX2-ONLY: # %bb.0: 994; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 995; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 996; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 997; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) 998; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 999; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) 1000; AVX2-ONLY-NEXT: vzeroupper 1001; AVX2-ONLY-NEXT: retq 1002; 1003; AVX512-LABEL: vec256_v2i32: 1004; AVX512: # %bb.0: 1005; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1006; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 1007; AVX512-NEXT: vmovq %xmm0, (%rsi) 1008; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 1009; AVX512-NEXT: vmovdqa %ymm0, (%rdx) 1010; AVX512-NEXT: vzeroupper 1011; AVX512-NEXT: retq 1012 %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 1013 %in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1> 1014 store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 1015 %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0 1016 store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 1017 %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1 1018 store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8 1019 %out.subvec2.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 2 1020 store <2 x i32> %in.subvec, ptr %out.subvec2.ptr, align 16 1021 %out.subvec3.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 3 1022 store <2 x i32> %in.subvec, ptr %out.subvec3.ptr, align 8 1023 ret void 1024} 1025 1026define void @vec256_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 1027; SCALAR-LABEL: vec256_v2f32: 1028; SCALAR: # %bb.0: 1029; SCALAR-NEXT: movl (%rdi), %eax 1030; SCALAR-NEXT: movl 4(%rdi), %ecx 1031; SCALAR-NEXT: notl %eax 1032; SCALAR-NEXT: notl %ecx 1033; SCALAR-NEXT: movl %ecx, 4(%rsi) 1034; SCALAR-NEXT: movl %eax, (%rsi) 1035; SCALAR-NEXT: movl %ecx, 4(%rdx) 1036; SCALAR-NEXT: movl %eax, (%rdx) 1037; SCALAR-NEXT: movl %ecx, 12(%rdx) 1038; SCALAR-NEXT: movl %eax, 8(%rdx) 1039; SCALAR-NEXT: movl %ecx, 20(%rdx) 1040; SCALAR-NEXT: movl %eax, 16(%rdx) 1041; SCALAR-NEXT: movl %ecx, 28(%rdx) 1042; SCALAR-NEXT: movl %eax, 24(%rdx) 1043; SCALAR-NEXT: retq 1044; 1045; SSE2-LABEL: vec256_v2f32: 1046; SSE2: # %bb.0: 1047; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 1048; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 1049; SSE2-NEXT: pxor %xmm0, %xmm1 1050; SSE2-NEXT: movq %xmm1, (%rsi) 1051; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] 1052; SSE2-NEXT: movdqa %xmm0, (%rdx) 1053; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 1054; SSE2-NEXT: retq 1055; 1056; AVX1-LABEL: vec256_v2f32: 1057; AVX1: # %bb.0: 1058; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1059; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1060; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 1061; AVX1-NEXT: vmovq %xmm0, (%rsi) 1062; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1063; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1064; AVX1-NEXT: vmovaps %ymm0, (%rdx) 1065; AVX1-NEXT: vzeroupper 1066; AVX1-NEXT: retq 1067; 1068; AVX2-ONLY-LABEL: vec256_v2f32: 1069; AVX2-ONLY: # %bb.0: 1070; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1071; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1072; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 1073; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) 1074; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 1075; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) 
1076; AVX2-ONLY-NEXT: vzeroupper 1077; AVX2-ONLY-NEXT: retq 1078; 1079; AVX512-LABEL: vec256_v2f32: 1080; AVX512: # %bb.0: 1081; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1082; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 1083; AVX512-NEXT: vmovq %xmm0, (%rsi) 1084; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 1085; AVX512-NEXT: vmovdqa %ymm0, (%rdx) 1086; AVX512-NEXT: vzeroupper 1087; AVX512-NEXT: retq 1088 %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 1089 %in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1> 1090 %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> 1091 store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64 1092 %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0 1093 store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 1094 %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1 1095 store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8 1096 %out.subvec2.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 2 1097 store <2 x float> %in.subvec, ptr %out.subvec2.ptr, align 16 1098 %out.subvec3.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 3 1099 store <2 x float> %in.subvec, ptr %out.subvec3.ptr, align 8 1100 ret void 1101} 1102 1103define void @vec256_v2i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 1104; SCALAR-LABEL: vec256_v2i64: 1105; SCALAR: # %bb.0: 1106; SCALAR-NEXT: movq (%rdi), %rax 1107; SCALAR-NEXT: movq 8(%rdi), %rcx 1108; SCALAR-NEXT: notq %rax 1109; SCALAR-NEXT: notq %rcx 1110; SCALAR-NEXT: movq %rcx, 8(%rsi) 1111; SCALAR-NEXT: movq %rax, (%rsi) 1112; SCALAR-NEXT: movq %rcx, 8(%rdx) 1113; SCALAR-NEXT: movq %rax, (%rdx) 1114; SCALAR-NEXT: movq %rcx, 24(%rdx) 1115; SCALAR-NEXT: movq %rax, 16(%rdx) 1116; SCALAR-NEXT: retq 1117; 1118; SSE2-LABEL: vec256_v2i64: 1119; SSE2: # %bb.0: 1120; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 1121; SSE2-NEXT: pxor (%rdi), %xmm0 1122; SSE2-NEXT: movdqa %xmm0, (%rsi) 1123; SSE2-NEXT: movdqa %xmm0, (%rdx) 1124; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 1125; SSE2-NEXT: retq 1126; 1127; AVX-LABEL: vec256_v2i64: 1128; AVX: # %bb.0: 1129; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1130; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 1131; AVX-NEXT: vmovdqa %xmm0, (%rsi) 1132; AVX-NEXT: vmovdqa %xmm0, (%rdx) 1133; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 1134; AVX-NEXT: retq 1135 %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64 1136 %in.subvec = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1> 1137 store <2 x i64> %in.subvec, ptr %out.subvec.ptr, align 64 1138 %out.subvec0.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 0 1139 store <2 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64 1140 %out.subvec1.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 1 1141 store <2 x i64> %in.subvec, ptr %out.subvec1.ptr, align 16 1142 ret void 1143} 1144 1145define void @vec256_v2f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 1146; SCALAR-LABEL: vec256_v2f64: 1147; SCALAR: # %bb.0: 1148; SCALAR-NEXT: movq (%rdi), %rax 1149; SCALAR-NEXT: movq 8(%rdi), %rcx 1150; SCALAR-NEXT: notq %rax 1151; SCALAR-NEXT: notq %rcx 1152; SCALAR-NEXT: movq %rcx, 8(%rsi) 1153; SCALAR-NEXT: movq %rax, (%rsi) 1154; SCALAR-NEXT: movq %rcx, 8(%rdx) 1155; SCALAR-NEXT: movq %rax, (%rdx) 1156; SCALAR-NEXT: movq %rcx, 24(%rdx) 1157; SCALAR-NEXT: movq %rax, 16(%rdx) 1158; SCALAR-NEXT: retq 1159; 1160; SSE2-LABEL: vec256_v2f64: 1161; SSE2: # %bb.0: 1162; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 1163; SSE2-NEXT: pxor (%rdi), %xmm0 1164; SSE2-NEXT: movdqa 
%xmm0, (%rsi) 1165; SSE2-NEXT: movdqa %xmm0, (%rdx) 1166; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 1167; SSE2-NEXT: retq 1168; 1169; AVX-LABEL: vec256_v2f64: 1170; AVX: # %bb.0: 1171; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1172; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 1173; AVX-NEXT: vmovdqa %xmm0, (%rsi) 1174; AVX-NEXT: vmovdqa %xmm0, (%rdx) 1175; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 1176; AVX-NEXT: retq 1177 %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64 1178 %in.subvec.int = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1> 1179 %in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double> 1180 store <2 x double> %in.subvec, ptr %out.subvec.ptr, align 64 1181 %out.subvec0.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 0 1182 store <2 x double> %in.subvec, ptr %out.subvec0.ptr, align 64 1183 %out.subvec1.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 1 1184 store <2 x double> %in.subvec, ptr %out.subvec1.ptr, align 16 1185 ret void 1186} 1187 1188define void @vec256_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 1189; SCALAR-LABEL: vec256_v4i8: 1190; SCALAR: # %bb.0: 1191; SCALAR-NEXT: movzbl 3(%rdi), %r8d 1192; SCALAR-NEXT: movzbl 2(%rdi), %ecx 1193; SCALAR-NEXT: movzbl (%rdi), %eax 1194; SCALAR-NEXT: movzbl 1(%rdi), %edi 1195; SCALAR-NEXT: notb %al 1196; SCALAR-NEXT: notb %dil 1197; SCALAR-NEXT: notb %cl 1198; SCALAR-NEXT: notb %r8b 1199; SCALAR-NEXT: movb %r8b, 3(%rsi) 1200; SCALAR-NEXT: movb %cl, 2(%rsi) 1201; SCALAR-NEXT: movb %dil, 1(%rsi) 1202; SCALAR-NEXT: movb %al, (%rsi) 1203; SCALAR-NEXT: movb %r8b, 3(%rdx) 1204; SCALAR-NEXT: movb %cl, 2(%rdx) 1205; SCALAR-NEXT: movb %dil, 1(%rdx) 1206; SCALAR-NEXT: movb %al, (%rdx) 1207; SCALAR-NEXT: movb %r8b, 7(%rdx) 1208; SCALAR-NEXT: movb %cl, 6(%rdx) 1209; SCALAR-NEXT: movb %dil, 5(%rdx) 1210; SCALAR-NEXT: movb %al, 4(%rdx) 1211; SCALAR-NEXT: movb %r8b, 11(%rdx) 1212; SCALAR-NEXT: movb %cl, 10(%rdx) 1213; SCALAR-NEXT: movb %dil, 9(%rdx) 1214; SCALAR-NEXT: movb %al, 8(%rdx) 1215; SCALAR-NEXT: movb %r8b, 15(%rdx) 1216; SCALAR-NEXT: movb %cl, 14(%rdx) 1217; SCALAR-NEXT: movb %dil, 13(%rdx) 1218; SCALAR-NEXT: movb %al, 12(%rdx) 1219; SCALAR-NEXT: movb %r8b, 19(%rdx) 1220; SCALAR-NEXT: movb %cl, 18(%rdx) 1221; SCALAR-NEXT: movb %dil, 17(%rdx) 1222; SCALAR-NEXT: movb %al, 16(%rdx) 1223; SCALAR-NEXT: movb %r8b, 23(%rdx) 1224; SCALAR-NEXT: movb %cl, 22(%rdx) 1225; SCALAR-NEXT: movb %dil, 21(%rdx) 1226; SCALAR-NEXT: movb %al, 20(%rdx) 1227; SCALAR-NEXT: movb %r8b, 27(%rdx) 1228; SCALAR-NEXT: movb %cl, 26(%rdx) 1229; SCALAR-NEXT: movb %dil, 25(%rdx) 1230; SCALAR-NEXT: movb %al, 24(%rdx) 1231; SCALAR-NEXT: movb %r8b, 31(%rdx) 1232; SCALAR-NEXT: movb %cl, 30(%rdx) 1233; SCALAR-NEXT: movb %dil, 29(%rdx) 1234; SCALAR-NEXT: movb %al, 28(%rdx) 1235; SCALAR-NEXT: retq 1236; 1237; SSE2-LABEL: vec256_v4i8: 1238; SSE2: # %bb.0: 1239; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 1240; SSE2-NEXT: pxor (%rdi), %xmm0 1241; SSE2-NEXT: movd %xmm0, (%rsi) 1242; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1243; SSE2-NEXT: movdqa %xmm0, (%rdx) 1244; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 1245; SSE2-NEXT: retq 1246; 1247; AVX1-LABEL: vec256_v4i8: 1248; AVX1: # %bb.0: 1249; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1250; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 1251; AVX1-NEXT: vmovd %xmm0, (%rsi) 1252; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1253; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx) 1254; AVX1-NEXT: vmovdqa %xmm0, (%rdx) 1255; AVX1-NEXT: retq 1256; 1257; AVX2-LABEL: vec256_v4i8: 1258; AVX2: # %bb.0: 1259; AVX2-NEXT: vpcmpeqd %xmm0, 
%xmm0, %xmm0 1260; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 1261; AVX2-NEXT: vmovd %xmm0, (%rsi) 1262; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 1263; AVX2-NEXT: vmovdqa %ymm0, (%rdx) 1264; AVX2-NEXT: vzeroupper 1265; AVX2-NEXT: retq 1266 %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64 1267 %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1> 1268 store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 1269 %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0 1270 store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 1271 %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1 1272 store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4 1273 %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2 1274 store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8 1275 %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3 1276 store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4 1277 %out.subvec4.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 4 1278 store <4 x i8> %in.subvec, ptr %out.subvec4.ptr, align 16 1279 %out.subvec5.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 5 1280 store <4 x i8> %in.subvec, ptr %out.subvec5.ptr, align 4 1281 %out.subvec6.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 6 1282 store <4 x i8> %in.subvec, ptr %out.subvec6.ptr, align 8 1283 %out.subvec7.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 7 1284 store <4 x i8> %in.subvec, ptr %out.subvec7.ptr, align 4 1285 ret void 1286} 1287 1288define void @vec256_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 1289; SCALAR-LABEL: vec256_v4i16: 1290; SCALAR: # %bb.0: 1291; SCALAR-NEXT: movzwl 6(%rdi), %r8d 1292; SCALAR-NEXT: movzwl 2(%rdi), %ecx 1293; SCALAR-NEXT: movl (%rdi), %eax 1294; SCALAR-NEXT: movl 4(%rdi), %edi 1295; SCALAR-NEXT: notl %eax 1296; SCALAR-NEXT: notl %ecx 1297; SCALAR-NEXT: notl %edi 1298; SCALAR-NEXT: notl %r8d 1299; SCALAR-NEXT: movw %r8w, 6(%rsi) 1300; SCALAR-NEXT: movw %di, 4(%rsi) 1301; SCALAR-NEXT: movw %cx, 2(%rsi) 1302; SCALAR-NEXT: movw %ax, (%rsi) 1303; SCALAR-NEXT: movw %r8w, 6(%rdx) 1304; SCALAR-NEXT: movw %di, 4(%rdx) 1305; SCALAR-NEXT: movw %cx, 2(%rdx) 1306; SCALAR-NEXT: movw %ax, (%rdx) 1307; SCALAR-NEXT: movw %r8w, 14(%rdx) 1308; SCALAR-NEXT: movw %di, 12(%rdx) 1309; SCALAR-NEXT: movw %cx, 10(%rdx) 1310; SCALAR-NEXT: movw %ax, 8(%rdx) 1311; SCALAR-NEXT: movw %r8w, 22(%rdx) 1312; SCALAR-NEXT: movw %di, 20(%rdx) 1313; SCALAR-NEXT: movw %cx, 18(%rdx) 1314; SCALAR-NEXT: movw %ax, 16(%rdx) 1315; SCALAR-NEXT: movw %r8w, 30(%rdx) 1316; SCALAR-NEXT: movw %di, 28(%rdx) 1317; SCALAR-NEXT: movw %cx, 26(%rdx) 1318; SCALAR-NEXT: movw %ax, 24(%rdx) 1319; SCALAR-NEXT: retq 1320; 1321; SSE2-LABEL: vec256_v4i16: 1322; SSE2: # %bb.0: 1323; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 1324; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 1325; SSE2-NEXT: pxor %xmm0, %xmm1 1326; SSE2-NEXT: movq %xmm1, (%rsi) 1327; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] 1328; SSE2-NEXT: movdqa %xmm0, (%rdx) 1329; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 1330; SSE2-NEXT: retq 1331; 1332; AVX1-LABEL: vec256_v4i16: 1333; AVX1: # %bb.0: 1334; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1335; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1336; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 1337; AVX1-NEXT: vmovq %xmm0, (%rsi) 1338; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1339; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1340; AVX1-NEXT: vmovaps %ymm0, (%rdx) 1341; AVX1-NEXT: vzeroupper 1342; AVX1-NEXT: retq 1343; 1344; 
AVX2-ONLY-LABEL: vec256_v4i16: 1345; AVX2-ONLY: # %bb.0: 1346; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1347; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1348; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 1349; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) 1350; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 1351; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) 1352; AVX2-ONLY-NEXT: vzeroupper 1353; AVX2-ONLY-NEXT: retq 1354; 1355; AVX512-LABEL: vec256_v4i16: 1356; AVX512: # %bb.0: 1357; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1358; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 1359; AVX512-NEXT: vmovq %xmm0, (%rsi) 1360; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 1361; AVX512-NEXT: vmovdqa %ymm0, (%rdx) 1362; AVX512-NEXT: vzeroupper 1363; AVX512-NEXT: retq 1364 %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64 1365 %in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1> 1366 store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 1367 %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0 1368 store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 1369 %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1 1370 store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8 1371 %out.subvec2.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 2 1372 store <4 x i16> %in.subvec, ptr %out.subvec2.ptr, align 16 1373 %out.subvec3.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 3 1374 store <4 x i16> %in.subvec, ptr %out.subvec3.ptr, align 8 1375 ret void 1376} 1377 1378define void @vec256_v4i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 1379; SCALAR-LABEL: vec256_v4i32: 1380; SCALAR: # %bb.0: 1381; SCALAR-NEXT: movaps (%rdi), %xmm0 1382; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1383; SCALAR-NEXT: movaps %xmm0, (%rsi) 1384; SCALAR-NEXT: movaps %xmm0, (%rdx) 1385; SCALAR-NEXT: movaps %xmm0, 16(%rdx) 1386; SCALAR-NEXT: retq 1387; 1388; SSE2-LABEL: vec256_v4i32: 1389; SSE2: # %bb.0: 1390; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 1391; SSE2-NEXT: pxor (%rdi), %xmm0 1392; SSE2-NEXT: movdqa %xmm0, (%rsi) 1393; SSE2-NEXT: movdqa %xmm0, (%rdx) 1394; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 1395; SSE2-NEXT: retq 1396; 1397; AVX-LABEL: vec256_v4i32: 1398; AVX: # %bb.0: 1399; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1400; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 1401; AVX-NEXT: vmovdqa %xmm0, (%rsi) 1402; AVX-NEXT: vmovdqa %xmm0, (%rdx) 1403; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 1404; AVX-NEXT: retq 1405 %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64 1406 %in.subvec = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1> 1407 store <4 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 1408 %out.subvec0.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 0 1409 store <4 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 1410 %out.subvec1.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 1 1411 store <4 x i32> %in.subvec, ptr %out.subvec1.ptr, align 16 1412 ret void 1413} 1414 1415define void @vec256_v4f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 1416; SCALAR-LABEL: vec256_v4f32: 1417; SCALAR: # %bb.0: 1418; SCALAR-NEXT: movaps (%rdi), %xmm0 1419; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1420; SCALAR-NEXT: movaps %xmm0, (%rsi) 1421; SCALAR-NEXT: movaps %xmm0, (%rdx) 1422; SCALAR-NEXT: movaps %xmm0, 16(%rdx) 1423; SCALAR-NEXT: retq 1424; 1425; SSE2-LABEL: vec256_v4f32: 1426; SSE2: # %bb.0: 1427; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 1428; SSE2-NEXT: pxor (%rdi), %xmm0 1429; 
SSE2-NEXT: movdqa %xmm0, (%rsi) 1430; SSE2-NEXT: movdqa %xmm0, (%rdx) 1431; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 1432; SSE2-NEXT: retq 1433; 1434; AVX-LABEL: vec256_v4f32: 1435; AVX: # %bb.0: 1436; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1437; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 1438; AVX-NEXT: vmovdqa %xmm0, (%rsi) 1439; AVX-NEXT: vmovdqa %xmm0, (%rdx) 1440; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 1441; AVX-NEXT: retq 1442 %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64 1443 %in.subvec.int = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1> 1444 %in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float> 1445 store <4 x float> %in.subvec, ptr %out.subvec.ptr, align 64 1446 %out.subvec0.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 0 1447 store <4 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 1448 %out.subvec1.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 1 1449 store <4 x float> %in.subvec, ptr %out.subvec1.ptr, align 16 1450 ret void 1451} 1452 1453define void @vec256_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 1454; SCALAR-LABEL: vec256_v8i8: 1455; SCALAR: # %bb.0: 1456; SCALAR-NEXT: pushq %rbx 1457; SCALAR-NEXT: movzbl 7(%rdi), %ebx 1458; SCALAR-NEXT: movzbl 6(%rdi), %r11d 1459; SCALAR-NEXT: movzbl 5(%rdi), %r10d 1460; SCALAR-NEXT: movzbl 4(%rdi), %r9d 1461; SCALAR-NEXT: movzbl 3(%rdi), %r8d 1462; SCALAR-NEXT: movzbl 2(%rdi), %ecx 1463; SCALAR-NEXT: movzbl (%rdi), %eax 1464; SCALAR-NEXT: movzbl 1(%rdi), %edi 1465; SCALAR-NEXT: notb %al 1466; SCALAR-NEXT: notb %dil 1467; SCALAR-NEXT: notb %cl 1468; SCALAR-NEXT: notb %r8b 1469; SCALAR-NEXT: notb %r9b 1470; SCALAR-NEXT: notb %r10b 1471; SCALAR-NEXT: notb %r11b 1472; SCALAR-NEXT: notb %bl 1473; SCALAR-NEXT: movb %bl, 7(%rsi) 1474; SCALAR-NEXT: movb %r11b, 6(%rsi) 1475; SCALAR-NEXT: movb %r10b, 5(%rsi) 1476; SCALAR-NEXT: movb %r9b, 4(%rsi) 1477; SCALAR-NEXT: movb %r8b, 3(%rsi) 1478; SCALAR-NEXT: movb %cl, 2(%rsi) 1479; SCALAR-NEXT: movb %dil, 1(%rsi) 1480; SCALAR-NEXT: movb %al, (%rsi) 1481; SCALAR-NEXT: movb %bl, 7(%rdx) 1482; SCALAR-NEXT: movb %r11b, 6(%rdx) 1483; SCALAR-NEXT: movb %r10b, 5(%rdx) 1484; SCALAR-NEXT: movb %r9b, 4(%rdx) 1485; SCALAR-NEXT: movb %r8b, 3(%rdx) 1486; SCALAR-NEXT: movb %cl, 2(%rdx) 1487; SCALAR-NEXT: movb %dil, 1(%rdx) 1488; SCALAR-NEXT: movb %al, (%rdx) 1489; SCALAR-NEXT: movb %bl, 15(%rdx) 1490; SCALAR-NEXT: movb %r11b, 14(%rdx) 1491; SCALAR-NEXT: movb %r10b, 13(%rdx) 1492; SCALAR-NEXT: movb %r9b, 12(%rdx) 1493; SCALAR-NEXT: movb %r8b, 11(%rdx) 1494; SCALAR-NEXT: movb %cl, 10(%rdx) 1495; SCALAR-NEXT: movb %dil, 9(%rdx) 1496; SCALAR-NEXT: movb %al, 8(%rdx) 1497; SCALAR-NEXT: movb %bl, 23(%rdx) 1498; SCALAR-NEXT: movb %r11b, 22(%rdx) 1499; SCALAR-NEXT: movb %r10b, 21(%rdx) 1500; SCALAR-NEXT: movb %r9b, 20(%rdx) 1501; SCALAR-NEXT: movb %r8b, 19(%rdx) 1502; SCALAR-NEXT: movb %cl, 18(%rdx) 1503; SCALAR-NEXT: movb %dil, 17(%rdx) 1504; SCALAR-NEXT: movb %al, 16(%rdx) 1505; SCALAR-NEXT: movb %bl, 31(%rdx) 1506; SCALAR-NEXT: movb %r11b, 30(%rdx) 1507; SCALAR-NEXT: movb %r10b, 29(%rdx) 1508; SCALAR-NEXT: movb %r9b, 28(%rdx) 1509; SCALAR-NEXT: movb %r8b, 27(%rdx) 1510; SCALAR-NEXT: movb %cl, 26(%rdx) 1511; SCALAR-NEXT: movb %dil, 25(%rdx) 1512; SCALAR-NEXT: movb %al, 24(%rdx) 1513; SCALAR-NEXT: popq %rbx 1514; SCALAR-NEXT: retq 1515; 1516; SSE2-LABEL: vec256_v8i8: 1517; SSE2: # %bb.0: 1518; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 1519; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 1520; SSE2-NEXT: pxor %xmm0, %xmm1 1521; SSE2-NEXT: movq %xmm1, (%rsi) 1522; 
SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] 1523; SSE2-NEXT: movdqa %xmm0, (%rdx) 1524; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 1525; SSE2-NEXT: retq 1526; 1527; AVX1-LABEL: vec256_v8i8: 1528; AVX1: # %bb.0: 1529; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1530; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1531; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 1532; AVX1-NEXT: vmovq %xmm0, (%rsi) 1533; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1534; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1535; AVX1-NEXT: vmovaps %ymm0, (%rdx) 1536; AVX1-NEXT: vzeroupper 1537; AVX1-NEXT: retq 1538; 1539; AVX2-ONLY-LABEL: vec256_v8i8: 1540; AVX2-ONLY: # %bb.0: 1541; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1542; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1543; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 1544; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) 1545; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 1546; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) 1547; AVX2-ONLY-NEXT: vzeroupper 1548; AVX2-ONLY-NEXT: retq 1549; 1550; AVX512-LABEL: vec256_v8i8: 1551; AVX512: # %bb.0: 1552; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1553; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 1554; AVX512-NEXT: vmovq %xmm0, (%rsi) 1555; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 1556; AVX512-NEXT: vmovdqa %ymm0, (%rdx) 1557; AVX512-NEXT: vzeroupper 1558; AVX512-NEXT: retq 1559 %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64 1560 %in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> 1561 store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 1562 %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0 1563 store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 1564 %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1 1565 store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8 1566 %out.subvec2.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 2 1567 store <8 x i8> %in.subvec, ptr %out.subvec2.ptr, align 16 1568 %out.subvec3.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 3 1569 store <8 x i8> %in.subvec, ptr %out.subvec3.ptr, align 8 1570 ret void 1571} 1572 1573define void @vec256_v8i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 1574; SCALAR-LABEL: vec256_v8i16: 1575; SCALAR: # %bb.0: 1576; SCALAR-NEXT: pushq %rbx 1577; SCALAR-NEXT: movzwl 14(%rdi), %ebx 1578; SCALAR-NEXT: movl 12(%rdi), %r11d 1579; SCALAR-NEXT: movzwl 10(%rdi), %r10d 1580; SCALAR-NEXT: movl 8(%rdi), %r9d 1581; SCALAR-NEXT: movzwl 6(%rdi), %r8d 1582; SCALAR-NEXT: movzwl 2(%rdi), %ecx 1583; SCALAR-NEXT: movl (%rdi), %eax 1584; SCALAR-NEXT: movl 4(%rdi), %edi 1585; SCALAR-NEXT: notl %eax 1586; SCALAR-NEXT: notl %ecx 1587; SCALAR-NEXT: notl %edi 1588; SCALAR-NEXT: notl %r8d 1589; SCALAR-NEXT: notl %r9d 1590; SCALAR-NEXT: notl %r10d 1591; SCALAR-NEXT: notl %r11d 1592; SCALAR-NEXT: notl %ebx 1593; SCALAR-NEXT: movw %bx, 14(%rsi) 1594; SCALAR-NEXT: movw %r11w, 12(%rsi) 1595; SCALAR-NEXT: movw %r10w, 10(%rsi) 1596; SCALAR-NEXT: movw %r9w, 8(%rsi) 1597; SCALAR-NEXT: movw %r8w, 6(%rsi) 1598; SCALAR-NEXT: movw %di, 4(%rsi) 1599; SCALAR-NEXT: movw %cx, 2(%rsi) 1600; SCALAR-NEXT: movw %ax, (%rsi) 1601; SCALAR-NEXT: movw %bx, 14(%rdx) 1602; SCALAR-NEXT: movw %r11w, 12(%rdx) 1603; SCALAR-NEXT: movw %r10w, 10(%rdx) 1604; SCALAR-NEXT: movw %r9w, 8(%rdx) 1605; SCALAR-NEXT: movw %r8w, 6(%rdx) 1606; SCALAR-NEXT: movw %di, 4(%rdx) 1607; SCALAR-NEXT: movw %cx, 2(%rdx) 1608; SCALAR-NEXT: movw %ax, (%rdx) 1609; SCALAR-NEXT: movw %bx, 30(%rdx) 1610; SCALAR-NEXT: movw %r11w, 
28(%rdx) 1611; SCALAR-NEXT: movw %r10w, 26(%rdx) 1612; SCALAR-NEXT: movw %r9w, 24(%rdx) 1613; SCALAR-NEXT: movw %r8w, 22(%rdx) 1614; SCALAR-NEXT: movw %di, 20(%rdx) 1615; SCALAR-NEXT: movw %cx, 18(%rdx) 1616; SCALAR-NEXT: movw %ax, 16(%rdx) 1617; SCALAR-NEXT: popq %rbx 1618; SCALAR-NEXT: retq 1619; 1620; SSE2-LABEL: vec256_v8i16: 1621; SSE2: # %bb.0: 1622; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 1623; SSE2-NEXT: pxor (%rdi), %xmm0 1624; SSE2-NEXT: movdqa %xmm0, (%rsi) 1625; SSE2-NEXT: movdqa %xmm0, (%rdx) 1626; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 1627; SSE2-NEXT: retq 1628; 1629; AVX-LABEL: vec256_v8i16: 1630; AVX: # %bb.0: 1631; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1632; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 1633; AVX-NEXT: vmovdqa %xmm0, (%rsi) 1634; AVX-NEXT: vmovdqa %xmm0, (%rdx) 1635; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 1636; AVX-NEXT: retq 1637 %in.subvec.not = load <8 x i16>, ptr %in.subvec.ptr, align 64 1638 %in.subvec = xor <8 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> 1639 store <8 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 1640 %out.subvec0.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 0 1641 store <8 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 1642 %out.subvec1.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 1 1643 store <8 x i16> %in.subvec, ptr %out.subvec1.ptr, align 16 1644 ret void 1645} 1646 1647define void @vec256_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 1648; SCALAR-LABEL: vec256_v16i8: 1649; SCALAR: # %bb.0: 1650; SCALAR-NEXT: pushq %rbp 1651; SCALAR-NEXT: pushq %r15 1652; SCALAR-NEXT: pushq %r14 1653; SCALAR-NEXT: pushq %r13 1654; SCALAR-NEXT: pushq %r12 1655; SCALAR-NEXT: pushq %rbx 1656; SCALAR-NEXT: movzbl 15(%rdi), %eax 1657; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 1658; SCALAR-NEXT: movzbl 14(%rdi), %eax 1659; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 1660; SCALAR-NEXT: movzbl 13(%rdi), %eax 1661; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 1662; SCALAR-NEXT: movzbl 12(%rdi), %r15d 1663; SCALAR-NEXT: movzbl 11(%rdi), %eax 1664; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 1665; SCALAR-NEXT: movzbl 10(%rdi), %ebp 1666; SCALAR-NEXT: movzbl 9(%rdi), %r14d 1667; SCALAR-NEXT: movzbl 8(%rdi), %eax 1668; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 1669; SCALAR-NEXT: movzbl 7(%rdi), %r12d 1670; SCALAR-NEXT: movzbl 6(%rdi), %r10d 1671; SCALAR-NEXT: movzbl 5(%rdi), %r9d 1672; SCALAR-NEXT: movzbl 4(%rdi), %ebx 1673; SCALAR-NEXT: movzbl 3(%rdi), %r8d 1674; SCALAR-NEXT: movzbl 2(%rdi), %ecx 1675; SCALAR-NEXT: movzbl (%rdi), %eax 1676; SCALAR-NEXT: movzbl 1(%rdi), %r13d 1677; SCALAR-NEXT: notb %al 1678; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 1679; SCALAR-NEXT: notb %r13b 1680; SCALAR-NEXT: notb %cl 1681; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 1682; SCALAR-NEXT: notb %r8b 1683; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 1684; SCALAR-NEXT: notb %bl 1685; SCALAR-NEXT: notb %r9b 1686; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 1687; SCALAR-NEXT: notb %r10b 1688; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 1689; SCALAR-NEXT: notb %r12b 1690; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 1691; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload 1692; SCALAR-NEXT: notb %r11b 1693; SCALAR-NEXT: movl %r14d, %r10d 1694; SCALAR-NEXT: notb %r10b 
1695; SCALAR-NEXT: notb %bpl 1696; SCALAR-NEXT: movl %ebp, %r14d 1697; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload 1698; SCALAR-NEXT: notb %r8b 1699; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 1700; SCALAR-NEXT: movl %r15d, %edi 1701; SCALAR-NEXT: notb %dil 1702; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 1703; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload 1704; SCALAR-NEXT: notb %r9b 1705; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload 1706; SCALAR-NEXT: notb %bpl 1707; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload 1708; SCALAR-NEXT: notb %r15b 1709; SCALAR-NEXT: movb %r15b, 15(%rsi) 1710; SCALAR-NEXT: movb %bpl, 14(%rsi) 1711; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 1712; SCALAR-NEXT: movl %r9d, %eax 1713; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 1714; SCALAR-NEXT: movb %r9b, 13(%rsi) 1715; SCALAR-NEXT: movb %dil, 12(%rsi) 1716; SCALAR-NEXT: movb %r8b, 11(%rsi) 1717; SCALAR-NEXT: movb %r14b, 10(%rsi) 1718; SCALAR-NEXT: movb %r10b, 9(%rsi) 1719; SCALAR-NEXT: movl %r10d, %r8d 1720; SCALAR-NEXT: movb %r11b, 8(%rsi) 1721; SCALAR-NEXT: movl %r11d, %r9d 1722; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 1723; SCALAR-NEXT: movb %r12b, 7(%rsi) 1724; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload 1725; SCALAR-NEXT: movb %cl, 6(%rsi) 1726; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload 1727; SCALAR-NEXT: movb %dil, 5(%rsi) 1728; SCALAR-NEXT: movb %bl, 4(%rsi) 1729; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload 1730; SCALAR-NEXT: movb %cl, 3(%rsi) 1731; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload 1732; SCALAR-NEXT: movb %cl, 2(%rsi) 1733; SCALAR-NEXT: movb %r13b, 1(%rsi) 1734; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload 1735; SCALAR-NEXT: movb %r10b, (%rsi) 1736; SCALAR-NEXT: movb %r15b, 15(%rdx) 1737; SCALAR-NEXT: movl %r15d, %r11d 1738; SCALAR-NEXT: movb %bpl, 14(%rdx) 1739; SCALAR-NEXT: movb %al, 13(%rdx) 1740; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload 1741; SCALAR-NEXT: movb %r12b, 12(%rdx) 1742; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload 1743; SCALAR-NEXT: movb %r15b, 11(%rdx) 1744; SCALAR-NEXT: movb %r14b, 10(%rdx) 1745; SCALAR-NEXT: movb %r8b, 9(%rdx) 1746; SCALAR-NEXT: movb %r9b, 8(%rdx) 1747; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload 1748; SCALAR-NEXT: movb %r9b, 7(%rdx) 1749; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload 1750; SCALAR-NEXT: movb %al, 6(%rdx) 1751; SCALAR-NEXT: movb %dil, 5(%rdx) 1752; SCALAR-NEXT: movb %bl, 4(%rdx) 1753; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload 1754; SCALAR-NEXT: movb %sil, 3(%rdx) 1755; SCALAR-NEXT: movb %cl, 2(%rdx) 1756; SCALAR-NEXT: movb %r13b, 1(%rdx) 1757; SCALAR-NEXT: movl %r10d, %edi 1758; SCALAR-NEXT: movb %r10b, (%rdx) 1759; SCALAR-NEXT: movb %r11b, 31(%rdx) 1760; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload 1761; SCALAR-NEXT: movb %r10b, 30(%rdx) 1762; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload 1763; SCALAR-NEXT: movb %r10b, 29(%rdx) 1764; SCALAR-NEXT: movb %r12b, 28(%rdx) 1765; SCALAR-NEXT: movb %r15b, 27(%rdx) 1766; SCALAR-NEXT: movb %r14b, 26(%rdx) 1767; SCALAR-NEXT: 
movb %r8b, 25(%rdx) 1768; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload 1769; SCALAR-NEXT: movb %r10b, 24(%rdx) 1770; SCALAR-NEXT: movb %r9b, 23(%rdx) 1771; SCALAR-NEXT: movb %al, 22(%rdx) 1772; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload 1773; SCALAR-NEXT: movb %al, 21(%rdx) 1774; SCALAR-NEXT: movb %bl, 20(%rdx) 1775; SCALAR-NEXT: movb %sil, 19(%rdx) 1776; SCALAR-NEXT: movb %cl, 18(%rdx) 1777; SCALAR-NEXT: movb %r13b, 17(%rdx) 1778; SCALAR-NEXT: movb %dil, 16(%rdx) 1779; SCALAR-NEXT: popq %rbx 1780; SCALAR-NEXT: popq %r12 1781; SCALAR-NEXT: popq %r13 1782; SCALAR-NEXT: popq %r14 1783; SCALAR-NEXT: popq %r15 1784; SCALAR-NEXT: popq %rbp 1785; SCALAR-NEXT: retq 1786; 1787; SSE2-LABEL: vec256_v16i8: 1788; SSE2: # %bb.0: 1789; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 1790; SSE2-NEXT: pxor (%rdi), %xmm0 1791; SSE2-NEXT: movdqa %xmm0, (%rsi) 1792; SSE2-NEXT: movdqa %xmm0, (%rdx) 1793; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 1794; SSE2-NEXT: retq 1795; 1796; AVX-LABEL: vec256_v16i8: 1797; AVX: # %bb.0: 1798; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1799; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 1800; AVX-NEXT: vmovdqa %xmm0, (%rsi) 1801; AVX-NEXT: vmovdqa %xmm0, (%rdx) 1802; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 1803; AVX-NEXT: retq 1804 %in.subvec.not = load <16 x i8>, ptr %in.subvec.ptr, align 64 1805 %in.subvec = xor <16 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> 1806 store <16 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 1807 %out.subvec0.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 0 1808 store <16 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 1809 %out.subvec1.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 1 1810 store <16 x i8> %in.subvec, ptr %out.subvec1.ptr, align 16 1811 ret void 1812} 1813 1814define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 1815; SCALAR-LABEL: vec384_v2i8: 1816; SCALAR: # %bb.0: 1817; SCALAR-NEXT: movzbl (%rdi), %eax 1818; SCALAR-NEXT: movzbl 1(%rdi), %ecx 1819; SCALAR-NEXT: notb %al 1820; SCALAR-NEXT: notb %cl 1821; SCALAR-NEXT: movb %cl, 1(%rsi) 1822; SCALAR-NEXT: movb %al, (%rsi) 1823; SCALAR-NEXT: movb %cl, 1(%rdx) 1824; SCALAR-NEXT: movb %al, (%rdx) 1825; SCALAR-NEXT: movb %cl, 3(%rdx) 1826; SCALAR-NEXT: movb %al, 2(%rdx) 1827; SCALAR-NEXT: movb %cl, 5(%rdx) 1828; SCALAR-NEXT: movb %al, 4(%rdx) 1829; SCALAR-NEXT: movb %cl, 7(%rdx) 1830; SCALAR-NEXT: movb %al, 6(%rdx) 1831; SCALAR-NEXT: movb %cl, 9(%rdx) 1832; SCALAR-NEXT: movb %al, 8(%rdx) 1833; SCALAR-NEXT: movb %cl, 11(%rdx) 1834; SCALAR-NEXT: movb %al, 10(%rdx) 1835; SCALAR-NEXT: movb %cl, 13(%rdx) 1836; SCALAR-NEXT: movb %al, 12(%rdx) 1837; SCALAR-NEXT: movb %cl, 15(%rdx) 1838; SCALAR-NEXT: movb %al, 14(%rdx) 1839; SCALAR-NEXT: movb %cl, 17(%rdx) 1840; SCALAR-NEXT: movb %al, 16(%rdx) 1841; SCALAR-NEXT: movb %cl, 19(%rdx) 1842; SCALAR-NEXT: movb %al, 18(%rdx) 1843; SCALAR-NEXT: movb %cl, 21(%rdx) 1844; SCALAR-NEXT: movb %al, 20(%rdx) 1845; SCALAR-NEXT: movb %cl, 23(%rdx) 1846; SCALAR-NEXT: movb %al, 22(%rdx) 1847; SCALAR-NEXT: movb %cl, 25(%rdx) 1848; SCALAR-NEXT: movb %al, 24(%rdx) 1849; SCALAR-NEXT: movb %cl, 27(%rdx) 1850; SCALAR-NEXT: movb %al, 26(%rdx) 1851; SCALAR-NEXT: movb %cl, 29(%rdx) 1852; SCALAR-NEXT: movb %al, 28(%rdx) 1853; SCALAR-NEXT: movb %cl, 31(%rdx) 1854; SCALAR-NEXT: movb %al, 30(%rdx) 1855; SCALAR-NEXT: movb %cl, 33(%rdx) 1856; SCALAR-NEXT: movb %al, 32(%rdx) 1857; SCALAR-NEXT: movb %cl, 35(%rdx) 
1858; SCALAR-NEXT: movb %al, 34(%rdx) 1859; SCALAR-NEXT: movb %cl, 37(%rdx) 1860; SCALAR-NEXT: movb %al, 36(%rdx) 1861; SCALAR-NEXT: movb %cl, 39(%rdx) 1862; SCALAR-NEXT: movb %al, 38(%rdx) 1863; SCALAR-NEXT: movb %cl, 41(%rdx) 1864; SCALAR-NEXT: movb %al, 40(%rdx) 1865; SCALAR-NEXT: movb %cl, 43(%rdx) 1866; SCALAR-NEXT: movb %al, 42(%rdx) 1867; SCALAR-NEXT: movb %cl, 45(%rdx) 1868; SCALAR-NEXT: movb %al, 44(%rdx) 1869; SCALAR-NEXT: movb %cl, 47(%rdx) 1870; SCALAR-NEXT: movb %al, 46(%rdx) 1871; SCALAR-NEXT: retq 1872; 1873; SSE2-ONLY-LABEL: vec384_v2i8: 1874; SSE2-ONLY: # %bb.0: 1875; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 1876; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0 1877; SSE2-ONLY-NEXT: movd %xmm0, %eax 1878; SSE2-ONLY-NEXT: movw %ax, (%rsi) 1879; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 1880; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1881; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx) 1882; SSE2-ONLY-NEXT: movdqa %xmm0, 16(%rdx) 1883; SSE2-ONLY-NEXT: movdqa %xmm0, 32(%rdx) 1884; SSE2-ONLY-NEXT: retq 1885; 1886; SSE3-LABEL: vec384_v2i8: 1887; SSE3: # %bb.0: 1888; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 1889; SSE3-NEXT: pxor (%rdi), %xmm0 1890; SSE3-NEXT: movd %xmm0, %eax 1891; SSE3-NEXT: movw %ax, (%rsi) 1892; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 1893; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1894; SSE3-NEXT: movdqa %xmm0, (%rdx) 1895; SSE3-NEXT: movdqa %xmm0, 16(%rdx) 1896; SSE3-NEXT: movdqa %xmm0, 32(%rdx) 1897; SSE3-NEXT: retq 1898; 1899; SSSE3-ONLY-LABEL: vec384_v2i8: 1900; SSSE3-ONLY: # %bb.0: 1901; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 1902; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0 1903; SSSE3-ONLY-NEXT: movd %xmm0, %eax 1904; SSSE3-ONLY-NEXT: movw %ax, (%rsi) 1905; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 1906; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1907; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx) 1908; SSSE3-ONLY-NEXT: movdqa %xmm0, 16(%rdx) 1909; SSSE3-ONLY-NEXT: movdqa %xmm0, 32(%rdx) 1910; SSSE3-ONLY-NEXT: retq 1911; 1912; SSE41-LABEL: vec384_v2i8: 1913; SSE41: # %bb.0: 1914; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 1915; SSE41-NEXT: pxor (%rdi), %xmm0 1916; SSE41-NEXT: pextrw $0, %xmm0, (%rsi) 1917; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 1918; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1919; SSE41-NEXT: movdqa %xmm0, (%rdx) 1920; SSE41-NEXT: movdqa %xmm0, 16(%rdx) 1921; SSE41-NEXT: movdqa %xmm0, 32(%rdx) 1922; SSE41-NEXT: retq 1923; 1924; SSE42-LABEL: vec384_v2i8: 1925; SSE42: # %bb.0: 1926; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 1927; SSE42-NEXT: pxor (%rdi), %xmm0 1928; SSE42-NEXT: pextrw $0, %xmm0, (%rsi) 1929; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 1930; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1931; SSE42-NEXT: movdqa %xmm0, (%rdx) 1932; SSE42-NEXT: movdqa %xmm0, 16(%rdx) 1933; SSE42-NEXT: movdqa %xmm0, 32(%rdx) 1934; SSE42-NEXT: retq 1935; 1936; AVX1-LABEL: vec384_v2i8: 1937; AVX1: # %bb.0: 1938; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1939; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 1940; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi) 1941; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 1942; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1943; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 1944; AVX1-NEXT: vmovaps %ymm1, (%rdx) 1945; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) 1946; AVX1-NEXT: vzeroupper 1947; AVX1-NEXT: retq 1948; 1949; AVX2-LABEL: vec384_v2i8: 1950; AVX2: # %bb.0: 1951; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1952; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 1953; 
AVX2-NEXT: vpextrw $0, %xmm0, (%rsi) 1954; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 1955; AVX2-NEXT: vmovdqa %ymm0, (%rdx) 1956; AVX2-NEXT: vmovdqa %xmm0, 32(%rdx) 1957; AVX2-NEXT: vzeroupper 1958; AVX2-NEXT: retq 1959 %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64 1960 %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1> 1961 store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 1962 %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0 1963 store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 1964 %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1 1965 store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 1966 %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2 1967 store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4 1968 %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3 1969 store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2 1970 %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4 1971 store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8 1972 %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5 1973 store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2 1974 %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6 1975 store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4 1976 %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7 1977 store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2 1978 %out.subvec8.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 8 1979 store <2 x i8> %in.subvec, ptr %out.subvec8.ptr, align 16 1980 %out.subvec9.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 9 1981 store <2 x i8> %in.subvec, ptr %out.subvec9.ptr, align 2 1982 %out.subvec10.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 10 1983 store <2 x i8> %in.subvec, ptr %out.subvec10.ptr, align 4 1984 %out.subvec11.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 11 1985 store <2 x i8> %in.subvec, ptr %out.subvec11.ptr, align 2 1986 %out.subvec12.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 12 1987 store <2 x i8> %in.subvec, ptr %out.subvec12.ptr, align 8 1988 %out.subvec13.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 13 1989 store <2 x i8> %in.subvec, ptr %out.subvec13.ptr, align 2 1990 %out.subvec14.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 14 1991 store <2 x i8> %in.subvec, ptr %out.subvec14.ptr, align 4 1992 %out.subvec15.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 15 1993 store <2 x i8> %in.subvec, ptr %out.subvec15.ptr, align 2 1994 %out.subvec16.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 16 1995 store <2 x i8> %in.subvec, ptr %out.subvec16.ptr, align 32 1996 %out.subvec17.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 17 1997 store <2 x i8> %in.subvec, ptr %out.subvec17.ptr, align 2 1998 %out.subvec18.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 18 1999 store <2 x i8> %in.subvec, ptr %out.subvec18.ptr, align 4 2000 %out.subvec19.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 19 2001 store <2 x i8> %in.subvec, ptr %out.subvec19.ptr, align 2 2002 %out.subvec20.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 20 2003 store <2 x i8> %in.subvec, ptr %out.subvec20.ptr, align 8 2004 %out.subvec21.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 21 2005 store <2 x i8> %in.subvec, ptr %out.subvec21.ptr, align 2 2006 %out.subvec22.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 22 2007 store <2 x i8> %in.subvec, ptr %out.subvec22.ptr, align 4 2008 %out.subvec23.ptr = 
getelementptr <2 x i8>, ptr %out.vec.ptr, i64 23 2009 store <2 x i8> %in.subvec, ptr %out.subvec23.ptr, align 2 2010 ret void 2011} 2012 2013define void @vec384_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 2014; SCALAR-LABEL: vec384_v2i16: 2015; SCALAR: # %bb.0: 2016; SCALAR-NEXT: movzwl 2(%rdi), %ecx 2017; SCALAR-NEXT: movl (%rdi), %eax 2018; SCALAR-NEXT: notl %eax 2019; SCALAR-NEXT: notl %ecx 2020; SCALAR-NEXT: movw %cx, 2(%rsi) 2021; SCALAR-NEXT: movw %ax, (%rsi) 2022; SCALAR-NEXT: movw %cx, 2(%rdx) 2023; SCALAR-NEXT: movw %ax, (%rdx) 2024; SCALAR-NEXT: movw %cx, 6(%rdx) 2025; SCALAR-NEXT: movw %ax, 4(%rdx) 2026; SCALAR-NEXT: movw %cx, 10(%rdx) 2027; SCALAR-NEXT: movw %ax, 8(%rdx) 2028; SCALAR-NEXT: movw %cx, 14(%rdx) 2029; SCALAR-NEXT: movw %ax, 12(%rdx) 2030; SCALAR-NEXT: movw %cx, 18(%rdx) 2031; SCALAR-NEXT: movw %ax, 16(%rdx) 2032; SCALAR-NEXT: movw %cx, 22(%rdx) 2033; SCALAR-NEXT: movw %ax, 20(%rdx) 2034; SCALAR-NEXT: movw %cx, 26(%rdx) 2035; SCALAR-NEXT: movw %ax, 24(%rdx) 2036; SCALAR-NEXT: movw %cx, 30(%rdx) 2037; SCALAR-NEXT: movw %ax, 28(%rdx) 2038; SCALAR-NEXT: movw %cx, 34(%rdx) 2039; SCALAR-NEXT: movw %ax, 32(%rdx) 2040; SCALAR-NEXT: movw %cx, 38(%rdx) 2041; SCALAR-NEXT: movw %ax, 36(%rdx) 2042; SCALAR-NEXT: movw %cx, 42(%rdx) 2043; SCALAR-NEXT: movw %ax, 40(%rdx) 2044; SCALAR-NEXT: movw %cx, 46(%rdx) 2045; SCALAR-NEXT: movw %ax, 44(%rdx) 2046; SCALAR-NEXT: retq 2047; 2048; SSE2-LABEL: vec384_v2i16: 2049; SSE2: # %bb.0: 2050; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 2051; SSE2-NEXT: pxor (%rdi), %xmm0 2052; SSE2-NEXT: movd %xmm0, (%rsi) 2053; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2054; SSE2-NEXT: movdqa %xmm0, (%rdx) 2055; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 2056; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 2057; SSE2-NEXT: retq 2058; 2059; AVX1-LABEL: vec384_v2i16: 2060; AVX1: # %bb.0: 2061; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 2062; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 2063; AVX1-NEXT: vmovd %xmm0, (%rsi) 2064; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2065; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx) 2066; AVX1-NEXT: vmovdqa %xmm0, (%rdx) 2067; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) 2068; AVX1-NEXT: retq 2069; 2070; AVX2-LABEL: vec384_v2i16: 2071; AVX2: # %bb.0: 2072; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 2073; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 2074; AVX2-NEXT: vmovd %xmm0, (%rsi) 2075; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 2076; AVX2-NEXT: vmovdqa %ymm0, (%rdx) 2077; AVX2-NEXT: vmovdqa %xmm0, 32(%rdx) 2078; AVX2-NEXT: vzeroupper 2079; AVX2-NEXT: retq 2080 %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64 2081 %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1> 2082 store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 2083 %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0 2084 store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 2085 %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1 2086 store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4 2087 %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2 2088 store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8 2089 %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3 2090 store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4 2091 %out.subvec4.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 4 2092 store <2 x i16> %in.subvec, ptr %out.subvec4.ptr, align 16 2093 %out.subvec5.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 5 2094 store <2 x i16> %in.subvec, ptr %out.subvec5.ptr, align 4 
2095 %out.subvec6.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 6 2096 store <2 x i16> %in.subvec, ptr %out.subvec6.ptr, align 8 2097 %out.subvec7.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 7 2098 store <2 x i16> %in.subvec, ptr %out.subvec7.ptr, align 4 2099 %out.subvec8.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 8 2100 store <2 x i16> %in.subvec, ptr %out.subvec8.ptr, align 32 2101 %out.subvec9.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 9 2102 store <2 x i16> %in.subvec, ptr %out.subvec9.ptr, align 4 2103 %out.subvec10.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 10 2104 store <2 x i16> %in.subvec, ptr %out.subvec10.ptr, align 8 2105 %out.subvec11.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 11 2106 store <2 x i16> %in.subvec, ptr %out.subvec11.ptr, align 4 2107 ret void 2108} 2109 2110define void @vec384_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 2111; SCALAR-LABEL: vec384_v2i32: 2112; SCALAR: # %bb.0: 2113; SCALAR-NEXT: movl (%rdi), %eax 2114; SCALAR-NEXT: movl 4(%rdi), %ecx 2115; SCALAR-NEXT: notl %eax 2116; SCALAR-NEXT: notl %ecx 2117; SCALAR-NEXT: movl %ecx, 4(%rsi) 2118; SCALAR-NEXT: movl %eax, (%rsi) 2119; SCALAR-NEXT: movl %ecx, 4(%rdx) 2120; SCALAR-NEXT: movl %eax, (%rdx) 2121; SCALAR-NEXT: movl %ecx, 12(%rdx) 2122; SCALAR-NEXT: movl %eax, 8(%rdx) 2123; SCALAR-NEXT: movl %ecx, 20(%rdx) 2124; SCALAR-NEXT: movl %eax, 16(%rdx) 2125; SCALAR-NEXT: movl %ecx, 28(%rdx) 2126; SCALAR-NEXT: movl %eax, 24(%rdx) 2127; SCALAR-NEXT: movl %ecx, 36(%rdx) 2128; SCALAR-NEXT: movl %eax, 32(%rdx) 2129; SCALAR-NEXT: movl %ecx, 44(%rdx) 2130; SCALAR-NEXT: movl %eax, 40(%rdx) 2131; SCALAR-NEXT: retq 2132; 2133; SSE2-LABEL: vec384_v2i32: 2134; SSE2: # %bb.0: 2135; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2136; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 2137; SSE2-NEXT: pxor %xmm0, %xmm1 2138; SSE2-NEXT: movq %xmm1, (%rsi) 2139; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] 2140; SSE2-NEXT: movdqa %xmm0, (%rdx) 2141; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 2142; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 2143; SSE2-NEXT: retq 2144; 2145; AVX1-LABEL: vec384_v2i32: 2146; AVX1: # %bb.0: 2147; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 2148; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2149; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 2150; AVX1-NEXT: vmovq %xmm0, (%rsi) 2151; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 2152; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 2153; AVX1-NEXT: vmovaps %ymm1, (%rdx) 2154; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) 2155; AVX1-NEXT: vzeroupper 2156; AVX1-NEXT: retq 2157; 2158; AVX2-ONLY-LABEL: vec384_v2i32: 2159; AVX2-ONLY: # %bb.0: 2160; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 2161; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2162; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 2163; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) 2164; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 2165; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) 2166; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) 2167; AVX2-ONLY-NEXT: vzeroupper 2168; AVX2-ONLY-NEXT: retq 2169; 2170; AVX512-LABEL: vec384_v2i32: 2171; AVX512: # %bb.0: 2172; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 2173; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 2174; AVX512-NEXT: vmovq %xmm0, (%rsi) 2175; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 2176; AVX512-NEXT: vmovdqa %ymm0, (%rdx) 2177; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx) 2178; AVX512-NEXT: vzeroupper 2179; AVX512-NEXT: retq 2180 %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 2181 %in.subvec = xor <2 x i32> 
%in.subvec.not, <i32 -1, i32 -1> 2182 store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 2183 %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0 2184 store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 2185 %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1 2186 store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8 2187 %out.subvec2.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 2 2188 store <2 x i32> %in.subvec, ptr %out.subvec2.ptr, align 16 2189 %out.subvec3.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 3 2190 store <2 x i32> %in.subvec, ptr %out.subvec3.ptr, align 8 2191 %out.subvec4.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 4 2192 store <2 x i32> %in.subvec, ptr %out.subvec4.ptr, align 32 2193 %out.subvec5.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 5 2194 store <2 x i32> %in.subvec, ptr %out.subvec5.ptr, align 8 2195 ret void 2196} 2197 2198define void @vec384_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 2199; SCALAR-LABEL: vec384_v2f32: 2200; SCALAR: # %bb.0: 2201; SCALAR-NEXT: movl (%rdi), %eax 2202; SCALAR-NEXT: movl 4(%rdi), %ecx 2203; SCALAR-NEXT: notl %eax 2204; SCALAR-NEXT: notl %ecx 2205; SCALAR-NEXT: movl %ecx, 4(%rsi) 2206; SCALAR-NEXT: movl %eax, (%rsi) 2207; SCALAR-NEXT: movl %ecx, 4(%rdx) 2208; SCALAR-NEXT: movl %eax, (%rdx) 2209; SCALAR-NEXT: movl %ecx, 12(%rdx) 2210; SCALAR-NEXT: movl %eax, 8(%rdx) 2211; SCALAR-NEXT: movl %ecx, 20(%rdx) 2212; SCALAR-NEXT: movl %eax, 16(%rdx) 2213; SCALAR-NEXT: movl %ecx, 28(%rdx) 2214; SCALAR-NEXT: movl %eax, 24(%rdx) 2215; SCALAR-NEXT: movl %ecx, 36(%rdx) 2216; SCALAR-NEXT: movl %eax, 32(%rdx) 2217; SCALAR-NEXT: movl %ecx, 44(%rdx) 2218; SCALAR-NEXT: movl %eax, 40(%rdx) 2219; SCALAR-NEXT: retq 2220; 2221; SSE2-LABEL: vec384_v2f32: 2222; SSE2: # %bb.0: 2223; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2224; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 2225; SSE2-NEXT: pxor %xmm0, %xmm1 2226; SSE2-NEXT: movq %xmm1, (%rsi) 2227; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] 2228; SSE2-NEXT: movdqa %xmm0, (%rdx) 2229; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 2230; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 2231; SSE2-NEXT: retq 2232; 2233; AVX1-LABEL: vec384_v2f32: 2234; AVX1: # %bb.0: 2235; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 2236; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2237; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 2238; AVX1-NEXT: vmovq %xmm0, (%rsi) 2239; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 2240; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 2241; AVX1-NEXT: vmovaps %ymm1, (%rdx) 2242; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) 2243; AVX1-NEXT: vzeroupper 2244; AVX1-NEXT: retq 2245; 2246; AVX2-ONLY-LABEL: vec384_v2f32: 2247; AVX2-ONLY: # %bb.0: 2248; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 2249; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2250; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 2251; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) 2252; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 2253; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) 2254; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) 2255; AVX2-ONLY-NEXT: vzeroupper 2256; AVX2-ONLY-NEXT: retq 2257; 2258; AVX512-LABEL: vec384_v2f32: 2259; AVX512: # %bb.0: 2260; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 2261; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 2262; AVX512-NEXT: vmovq %xmm0, (%rsi) 2263; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 2264; AVX512-NEXT: vmovdqa %ymm0, (%rdx) 2265; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx) 2266; AVX512-NEXT: vzeroupper 2267; AVX512-NEXT: retq 2268 %in.subvec.not = 
load <2 x i32>, ptr %in.subvec.ptr, align 64 2269 %in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1> 2270 %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> 2271 store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64 2272 %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0 2273 store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 2274 %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1 2275 store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8 2276 %out.subvec2.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 2 2277 store <2 x float> %in.subvec, ptr %out.subvec2.ptr, align 16 2278 %out.subvec3.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 3 2279 store <2 x float> %in.subvec, ptr %out.subvec3.ptr, align 8 2280 %out.subvec4.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 4 2281 store <2 x float> %in.subvec, ptr %out.subvec4.ptr, align 32 2282 %out.subvec5.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 5 2283 store <2 x float> %in.subvec, ptr %out.subvec5.ptr, align 8 2284 ret void 2285} 2286 2287define void @vec384_v2i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 2288; SCALAR-LABEL: vec384_v2i64: 2289; SCALAR: # %bb.0: 2290; SCALAR-NEXT: movq (%rdi), %rax 2291; SCALAR-NEXT: movq 8(%rdi), %rcx 2292; SCALAR-NEXT: notq %rax 2293; SCALAR-NEXT: notq %rcx 2294; SCALAR-NEXT: movq %rcx, 8(%rsi) 2295; SCALAR-NEXT: movq %rax, (%rsi) 2296; SCALAR-NEXT: movq %rcx, 8(%rdx) 2297; SCALAR-NEXT: movq %rax, (%rdx) 2298; SCALAR-NEXT: movq %rcx, 24(%rdx) 2299; SCALAR-NEXT: movq %rax, 16(%rdx) 2300; SCALAR-NEXT: movq %rcx, 40(%rdx) 2301; SCALAR-NEXT: movq %rax, 32(%rdx) 2302; SCALAR-NEXT: retq 2303; 2304; SSE2-LABEL: vec384_v2i64: 2305; SSE2: # %bb.0: 2306; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 2307; SSE2-NEXT: pxor (%rdi), %xmm0 2308; SSE2-NEXT: movdqa %xmm0, (%rsi) 2309; SSE2-NEXT: movdqa %xmm0, (%rdx) 2310; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 2311; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 2312; SSE2-NEXT: retq 2313; 2314; AVX-LABEL: vec384_v2i64: 2315; AVX: # %bb.0: 2316; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 2317; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 2318; AVX-NEXT: vmovdqa %xmm0, (%rsi) 2319; AVX-NEXT: vmovdqa %xmm0, (%rdx) 2320; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 2321; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) 2322; AVX-NEXT: retq 2323 %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64 2324 %in.subvec = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1> 2325 store <2 x i64> %in.subvec, ptr %out.subvec.ptr, align 64 2326 %out.subvec0.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 0 2327 store <2 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64 2328 %out.subvec1.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 1 2329 store <2 x i64> %in.subvec, ptr %out.subvec1.ptr, align 16 2330 %out.subvec2.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 2 2331 store <2 x i64> %in.subvec, ptr %out.subvec2.ptr, align 32 2332 ret void 2333} 2334 2335define void @vec384_v2f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 2336; SCALAR-LABEL: vec384_v2f64: 2337; SCALAR: # %bb.0: 2338; SCALAR-NEXT: movq (%rdi), %rax 2339; SCALAR-NEXT: movq 8(%rdi), %rcx 2340; SCALAR-NEXT: notq %rax 2341; SCALAR-NEXT: notq %rcx 2342; SCALAR-NEXT: movq %rcx, 8(%rsi) 2343; SCALAR-NEXT: movq %rax, (%rsi) 2344; SCALAR-NEXT: movq %rcx, 8(%rdx) 2345; SCALAR-NEXT: movq %rax, (%rdx) 2346; SCALAR-NEXT: movq %rcx, 24(%rdx) 2347; SCALAR-NEXT: movq %rax, 16(%rdx) 2348; SCALAR-NEXT: movq %rcx, 40(%rdx) 
2349; SCALAR-NEXT: movq %rax, 32(%rdx) 2350; SCALAR-NEXT: retq 2351; 2352; SSE2-LABEL: vec384_v2f64: 2353; SSE2: # %bb.0: 2354; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 2355; SSE2-NEXT: pxor (%rdi), %xmm0 2356; SSE2-NEXT: movdqa %xmm0, (%rsi) 2357; SSE2-NEXT: movdqa %xmm0, (%rdx) 2358; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 2359; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 2360; SSE2-NEXT: retq 2361; 2362; AVX-LABEL: vec384_v2f64: 2363; AVX: # %bb.0: 2364; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 2365; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 2366; AVX-NEXT: vmovdqa %xmm0, (%rsi) 2367; AVX-NEXT: vmovdqa %xmm0, (%rdx) 2368; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 2369; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) 2370; AVX-NEXT: retq 2371 %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64 2372 %in.subvec.int = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1> 2373 %in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double> 2374 store <2 x double> %in.subvec, ptr %out.subvec.ptr, align 64 2375 %out.subvec0.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 0 2376 store <2 x double> %in.subvec, ptr %out.subvec0.ptr, align 64 2377 %out.subvec1.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 1 2378 store <2 x double> %in.subvec, ptr %out.subvec1.ptr, align 16 2379 %out.subvec2.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 2 2380 store <2 x double> %in.subvec, ptr %out.subvec2.ptr, align 32 2381 ret void 2382} 2383 2384define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 2385; SCALAR-LABEL: vec384_v3i8: 2386; SCALAR: # %bb.0: 2387; SCALAR-NEXT: movl (%rdi), %eax 2388; SCALAR-NEXT: movl %eax, %ecx 2389; SCALAR-NEXT: shrl $16, %ecx 2390; SCALAR-NEXT: notb %cl 2391; SCALAR-NEXT: notl %eax 2392; SCALAR-NEXT: movw %ax, (%rsi) 2393; SCALAR-NEXT: movb %cl, 2(%rsi) 2394; SCALAR-NEXT: movb %cl, 2(%rdx) 2395; SCALAR-NEXT: movw %ax, (%rdx) 2396; SCALAR-NEXT: movb %cl, 6(%rdx) 2397; SCALAR-NEXT: movw %ax, 4(%rdx) 2398; SCALAR-NEXT: movb %cl, 10(%rdx) 2399; SCALAR-NEXT: movw %ax, 8(%rdx) 2400; SCALAR-NEXT: movb %cl, 14(%rdx) 2401; SCALAR-NEXT: movw %ax, 12(%rdx) 2402; SCALAR-NEXT: movb %cl, 18(%rdx) 2403; SCALAR-NEXT: movw %ax, 16(%rdx) 2404; SCALAR-NEXT: movb %cl, 22(%rdx) 2405; SCALAR-NEXT: movw %ax, 20(%rdx) 2406; SCALAR-NEXT: movb %cl, 26(%rdx) 2407; SCALAR-NEXT: movw %ax, 24(%rdx) 2408; SCALAR-NEXT: movb %cl, 30(%rdx) 2409; SCALAR-NEXT: movw %ax, 28(%rdx) 2410; SCALAR-NEXT: movb %cl, 34(%rdx) 2411; SCALAR-NEXT: movw %ax, 32(%rdx) 2412; SCALAR-NEXT: movb %cl, 38(%rdx) 2413; SCALAR-NEXT: movw %ax, 36(%rdx) 2414; SCALAR-NEXT: movb %cl, 42(%rdx) 2415; SCALAR-NEXT: movw %ax, 40(%rdx) 2416; SCALAR-NEXT: movb %cl, 46(%rdx) 2417; SCALAR-NEXT: movw %ax, 44(%rdx) 2418; SCALAR-NEXT: movb %cl, 50(%rdx) 2419; SCALAR-NEXT: movw %ax, 48(%rdx) 2420; SCALAR-NEXT: movb %cl, 54(%rdx) 2421; SCALAR-NEXT: movw %ax, 52(%rdx) 2422; SCALAR-NEXT: movb %cl, 58(%rdx) 2423; SCALAR-NEXT: movw %ax, 56(%rdx) 2424; SCALAR-NEXT: movb %cl, 62(%rdx) 2425; SCALAR-NEXT: movw %ax, 60(%rdx) 2426; SCALAR-NEXT: retq 2427; 2428; SSE2-ONLY-LABEL: vec384_v3i8: 2429; SSE2-ONLY: # %bb.0: 2430; SSE2-ONLY-NEXT: movl (%rdi), %eax 2431; SSE2-ONLY-NEXT: notl %eax 2432; SSE2-ONLY-NEXT: movw %ax, (%rsi) 2433; SSE2-ONLY-NEXT: movl %eax, %ecx 2434; SSE2-ONLY-NEXT: shrl $16, %ecx 2435; SSE2-ONLY-NEXT: movb %cl, 2(%rsi) 2436; SSE2-ONLY-NEXT: movb %cl, 2(%rdx) 2437; SSE2-ONLY-NEXT: movw %ax, (%rdx) 2438; SSE2-ONLY-NEXT: movb %cl, 6(%rdx) 2439; SSE2-ONLY-NEXT: movw %ax, 4(%rdx) 2440; SSE2-ONLY-NEXT: movb %cl, 10(%rdx) 2441; 
SSE2-ONLY-NEXT: movw %ax, 8(%rdx) 2442; SSE2-ONLY-NEXT: movb %cl, 14(%rdx) 2443; SSE2-ONLY-NEXT: movw %ax, 12(%rdx) 2444; SSE2-ONLY-NEXT: movb %cl, 18(%rdx) 2445; SSE2-ONLY-NEXT: movw %ax, 16(%rdx) 2446; SSE2-ONLY-NEXT: movb %cl, 22(%rdx) 2447; SSE2-ONLY-NEXT: movw %ax, 20(%rdx) 2448; SSE2-ONLY-NEXT: movb %cl, 26(%rdx) 2449; SSE2-ONLY-NEXT: movw %ax, 24(%rdx) 2450; SSE2-ONLY-NEXT: movb %cl, 30(%rdx) 2451; SSE2-ONLY-NEXT: movw %ax, 28(%rdx) 2452; SSE2-ONLY-NEXT: movb %cl, 34(%rdx) 2453; SSE2-ONLY-NEXT: movw %ax, 32(%rdx) 2454; SSE2-ONLY-NEXT: movb %cl, 38(%rdx) 2455; SSE2-ONLY-NEXT: movw %ax, 36(%rdx) 2456; SSE2-ONLY-NEXT: movb %cl, 42(%rdx) 2457; SSE2-ONLY-NEXT: movw %ax, 40(%rdx) 2458; SSE2-ONLY-NEXT: movb %cl, 46(%rdx) 2459; SSE2-ONLY-NEXT: movw %ax, 44(%rdx) 2460; SSE2-ONLY-NEXT: movb %cl, 50(%rdx) 2461; SSE2-ONLY-NEXT: movw %ax, 48(%rdx) 2462; SSE2-ONLY-NEXT: movb %cl, 54(%rdx) 2463; SSE2-ONLY-NEXT: movw %ax, 52(%rdx) 2464; SSE2-ONLY-NEXT: movb %cl, 58(%rdx) 2465; SSE2-ONLY-NEXT: movw %ax, 56(%rdx) 2466; SSE2-ONLY-NEXT: movb %cl, 62(%rdx) 2467; SSE2-ONLY-NEXT: movw %ax, 60(%rdx) 2468; SSE2-ONLY-NEXT: retq 2469; 2470; SSE3-LABEL: vec384_v3i8: 2471; SSE3: # %bb.0: 2472; SSE3-NEXT: movl (%rdi), %eax 2473; SSE3-NEXT: notl %eax 2474; SSE3-NEXT: movw %ax, (%rsi) 2475; SSE3-NEXT: movl %eax, %ecx 2476; SSE3-NEXT: shrl $16, %ecx 2477; SSE3-NEXT: movb %cl, 2(%rsi) 2478; SSE3-NEXT: movb %cl, 2(%rdx) 2479; SSE3-NEXT: movw %ax, (%rdx) 2480; SSE3-NEXT: movb %cl, 6(%rdx) 2481; SSE3-NEXT: movw %ax, 4(%rdx) 2482; SSE3-NEXT: movb %cl, 10(%rdx) 2483; SSE3-NEXT: movw %ax, 8(%rdx) 2484; SSE3-NEXT: movb %cl, 14(%rdx) 2485; SSE3-NEXT: movw %ax, 12(%rdx) 2486; SSE3-NEXT: movb %cl, 18(%rdx) 2487; SSE3-NEXT: movw %ax, 16(%rdx) 2488; SSE3-NEXT: movb %cl, 22(%rdx) 2489; SSE3-NEXT: movw %ax, 20(%rdx) 2490; SSE3-NEXT: movb %cl, 26(%rdx) 2491; SSE3-NEXT: movw %ax, 24(%rdx) 2492; SSE3-NEXT: movb %cl, 30(%rdx) 2493; SSE3-NEXT: movw %ax, 28(%rdx) 2494; SSE3-NEXT: movb %cl, 34(%rdx) 2495; SSE3-NEXT: movw %ax, 32(%rdx) 2496; SSE3-NEXT: movb %cl, 38(%rdx) 2497; SSE3-NEXT: movw %ax, 36(%rdx) 2498; SSE3-NEXT: movb %cl, 42(%rdx) 2499; SSE3-NEXT: movw %ax, 40(%rdx) 2500; SSE3-NEXT: movb %cl, 46(%rdx) 2501; SSE3-NEXT: movw %ax, 44(%rdx) 2502; SSE3-NEXT: movb %cl, 50(%rdx) 2503; SSE3-NEXT: movw %ax, 48(%rdx) 2504; SSE3-NEXT: movb %cl, 54(%rdx) 2505; SSE3-NEXT: movw %ax, 52(%rdx) 2506; SSE3-NEXT: movb %cl, 58(%rdx) 2507; SSE3-NEXT: movw %ax, 56(%rdx) 2508; SSE3-NEXT: movb %cl, 62(%rdx) 2509; SSE3-NEXT: movw %ax, 60(%rdx) 2510; SSE3-NEXT: retq 2511; 2512; SSSE3-ONLY-LABEL: vec384_v3i8: 2513; SSSE3-ONLY: # %bb.0: 2514; SSSE3-ONLY-NEXT: movl (%rdi), %eax 2515; SSSE3-ONLY-NEXT: notl %eax 2516; SSSE3-ONLY-NEXT: movw %ax, (%rsi) 2517; SSSE3-ONLY-NEXT: movl %eax, %ecx 2518; SSSE3-ONLY-NEXT: shrl $16, %ecx 2519; SSSE3-ONLY-NEXT: movb %cl, 2(%rsi) 2520; SSSE3-ONLY-NEXT: movb %cl, 2(%rdx) 2521; SSSE3-ONLY-NEXT: movw %ax, (%rdx) 2522; SSSE3-ONLY-NEXT: movb %cl, 6(%rdx) 2523; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx) 2524; SSSE3-ONLY-NEXT: movb %cl, 10(%rdx) 2525; SSSE3-ONLY-NEXT: movw %ax, 8(%rdx) 2526; SSSE3-ONLY-NEXT: movb %cl, 14(%rdx) 2527; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx) 2528; SSSE3-ONLY-NEXT: movb %cl, 18(%rdx) 2529; SSSE3-ONLY-NEXT: movw %ax, 16(%rdx) 2530; SSSE3-ONLY-NEXT: movb %cl, 22(%rdx) 2531; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx) 2532; SSSE3-ONLY-NEXT: movb %cl, 26(%rdx) 2533; SSSE3-ONLY-NEXT: movw %ax, 24(%rdx) 2534; SSSE3-ONLY-NEXT: movb %cl, 30(%rdx) 2535; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx) 2536; SSSE3-ONLY-NEXT: movb %cl, 
34(%rdx) 2537; SSSE3-ONLY-NEXT: movw %ax, 32(%rdx) 2538; SSSE3-ONLY-NEXT: movb %cl, 38(%rdx) 2539; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx) 2540; SSSE3-ONLY-NEXT: movb %cl, 42(%rdx) 2541; SSSE3-ONLY-NEXT: movw %ax, 40(%rdx) 2542; SSSE3-ONLY-NEXT: movb %cl, 46(%rdx) 2543; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx) 2544; SSSE3-ONLY-NEXT: movb %cl, 50(%rdx) 2545; SSSE3-ONLY-NEXT: movw %ax, 48(%rdx) 2546; SSSE3-ONLY-NEXT: movb %cl, 54(%rdx) 2547; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx) 2548; SSSE3-ONLY-NEXT: movb %cl, 58(%rdx) 2549; SSSE3-ONLY-NEXT: movw %ax, 56(%rdx) 2550; SSSE3-ONLY-NEXT: movb %cl, 62(%rdx) 2551; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx) 2552; SSSE3-ONLY-NEXT: retq 2553; 2554; SSE41-LABEL: vec384_v3i8: 2555; SSE41: # %bb.0: 2556; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 2557; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 2558; SSE41-NEXT: pxor %xmm1, %xmm0 2559; SSE41-NEXT: pextrb $2, %xmm0, 2(%rsi) 2560; SSE41-NEXT: movd %xmm0, %eax 2561; SSE41-NEXT: movw %ax, (%rsi) 2562; SSE41-NEXT: pextrb $2, %xmm0, 2(%rdx) 2563; SSE41-NEXT: movw %ax, (%rdx) 2564; SSE41-NEXT: pextrb $2, %xmm0, 6(%rdx) 2565; SSE41-NEXT: movw %ax, 4(%rdx) 2566; SSE41-NEXT: pextrb $2, %xmm0, 10(%rdx) 2567; SSE41-NEXT: movw %ax, 8(%rdx) 2568; SSE41-NEXT: pextrb $2, %xmm0, 14(%rdx) 2569; SSE41-NEXT: movw %ax, 12(%rdx) 2570; SSE41-NEXT: pextrb $2, %xmm0, 18(%rdx) 2571; SSE41-NEXT: movw %ax, 16(%rdx) 2572; SSE41-NEXT: pextrb $2, %xmm0, 22(%rdx) 2573; SSE41-NEXT: movw %ax, 20(%rdx) 2574; SSE41-NEXT: pextrb $2, %xmm0, 26(%rdx) 2575; SSE41-NEXT: movw %ax, 24(%rdx) 2576; SSE41-NEXT: pextrb $2, %xmm0, 30(%rdx) 2577; SSE41-NEXT: movw %ax, 28(%rdx) 2578; SSE41-NEXT: pextrb $2, %xmm0, 34(%rdx) 2579; SSE41-NEXT: movw %ax, 32(%rdx) 2580; SSE41-NEXT: pextrb $2, %xmm0, 38(%rdx) 2581; SSE41-NEXT: movw %ax, 36(%rdx) 2582; SSE41-NEXT: pextrb $2, %xmm0, 42(%rdx) 2583; SSE41-NEXT: movw %ax, 40(%rdx) 2584; SSE41-NEXT: pextrb $2, %xmm0, 46(%rdx) 2585; SSE41-NEXT: movw %ax, 44(%rdx) 2586; SSE41-NEXT: pextrb $2, %xmm0, 50(%rdx) 2587; SSE41-NEXT: movw %ax, 48(%rdx) 2588; SSE41-NEXT: pextrb $2, %xmm0, 54(%rdx) 2589; SSE41-NEXT: movw %ax, 52(%rdx) 2590; SSE41-NEXT: pextrb $2, %xmm0, 58(%rdx) 2591; SSE41-NEXT: movw %ax, 56(%rdx) 2592; SSE41-NEXT: pextrb $2, %xmm0, 62(%rdx) 2593; SSE41-NEXT: movw %ax, 60(%rdx) 2594; SSE41-NEXT: retq 2595; 2596; SSE42-LABEL: vec384_v3i8: 2597; SSE42: # %bb.0: 2598; SSE42-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 2599; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 2600; SSE42-NEXT: pxor %xmm1, %xmm0 2601; SSE42-NEXT: pextrb $2, %xmm0, 2(%rsi) 2602; SSE42-NEXT: movd %xmm0, %eax 2603; SSE42-NEXT: movw %ax, (%rsi) 2604; SSE42-NEXT: pextrb $2, %xmm0, 2(%rdx) 2605; SSE42-NEXT: movw %ax, (%rdx) 2606; SSE42-NEXT: pextrb $2, %xmm0, 6(%rdx) 2607; SSE42-NEXT: movw %ax, 4(%rdx) 2608; SSE42-NEXT: pextrb $2, %xmm0, 10(%rdx) 2609; SSE42-NEXT: movw %ax, 8(%rdx) 2610; SSE42-NEXT: pextrb $2, %xmm0, 14(%rdx) 2611; SSE42-NEXT: movw %ax, 12(%rdx) 2612; SSE42-NEXT: pextrb $2, %xmm0, 18(%rdx) 2613; SSE42-NEXT: movw %ax, 16(%rdx) 2614; SSE42-NEXT: pextrb $2, %xmm0, 22(%rdx) 2615; SSE42-NEXT: movw %ax, 20(%rdx) 2616; SSE42-NEXT: pextrb $2, %xmm0, 26(%rdx) 2617; SSE42-NEXT: movw %ax, 24(%rdx) 2618; SSE42-NEXT: pextrb $2, %xmm0, 30(%rdx) 2619; SSE42-NEXT: movw %ax, 28(%rdx) 2620; SSE42-NEXT: pextrb $2, %xmm0, 34(%rdx) 2621; SSE42-NEXT: movw %ax, 32(%rdx) 2622; SSE42-NEXT: pextrb $2, %xmm0, 38(%rdx) 2623; SSE42-NEXT: movw %ax, 36(%rdx) 2624; SSE42-NEXT: pextrb $2, %xmm0, 42(%rdx) 2625; SSE42-NEXT: movw %ax, 40(%rdx) 2626; SSE42-NEXT: pextrb $2, %xmm0, 
46(%rdx) 2627; SSE42-NEXT: movw %ax, 44(%rdx) 2628; SSE42-NEXT: pextrb $2, %xmm0, 50(%rdx) 2629; SSE42-NEXT: movw %ax, 48(%rdx) 2630; SSE42-NEXT: pextrb $2, %xmm0, 54(%rdx) 2631; SSE42-NEXT: movw %ax, 52(%rdx) 2632; SSE42-NEXT: pextrb $2, %xmm0, 58(%rdx) 2633; SSE42-NEXT: movw %ax, 56(%rdx) 2634; SSE42-NEXT: pextrb $2, %xmm0, 62(%rdx) 2635; SSE42-NEXT: movw %ax, 60(%rdx) 2636; SSE42-NEXT: retq 2637; 2638; AVX1-LABEL: vec384_v3i8: 2639; AVX1: # %bb.0: 2640; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2641; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2642; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 2643; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rsi) 2644; AVX1-NEXT: vmovd %xmm0, %eax 2645; AVX1-NEXT: movw %ax, (%rsi) 2646; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdx) 2647; AVX1-NEXT: movw %ax, (%rdx) 2648; AVX1-NEXT: vpextrb $2, %xmm0, 6(%rdx) 2649; AVX1-NEXT: movw %ax, 4(%rdx) 2650; AVX1-NEXT: vpextrb $2, %xmm0, 10(%rdx) 2651; AVX1-NEXT: movw %ax, 8(%rdx) 2652; AVX1-NEXT: vpextrb $2, %xmm0, 14(%rdx) 2653; AVX1-NEXT: movw %ax, 12(%rdx) 2654; AVX1-NEXT: vpextrb $2, %xmm0, 18(%rdx) 2655; AVX1-NEXT: movw %ax, 16(%rdx) 2656; AVX1-NEXT: vpextrb $2, %xmm0, 22(%rdx) 2657; AVX1-NEXT: movw %ax, 20(%rdx) 2658; AVX1-NEXT: vpextrb $2, %xmm0, 26(%rdx) 2659; AVX1-NEXT: movw %ax, 24(%rdx) 2660; AVX1-NEXT: vpextrb $2, %xmm0, 30(%rdx) 2661; AVX1-NEXT: movw %ax, 28(%rdx) 2662; AVX1-NEXT: vpextrb $2, %xmm0, 34(%rdx) 2663; AVX1-NEXT: movw %ax, 32(%rdx) 2664; AVX1-NEXT: vpextrb $2, %xmm0, 38(%rdx) 2665; AVX1-NEXT: movw %ax, 36(%rdx) 2666; AVX1-NEXT: vpextrb $2, %xmm0, 42(%rdx) 2667; AVX1-NEXT: movw %ax, 40(%rdx) 2668; AVX1-NEXT: vpextrb $2, %xmm0, 46(%rdx) 2669; AVX1-NEXT: movw %ax, 44(%rdx) 2670; AVX1-NEXT: vpextrb $2, %xmm0, 50(%rdx) 2671; AVX1-NEXT: movw %ax, 48(%rdx) 2672; AVX1-NEXT: vpextrb $2, %xmm0, 54(%rdx) 2673; AVX1-NEXT: movw %ax, 52(%rdx) 2674; AVX1-NEXT: vpextrb $2, %xmm0, 58(%rdx) 2675; AVX1-NEXT: movw %ax, 56(%rdx) 2676; AVX1-NEXT: vpextrb $2, %xmm0, 62(%rdx) 2677; AVX1-NEXT: movw %ax, 60(%rdx) 2678; AVX1-NEXT: retq 2679; 2680; AVX2-ONLY-LABEL: vec384_v3i8: 2681; AVX2-ONLY: # %bb.0: 2682; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2683; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2684; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 2685; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 2(%rsi) 2686; AVX2-ONLY-NEXT: vmovd %xmm0, %eax 2687; AVX2-ONLY-NEXT: movw %ax, (%rsi) 2688; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 2(%rdx) 2689; AVX2-ONLY-NEXT: movw %ax, (%rdx) 2690; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 6(%rdx) 2691; AVX2-ONLY-NEXT: movw %ax, 4(%rdx) 2692; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 10(%rdx) 2693; AVX2-ONLY-NEXT: movw %ax, 8(%rdx) 2694; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 14(%rdx) 2695; AVX2-ONLY-NEXT: movw %ax, 12(%rdx) 2696; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 18(%rdx) 2697; AVX2-ONLY-NEXT: movw %ax, 16(%rdx) 2698; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 22(%rdx) 2699; AVX2-ONLY-NEXT: movw %ax, 20(%rdx) 2700; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 26(%rdx) 2701; AVX2-ONLY-NEXT: movw %ax, 24(%rdx) 2702; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 30(%rdx) 2703; AVX2-ONLY-NEXT: movw %ax, 28(%rdx) 2704; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 34(%rdx) 2705; AVX2-ONLY-NEXT: movw %ax, 32(%rdx) 2706; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 38(%rdx) 2707; AVX2-ONLY-NEXT: movw %ax, 36(%rdx) 2708; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 42(%rdx) 2709; AVX2-ONLY-NEXT: movw %ax, 40(%rdx) 2710; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 46(%rdx) 2711; AVX2-ONLY-NEXT: movw %ax, 44(%rdx) 2712; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 50(%rdx) 2713; AVX2-ONLY-NEXT: movw 
%ax, 48(%rdx) 2714; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 54(%rdx) 2715; AVX2-ONLY-NEXT: movw %ax, 52(%rdx) 2716; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 58(%rdx) 2717; AVX2-ONLY-NEXT: movw %ax, 56(%rdx) 2718; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 62(%rdx) 2719; AVX2-ONLY-NEXT: movw %ax, 60(%rdx) 2720; AVX2-ONLY-NEXT: retq 2721; 2722; AVX512-LABEL: vec384_v3i8: 2723; AVX512: # %bb.0: 2724; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2725; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 2726; AVX512-NEXT: vpextrb $2, %xmm0, 2(%rsi) 2727; AVX512-NEXT: vmovd %xmm0, %eax 2728; AVX512-NEXT: movw %ax, (%rsi) 2729; AVX512-NEXT: vpextrb $2, %xmm0, 2(%rdx) 2730; AVX512-NEXT: movw %ax, (%rdx) 2731; AVX512-NEXT: vpextrb $2, %xmm0, 6(%rdx) 2732; AVX512-NEXT: movw %ax, 4(%rdx) 2733; AVX512-NEXT: vpextrb $2, %xmm0, 10(%rdx) 2734; AVX512-NEXT: movw %ax, 8(%rdx) 2735; AVX512-NEXT: vpextrb $2, %xmm0, 14(%rdx) 2736; AVX512-NEXT: movw %ax, 12(%rdx) 2737; AVX512-NEXT: vpextrb $2, %xmm0, 18(%rdx) 2738; AVX512-NEXT: movw %ax, 16(%rdx) 2739; AVX512-NEXT: vpextrb $2, %xmm0, 22(%rdx) 2740; AVX512-NEXT: movw %ax, 20(%rdx) 2741; AVX512-NEXT: vpextrb $2, %xmm0, 26(%rdx) 2742; AVX512-NEXT: movw %ax, 24(%rdx) 2743; AVX512-NEXT: vpextrb $2, %xmm0, 30(%rdx) 2744; AVX512-NEXT: movw %ax, 28(%rdx) 2745; AVX512-NEXT: vpextrb $2, %xmm0, 34(%rdx) 2746; AVX512-NEXT: movw %ax, 32(%rdx) 2747; AVX512-NEXT: vpextrb $2, %xmm0, 38(%rdx) 2748; AVX512-NEXT: movw %ax, 36(%rdx) 2749; AVX512-NEXT: vpextrb $2, %xmm0, 42(%rdx) 2750; AVX512-NEXT: movw %ax, 40(%rdx) 2751; AVX512-NEXT: vpextrb $2, %xmm0, 46(%rdx) 2752; AVX512-NEXT: movw %ax, 44(%rdx) 2753; AVX512-NEXT: vpextrb $2, %xmm0, 50(%rdx) 2754; AVX512-NEXT: movw %ax, 48(%rdx) 2755; AVX512-NEXT: vpextrb $2, %xmm0, 54(%rdx) 2756; AVX512-NEXT: movw %ax, 52(%rdx) 2757; AVX512-NEXT: vpextrb $2, %xmm0, 58(%rdx) 2758; AVX512-NEXT: movw %ax, 56(%rdx) 2759; AVX512-NEXT: vpextrb $2, %xmm0, 62(%rdx) 2760; AVX512-NEXT: movw %ax, 60(%rdx) 2761; AVX512-NEXT: retq 2762 %in.subvec.not = load <3 x i8>, ptr %in.subvec.ptr, align 64 2763 %in.subvec = xor <3 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1> 2764 store <3 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 2765 %out.subvec0.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 0 2766 store <3 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 2767 %out.subvec1.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 1 2768 store <3 x i8> %in.subvec, ptr %out.subvec1.ptr, align 1 2769 %out.subvec2.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 2 2770 store <3 x i8> %in.subvec, ptr %out.subvec2.ptr, align 2 2771 %out.subvec3.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 3 2772 store <3 x i8> %in.subvec, ptr %out.subvec3.ptr, align 1 2773 %out.subvec4.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 4 2774 store <3 x i8> %in.subvec, ptr %out.subvec4.ptr, align 4 2775 %out.subvec5.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 5 2776 store <3 x i8> %in.subvec, ptr %out.subvec5.ptr, align 1 2777 %out.subvec6.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 6 2778 store <3 x i8> %in.subvec, ptr %out.subvec6.ptr, align 2 2779 %out.subvec7.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 7 2780 store <3 x i8> %in.subvec, ptr %out.subvec7.ptr, align 1 2781 %out.subvec8.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 8 2782 store <3 x i8> %in.subvec, ptr %out.subvec8.ptr, align 8 2783 %out.subvec9.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 9 2784 store <3 x i8> %in.subvec, ptr %out.subvec9.ptr, align 1 2785 %out.subvec10.ptr = 
getelementptr <3 x i8>, ptr %out.vec.ptr, i64 10 2786 store <3 x i8> %in.subvec, ptr %out.subvec10.ptr, align 2 2787 %out.subvec11.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 11 2788 store <3 x i8> %in.subvec, ptr %out.subvec11.ptr, align 1 2789 %out.subvec12.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 12 2790 store <3 x i8> %in.subvec, ptr %out.subvec12.ptr, align 4 2791 %out.subvec13.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 13 2792 store <3 x i8> %in.subvec, ptr %out.subvec13.ptr, align 1 2793 %out.subvec14.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 14 2794 store <3 x i8> %in.subvec, ptr %out.subvec14.ptr, align 2 2795 %out.subvec15.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 15 2796 store <3 x i8> %in.subvec, ptr %out.subvec15.ptr, align 1 2797 ret void 2798} 2799 2800define void @vec384_v3i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 2801; SCALAR-LABEL: vec384_v3i16: 2802; SCALAR: # %bb.0: 2803; SCALAR-NEXT: movq (%rdi), %rax 2804; SCALAR-NEXT: movq %rax, %rcx 2805; SCALAR-NEXT: shrq $32, %rcx 2806; SCALAR-NEXT: notl %ecx 2807; SCALAR-NEXT: notl %eax 2808; SCALAR-NEXT: movl %eax, (%rsi) 2809; SCALAR-NEXT: movw %cx, 4(%rsi) 2810; SCALAR-NEXT: movw %cx, 4(%rdx) 2811; SCALAR-NEXT: movl %eax, (%rdx) 2812; SCALAR-NEXT: movw %cx, 12(%rdx) 2813; SCALAR-NEXT: movl %eax, 8(%rdx) 2814; SCALAR-NEXT: movw %cx, 20(%rdx) 2815; SCALAR-NEXT: movl %eax, 16(%rdx) 2816; SCALAR-NEXT: movw %cx, 28(%rdx) 2817; SCALAR-NEXT: movl %eax, 24(%rdx) 2818; SCALAR-NEXT: movw %cx, 36(%rdx) 2819; SCALAR-NEXT: movl %eax, 32(%rdx) 2820; SCALAR-NEXT: movw %cx, 44(%rdx) 2821; SCALAR-NEXT: movl %eax, 40(%rdx) 2822; SCALAR-NEXT: movw %cx, 52(%rdx) 2823; SCALAR-NEXT: movl %eax, 48(%rdx) 2824; SCALAR-NEXT: movw %cx, 60(%rdx) 2825; SCALAR-NEXT: movl %eax, 56(%rdx) 2826; SCALAR-NEXT: retq 2827; 2828; SSE2-ONLY-LABEL: vec384_v3i16: 2829; SSE2-ONLY: # %bb.0: 2830; SSE2-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2831; SSE2-ONLY-NEXT: pcmpeqd %xmm1, %xmm1 2832; SSE2-ONLY-NEXT: pxor %xmm0, %xmm1 2833; SSE2-ONLY-NEXT: movd %xmm1, (%rsi) 2834; SSE2-ONLY-NEXT: pextrw $2, %xmm1, %eax 2835; SSE2-ONLY-NEXT: movw %ax, 4(%rsi) 2836; SSE2-ONLY-NEXT: movw %ax, 4(%rdx) 2837; SSE2-ONLY-NEXT: movd %xmm1, (%rdx) 2838; SSE2-ONLY-NEXT: movw %ax, 12(%rdx) 2839; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx) 2840; SSE2-ONLY-NEXT: movw %ax, 20(%rdx) 2841; SSE2-ONLY-NEXT: movd %xmm1, 16(%rdx) 2842; SSE2-ONLY-NEXT: movw %ax, 28(%rdx) 2843; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx) 2844; SSE2-ONLY-NEXT: movw %ax, 36(%rdx) 2845; SSE2-ONLY-NEXT: movd %xmm1, 32(%rdx) 2846; SSE2-ONLY-NEXT: movw %ax, 44(%rdx) 2847; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx) 2848; SSE2-ONLY-NEXT: movw %ax, 52(%rdx) 2849; SSE2-ONLY-NEXT: movd %xmm1, 48(%rdx) 2850; SSE2-ONLY-NEXT: movw %ax, 60(%rdx) 2851; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx) 2852; SSE2-ONLY-NEXT: retq 2853; 2854; SSE3-LABEL: vec384_v3i16: 2855; SSE3: # %bb.0: 2856; SSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2857; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 2858; SSE3-NEXT: pxor %xmm0, %xmm1 2859; SSE3-NEXT: movd %xmm1, (%rsi) 2860; SSE3-NEXT: pextrw $2, %xmm1, %eax 2861; SSE3-NEXT: movw %ax, 4(%rsi) 2862; SSE3-NEXT: movw %ax, 4(%rdx) 2863; SSE3-NEXT: movd %xmm1, (%rdx) 2864; SSE3-NEXT: movw %ax, 12(%rdx) 2865; SSE3-NEXT: movd %xmm1, 8(%rdx) 2866; SSE3-NEXT: movw %ax, 20(%rdx) 2867; SSE3-NEXT: movd %xmm1, 16(%rdx) 2868; SSE3-NEXT: movw %ax, 28(%rdx) 2869; SSE3-NEXT: movd %xmm1, 24(%rdx) 2870; SSE3-NEXT: movw %ax, 36(%rdx) 2871; SSE3-NEXT: movd %xmm1, 32(%rdx) 2872; 
SSE3-NEXT: movw %ax, 44(%rdx) 2873; SSE3-NEXT: movd %xmm1, 40(%rdx) 2874; SSE3-NEXT: movw %ax, 52(%rdx) 2875; SSE3-NEXT: movd %xmm1, 48(%rdx) 2876; SSE3-NEXT: movw %ax, 60(%rdx) 2877; SSE3-NEXT: movd %xmm1, 56(%rdx) 2878; SSE3-NEXT: retq 2879; 2880; SSSE3-ONLY-LABEL: vec384_v3i16: 2881; SSSE3-ONLY: # %bb.0: 2882; SSSE3-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2883; SSSE3-ONLY-NEXT: pcmpeqd %xmm1, %xmm1 2884; SSSE3-ONLY-NEXT: pxor %xmm0, %xmm1 2885; SSSE3-ONLY-NEXT: movd %xmm1, (%rsi) 2886; SSSE3-ONLY-NEXT: pextrw $2, %xmm1, %eax 2887; SSSE3-ONLY-NEXT: movw %ax, 4(%rsi) 2888; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx) 2889; SSSE3-ONLY-NEXT: movd %xmm1, (%rdx) 2890; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx) 2891; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx) 2892; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx) 2893; SSSE3-ONLY-NEXT: movd %xmm1, 16(%rdx) 2894; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx) 2895; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx) 2896; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx) 2897; SSSE3-ONLY-NEXT: movd %xmm1, 32(%rdx) 2898; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx) 2899; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx) 2900; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx) 2901; SSSE3-ONLY-NEXT: movd %xmm1, 48(%rdx) 2902; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx) 2903; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx) 2904; SSSE3-ONLY-NEXT: retq 2905; 2906; SSE41-LABEL: vec384_v3i16: 2907; SSE41: # %bb.0: 2908; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2909; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 2910; SSE41-NEXT: pxor %xmm0, %xmm1 2911; SSE41-NEXT: pextrw $2, %xmm1, 4(%rsi) 2912; SSE41-NEXT: movd %xmm1, (%rsi) 2913; SSE41-NEXT: pextrw $2, %xmm1, 4(%rdx) 2914; SSE41-NEXT: movd %xmm1, (%rdx) 2915; SSE41-NEXT: pextrw $2, %xmm1, 12(%rdx) 2916; SSE41-NEXT: movd %xmm1, 8(%rdx) 2917; SSE41-NEXT: pextrw $2, %xmm1, 20(%rdx) 2918; SSE41-NEXT: movd %xmm1, 16(%rdx) 2919; SSE41-NEXT: pextrw $2, %xmm1, 28(%rdx) 2920; SSE41-NEXT: movd %xmm1, 24(%rdx) 2921; SSE41-NEXT: pextrw $2, %xmm1, 36(%rdx) 2922; SSE41-NEXT: movd %xmm1, 32(%rdx) 2923; SSE41-NEXT: pextrw $2, %xmm1, 44(%rdx) 2924; SSE41-NEXT: movd %xmm1, 40(%rdx) 2925; SSE41-NEXT: pextrw $2, %xmm1, 52(%rdx) 2926; SSE41-NEXT: movd %xmm1, 48(%rdx) 2927; SSE41-NEXT: pextrw $2, %xmm1, 60(%rdx) 2928; SSE41-NEXT: movd %xmm1, 56(%rdx) 2929; SSE41-NEXT: retq 2930; 2931; SSE42-LABEL: vec384_v3i16: 2932; SSE42: # %bb.0: 2933; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2934; SSE42-NEXT: pcmpeqd %xmm1, %xmm1 2935; SSE42-NEXT: pxor %xmm0, %xmm1 2936; SSE42-NEXT: pextrw $2, %xmm1, 4(%rsi) 2937; SSE42-NEXT: movd %xmm1, (%rsi) 2938; SSE42-NEXT: pextrw $2, %xmm1, 4(%rdx) 2939; SSE42-NEXT: movd %xmm1, (%rdx) 2940; SSE42-NEXT: pextrw $2, %xmm1, 12(%rdx) 2941; SSE42-NEXT: movd %xmm1, 8(%rdx) 2942; SSE42-NEXT: pextrw $2, %xmm1, 20(%rdx) 2943; SSE42-NEXT: movd %xmm1, 16(%rdx) 2944; SSE42-NEXT: pextrw $2, %xmm1, 28(%rdx) 2945; SSE42-NEXT: movd %xmm1, 24(%rdx) 2946; SSE42-NEXT: pextrw $2, %xmm1, 36(%rdx) 2947; SSE42-NEXT: movd %xmm1, 32(%rdx) 2948; SSE42-NEXT: pextrw $2, %xmm1, 44(%rdx) 2949; SSE42-NEXT: movd %xmm1, 40(%rdx) 2950; SSE42-NEXT: pextrw $2, %xmm1, 52(%rdx) 2951; SSE42-NEXT: movd %xmm1, 48(%rdx) 2952; SSE42-NEXT: pextrw $2, %xmm1, 60(%rdx) 2953; SSE42-NEXT: movd %xmm1, 56(%rdx) 2954; SSE42-NEXT: retq 2955; 2956; AVX1-LABEL: vec384_v3i16: 2957; AVX1: # %bb.0: 2958; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 2959; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2960; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 2961; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rsi) 2962; AVX1-NEXT: vmovd %xmm0, (%rsi) 2963; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdx) 2964; AVX1-NEXT: 
vmovd %xmm0, (%rdx) 2965; AVX1-NEXT: vpextrw $2, %xmm0, 12(%rdx) 2966; AVX1-NEXT: vmovd %xmm0, 8(%rdx) 2967; AVX1-NEXT: vpextrw $2, %xmm0, 20(%rdx) 2968; AVX1-NEXT: vmovd %xmm0, 16(%rdx) 2969; AVX1-NEXT: vpextrw $2, %xmm0, 28(%rdx) 2970; AVX1-NEXT: vmovd %xmm0, 24(%rdx) 2971; AVX1-NEXT: vpextrw $2, %xmm0, 36(%rdx) 2972; AVX1-NEXT: vmovd %xmm0, 32(%rdx) 2973; AVX1-NEXT: vpextrw $2, %xmm0, 44(%rdx) 2974; AVX1-NEXT: vmovd %xmm0, 40(%rdx) 2975; AVX1-NEXT: vpextrw $2, %xmm0, 52(%rdx) 2976; AVX1-NEXT: vmovd %xmm0, 48(%rdx) 2977; AVX1-NEXT: vpextrw $2, %xmm0, 60(%rdx) 2978; AVX1-NEXT: vmovd %xmm0, 56(%rdx) 2979; AVX1-NEXT: retq 2980; 2981; AVX2-ONLY-LABEL: vec384_v3i16: 2982; AVX2-ONLY: # %bb.0: 2983; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 2984; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2985; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 2986; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 4(%rsi) 2987; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi) 2988; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 4(%rdx) 2989; AVX2-ONLY-NEXT: vmovd %xmm0, (%rdx) 2990; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 12(%rdx) 2991; AVX2-ONLY-NEXT: vmovd %xmm0, 8(%rdx) 2992; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 20(%rdx) 2993; AVX2-ONLY-NEXT: vmovd %xmm0, 16(%rdx) 2994; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 28(%rdx) 2995; AVX2-ONLY-NEXT: vmovd %xmm0, 24(%rdx) 2996; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 36(%rdx) 2997; AVX2-ONLY-NEXT: vmovd %xmm0, 32(%rdx) 2998; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 44(%rdx) 2999; AVX2-ONLY-NEXT: vmovd %xmm0, 40(%rdx) 3000; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 52(%rdx) 3001; AVX2-ONLY-NEXT: vmovd %xmm0, 48(%rdx) 3002; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 60(%rdx) 3003; AVX2-ONLY-NEXT: vmovd %xmm0, 56(%rdx) 3004; AVX2-ONLY-NEXT: retq 3005; 3006; AVX512-LABEL: vec384_v3i16: 3007; AVX512: # %bb.0: 3008; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 3009; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 3010; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi) 3011; AVX512-NEXT: vmovd %xmm0, (%rsi) 3012; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx) 3013; AVX512-NEXT: vmovd %xmm0, (%rdx) 3014; AVX512-NEXT: vpextrw $2, %xmm0, 12(%rdx) 3015; AVX512-NEXT: vmovd %xmm0, 8(%rdx) 3016; AVX512-NEXT: vpextrw $2, %xmm0, 20(%rdx) 3017; AVX512-NEXT: vmovd %xmm0, 16(%rdx) 3018; AVX512-NEXT: vpextrw $2, %xmm0, 28(%rdx) 3019; AVX512-NEXT: vmovd %xmm0, 24(%rdx) 3020; AVX512-NEXT: vpextrw $2, %xmm0, 36(%rdx) 3021; AVX512-NEXT: vmovd %xmm0, 32(%rdx) 3022; AVX512-NEXT: vpextrw $2, %xmm0, 44(%rdx) 3023; AVX512-NEXT: vmovd %xmm0, 40(%rdx) 3024; AVX512-NEXT: vpextrw $2, %xmm0, 52(%rdx) 3025; AVX512-NEXT: vmovd %xmm0, 48(%rdx) 3026; AVX512-NEXT: vpextrw $2, %xmm0, 60(%rdx) 3027; AVX512-NEXT: vmovd %xmm0, 56(%rdx) 3028; AVX512-NEXT: retq 3029 %in.subvec.not = load <3 x i16>, ptr %in.subvec.ptr, align 64 3030 %in.subvec = xor <3 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1> 3031 store <3 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 3032 %out.subvec0.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 0 3033 store <3 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 3034 %out.subvec1.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 1 3035 store <3 x i16> %in.subvec, ptr %out.subvec1.ptr, align 2 3036 %out.subvec2.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 2 3037 store <3 x i16> %in.subvec, ptr %out.subvec2.ptr, align 4 3038 %out.subvec3.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 3 3039 store <3 x i16> %in.subvec, ptr %out.subvec3.ptr, align 2 3040 %out.subvec4.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 4 3041 store <3 x i16> %in.subvec, ptr 
%out.subvec4.ptr, align 8 3042 %out.subvec5.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 5 3043 store <3 x i16> %in.subvec, ptr %out.subvec5.ptr, align 2 3044 %out.subvec6.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 6 3045 store <3 x i16> %in.subvec, ptr %out.subvec6.ptr, align 4 3046 %out.subvec7.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 7 3047 store <3 x i16> %in.subvec, ptr %out.subvec7.ptr, align 2 3048 ret void 3049} 3050 3051define void @vec384_v3i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 3052; SCALAR-LABEL: vec384_v3i32: 3053; SCALAR: # %bb.0: 3054; SCALAR-NEXT: movl 8(%rdi), %eax 3055; SCALAR-NEXT: movq (%rdi), %rcx 3056; SCALAR-NEXT: notq %rcx 3057; SCALAR-NEXT: notl %eax 3058; SCALAR-NEXT: movl %eax, 8(%rsi) 3059; SCALAR-NEXT: movq %rcx, (%rsi) 3060; SCALAR-NEXT: movl %eax, 8(%rdx) 3061; SCALAR-NEXT: movq %rcx, (%rdx) 3062; SCALAR-NEXT: movl %eax, 24(%rdx) 3063; SCALAR-NEXT: movq %rcx, 16(%rdx) 3064; SCALAR-NEXT: movl %eax, 40(%rdx) 3065; SCALAR-NEXT: movq %rcx, 32(%rdx) 3066; SCALAR-NEXT: movl %eax, 56(%rdx) 3067; SCALAR-NEXT: movq %rcx, 48(%rdx) 3068; SCALAR-NEXT: retq 3069; 3070; SSE2-ONLY-LABEL: vec384_v3i32: 3071; SSE2-ONLY: # %bb.0: 3072; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 3073; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0 3074; SSE2-ONLY-NEXT: movq %xmm0, (%rsi) 3075; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 3076; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi) 3077; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx) 3078; SSE2-ONLY-NEXT: movq %xmm0, (%rdx) 3079; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx) 3080; SSE2-ONLY-NEXT: movq %xmm0, 16(%rdx) 3081; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx) 3082; SSE2-ONLY-NEXT: movq %xmm0, 32(%rdx) 3083; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx) 3084; SSE2-ONLY-NEXT: movq %xmm0, 48(%rdx) 3085; SSE2-ONLY-NEXT: retq 3086; 3087; SSE3-LABEL: vec384_v3i32: 3088; SSE3: # %bb.0: 3089; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 3090; SSE3-NEXT: pxor (%rdi), %xmm0 3091; SSE3-NEXT: movq %xmm0, (%rsi) 3092; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 3093; SSE3-NEXT: movd %xmm1, 8(%rsi) 3094; SSE3-NEXT: movd %xmm1, 8(%rdx) 3095; SSE3-NEXT: movq %xmm0, (%rdx) 3096; SSE3-NEXT: movd %xmm1, 24(%rdx) 3097; SSE3-NEXT: movq %xmm0, 16(%rdx) 3098; SSE3-NEXT: movd %xmm1, 40(%rdx) 3099; SSE3-NEXT: movq %xmm0, 32(%rdx) 3100; SSE3-NEXT: movd %xmm1, 56(%rdx) 3101; SSE3-NEXT: movq %xmm0, 48(%rdx) 3102; SSE3-NEXT: retq 3103; 3104; SSSE3-ONLY-LABEL: vec384_v3i32: 3105; SSSE3-ONLY: # %bb.0: 3106; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 3107; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0 3108; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi) 3109; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 3110; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi) 3111; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx) 3112; SSSE3-ONLY-NEXT: movq %xmm0, (%rdx) 3113; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx) 3114; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rdx) 3115; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx) 3116; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rdx) 3117; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx) 3118; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rdx) 3119; SSSE3-ONLY-NEXT: retq 3120; 3121; SSE41-LABEL: vec384_v3i32: 3122; SSE41: # %bb.0: 3123; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 3124; SSE41-NEXT: pxor (%rdi), %xmm0 3125; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi) 3126; SSE41-NEXT: movq %xmm0, (%rsi) 3127; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdx) 3128; SSE41-NEXT: movq %xmm0, (%rdx) 3129; SSE41-NEXT: pextrd $2, %xmm0, 24(%rdx) 3130; SSE41-NEXT: movq %xmm0, 16(%rdx) 3131; SSE41-NEXT: pextrd $2, %xmm0, 40(%rdx) 3132; SSE41-NEXT: movq %xmm0, 32(%rdx) 3133; 
SSE41-NEXT: pextrd $2, %xmm0, 56(%rdx) 3134; SSE41-NEXT: movq %xmm0, 48(%rdx) 3135; SSE41-NEXT: retq 3136; 3137; SSE42-LABEL: vec384_v3i32: 3138; SSE42: # %bb.0: 3139; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 3140; SSE42-NEXT: pxor (%rdi), %xmm0 3141; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi) 3142; SSE42-NEXT: movq %xmm0, (%rsi) 3143; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdx) 3144; SSE42-NEXT: movq %xmm0, (%rdx) 3145; SSE42-NEXT: pextrd $2, %xmm0, 24(%rdx) 3146; SSE42-NEXT: movq %xmm0, 16(%rdx) 3147; SSE42-NEXT: pextrd $2, %xmm0, 40(%rdx) 3148; SSE42-NEXT: movq %xmm0, 32(%rdx) 3149; SSE42-NEXT: pextrd $2, %xmm0, 56(%rdx) 3150; SSE42-NEXT: movq %xmm0, 48(%rdx) 3151; SSE42-NEXT: retq 3152; 3153; AVX-LABEL: vec384_v3i32: 3154; AVX: # %bb.0: 3155; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 3156; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 3157; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi) 3158; AVX-NEXT: vmovq %xmm0, (%rsi) 3159; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdx) 3160; AVX-NEXT: vmovq %xmm0, (%rdx) 3161; AVX-NEXT: vpextrd $2, %xmm0, 24(%rdx) 3162; AVX-NEXT: vmovq %xmm0, 16(%rdx) 3163; AVX-NEXT: vpextrd $2, %xmm0, 40(%rdx) 3164; AVX-NEXT: vmovq %xmm0, 32(%rdx) 3165; AVX-NEXT: vpextrd $2, %xmm0, 56(%rdx) 3166; AVX-NEXT: vmovq %xmm0, 48(%rdx) 3167; AVX-NEXT: retq 3168 %in.subvec.not = load <3 x i32>, ptr %in.subvec.ptr, align 64 3169 %in.subvec = xor <3 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1> 3170 store <3 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 3171 %out.subvec0.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 0 3172 store <3 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 3173 %out.subvec1.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 1 3174 store <3 x i32> %in.subvec, ptr %out.subvec1.ptr, align 4 3175 %out.subvec2.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 2 3176 store <3 x i32> %in.subvec, ptr %out.subvec2.ptr, align 8 3177 %out.subvec3.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 3 3178 store <3 x i32> %in.subvec, ptr %out.subvec3.ptr, align 4 3179 ret void 3180} 3181 3182define void @vec384_v3f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 3183; SCALAR-LABEL: vec384_v3f32: 3184; SCALAR: # %bb.0: 3185; SCALAR-NEXT: movl 8(%rdi), %eax 3186; SCALAR-NEXT: movq (%rdi), %rcx 3187; SCALAR-NEXT: notq %rcx 3188; SCALAR-NEXT: notl %eax 3189; SCALAR-NEXT: movl %eax, 8(%rsi) 3190; SCALAR-NEXT: movq %rcx, (%rsi) 3191; SCALAR-NEXT: movl %eax, 8(%rdx) 3192; SCALAR-NEXT: movq %rcx, (%rdx) 3193; SCALAR-NEXT: movl %eax, 24(%rdx) 3194; SCALAR-NEXT: movq %rcx, 16(%rdx) 3195; SCALAR-NEXT: movl %eax, 40(%rdx) 3196; SCALAR-NEXT: movq %rcx, 32(%rdx) 3197; SCALAR-NEXT: movl %eax, 56(%rdx) 3198; SCALAR-NEXT: movq %rcx, 48(%rdx) 3199; SCALAR-NEXT: retq 3200; 3201; SSE2-ONLY-LABEL: vec384_v3f32: 3202; SSE2-ONLY: # %bb.0: 3203; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 3204; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0 3205; SSE2-ONLY-NEXT: movq %xmm0, (%rsi) 3206; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 3207; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi) 3208; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx) 3209; SSE2-ONLY-NEXT: movq %xmm0, (%rdx) 3210; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx) 3211; SSE2-ONLY-NEXT: movq %xmm0, 16(%rdx) 3212; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx) 3213; SSE2-ONLY-NEXT: movq %xmm0, 32(%rdx) 3214; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx) 3215; SSE2-ONLY-NEXT: movq %xmm0, 48(%rdx) 3216; SSE2-ONLY-NEXT: retq 3217; 3218; SSE3-LABEL: vec384_v3f32: 3219; SSE3: # %bb.0: 3220; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 3221; SSE3-NEXT: pxor (%rdi), %xmm0 3222; SSE3-NEXT: movq %xmm0, (%rsi) 3223; 
SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 3224; SSE3-NEXT: movd %xmm1, 8(%rsi) 3225; SSE3-NEXT: movd %xmm1, 8(%rdx) 3226; SSE3-NEXT: movq %xmm0, (%rdx) 3227; SSE3-NEXT: movd %xmm1, 24(%rdx) 3228; SSE3-NEXT: movq %xmm0, 16(%rdx) 3229; SSE3-NEXT: movd %xmm1, 40(%rdx) 3230; SSE3-NEXT: movq %xmm0, 32(%rdx) 3231; SSE3-NEXT: movd %xmm1, 56(%rdx) 3232; SSE3-NEXT: movq %xmm0, 48(%rdx) 3233; SSE3-NEXT: retq 3234; 3235; SSSE3-ONLY-LABEL: vec384_v3f32: 3236; SSSE3-ONLY: # %bb.0: 3237; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 3238; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0 3239; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi) 3240; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 3241; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi) 3242; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx) 3243; SSSE3-ONLY-NEXT: movq %xmm0, (%rdx) 3244; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx) 3245; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rdx) 3246; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx) 3247; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rdx) 3248; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx) 3249; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rdx) 3250; SSSE3-ONLY-NEXT: retq 3251; 3252; SSE41-LABEL: vec384_v3f32: 3253; SSE41: # %bb.0: 3254; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 3255; SSE41-NEXT: pxor (%rdi), %xmm0 3256; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi) 3257; SSE41-NEXT: movq %xmm0, (%rsi) 3258; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdx) 3259; SSE41-NEXT: movq %xmm0, (%rdx) 3260; SSE41-NEXT: pextrd $2, %xmm0, 24(%rdx) 3261; SSE41-NEXT: movq %xmm0, 16(%rdx) 3262; SSE41-NEXT: pextrd $2, %xmm0, 40(%rdx) 3263; SSE41-NEXT: movq %xmm0, 32(%rdx) 3264; SSE41-NEXT: pextrd $2, %xmm0, 56(%rdx) 3265; SSE41-NEXT: movq %xmm0, 48(%rdx) 3266; SSE41-NEXT: retq 3267; 3268; SSE42-LABEL: vec384_v3f32: 3269; SSE42: # %bb.0: 3270; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 3271; SSE42-NEXT: pxor (%rdi), %xmm0 3272; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi) 3273; SSE42-NEXT: movq %xmm0, (%rsi) 3274; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdx) 3275; SSE42-NEXT: movq %xmm0, (%rdx) 3276; SSE42-NEXT: pextrd $2, %xmm0, 24(%rdx) 3277; SSE42-NEXT: movq %xmm0, 16(%rdx) 3278; SSE42-NEXT: pextrd $2, %xmm0, 40(%rdx) 3279; SSE42-NEXT: movq %xmm0, 32(%rdx) 3280; SSE42-NEXT: pextrd $2, %xmm0, 56(%rdx) 3281; SSE42-NEXT: movq %xmm0, 48(%rdx) 3282; SSE42-NEXT: retq 3283; 3284; AVX-LABEL: vec384_v3f32: 3285; AVX: # %bb.0: 3286; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 3287; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 3288; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi) 3289; AVX-NEXT: vmovq %xmm0, (%rsi) 3290; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdx) 3291; AVX-NEXT: vmovq %xmm0, (%rdx) 3292; AVX-NEXT: vpextrd $2, %xmm0, 24(%rdx) 3293; AVX-NEXT: vmovq %xmm0, 16(%rdx) 3294; AVX-NEXT: vpextrd $2, %xmm0, 40(%rdx) 3295; AVX-NEXT: vmovq %xmm0, 32(%rdx) 3296; AVX-NEXT: vpextrd $2, %xmm0, 56(%rdx) 3297; AVX-NEXT: vmovq %xmm0, 48(%rdx) 3298; AVX-NEXT: retq 3299 %in.subvec.not = load <3 x i32>, ptr %in.subvec.ptr, align 64 3300 %in.subvec.int = xor <3 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1> 3301 %in.subvec = bitcast <3 x i32> %in.subvec.int to <3 x float> 3302 store <3 x float> %in.subvec, ptr %out.subvec.ptr, align 64 3303 %out.subvec0.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 0 3304 store <3 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 3305 %out.subvec1.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 1 3306 store <3 x float> %in.subvec, ptr %out.subvec1.ptr, align 4 3307 %out.subvec2.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 2 3308 store <3 x float> %in.subvec, ptr %out.subvec2.ptr, align 8 3309 %out.subvec3.ptr = getelementptr <3 x float>, 
ptr %out.vec.ptr, i64 3 3310 store <3 x float> %in.subvec, ptr %out.subvec3.ptr, align 4 3311 ret void 3312} 3313 3314define void @vec384_v3i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 3315; SCALAR-LABEL: vec384_v3i64: 3316; SCALAR: # %bb.0: 3317; SCALAR-NEXT: movq (%rdi), %rax 3318; SCALAR-NEXT: movq 8(%rdi), %rcx 3319; SCALAR-NEXT: movq 16(%rdi), %rdi 3320; SCALAR-NEXT: notq %rdi 3321; SCALAR-NEXT: notq %rcx 3322; SCALAR-NEXT: notq %rax 3323; SCALAR-NEXT: movq %rax, (%rsi) 3324; SCALAR-NEXT: movq %rcx, 8(%rsi) 3325; SCALAR-NEXT: movq %rdi, 16(%rsi) 3326; SCALAR-NEXT: movq %rax, (%rdx) 3327; SCALAR-NEXT: movq %rcx, 8(%rdx) 3328; SCALAR-NEXT: movq %rdi, 16(%rdx) 3329; SCALAR-NEXT: movq %rdi, 48(%rdx) 3330; SCALAR-NEXT: movq %rcx, 40(%rdx) 3331; SCALAR-NEXT: movq %rax, 32(%rdx) 3332; SCALAR-NEXT: retq 3333; 3334; SSE2-LABEL: vec384_v3i64: 3335; SSE2: # %bb.0: 3336; SSE2-NEXT: movq 16(%rdi), %rax 3337; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 3338; SSE2-NEXT: pxor (%rdi), %xmm0 3339; SSE2-NEXT: movdqa %xmm0, (%rsi) 3340; SSE2-NEXT: notq %rax 3341; SSE2-NEXT: movq %rax, 16(%rsi) 3342; SSE2-NEXT: movq %rax, 16(%rdx) 3343; SSE2-NEXT: movdqa %xmm0, (%rdx) 3344; SSE2-NEXT: movq %rax, 48(%rdx) 3345; SSE2-NEXT: movdqu %xmm0, 32(%rdx) 3346; SSE2-NEXT: retq 3347; 3348; AVX1-LABEL: vec384_v3i64: 3349; AVX1: # %bb.0: 3350; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 3351; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 3352; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 3353; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3354; AVX1-NEXT: vmovlps %xmm1, 16(%rsi) 3355; AVX1-NEXT: vmovaps %xmm0, (%rsi) 3356; AVX1-NEXT: vmovlps %xmm1, 16(%rdx) 3357; AVX1-NEXT: vmovaps %xmm0, (%rdx) 3358; AVX1-NEXT: vmovlps %xmm1, 48(%rdx) 3359; AVX1-NEXT: vmovups %xmm0, 32(%rdx) 3360; AVX1-NEXT: vzeroupper 3361; AVX1-NEXT: retq 3362; 3363; AVX2-LABEL: vec384_v3i64: 3364; AVX2: # %bb.0: 3365; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 3366; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 3367; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3368; AVX2-NEXT: vmovq %xmm1, 16(%rsi) 3369; AVX2-NEXT: vmovdqa %xmm0, (%rsi) 3370; AVX2-NEXT: vmovq %xmm1, 16(%rdx) 3371; AVX2-NEXT: vmovdqa %xmm0, (%rdx) 3372; AVX2-NEXT: vmovq %xmm1, 48(%rdx) 3373; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx) 3374; AVX2-NEXT: vzeroupper 3375; AVX2-NEXT: retq 3376 %in.subvec.not = load <3 x i64>, ptr %in.subvec.ptr, align 64 3377 %in.subvec = xor <3 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1> 3378 store <3 x i64> %in.subvec, ptr %out.subvec.ptr, align 64 3379 %out.subvec0.ptr = getelementptr <3 x i64>, ptr %out.vec.ptr, i64 0 3380 store <3 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64 3381 %out.subvec1.ptr = getelementptr <3 x i64>, ptr %out.vec.ptr, i64 1 3382 store <3 x i64> %in.subvec, ptr %out.subvec1.ptr, align 8 3383 ret void 3384} 3385 3386define void @vec384_v3f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 3387; SCALAR-LABEL: vec384_v3f64: 3388; SCALAR: # %bb.0: 3389; SCALAR-NEXT: movq (%rdi), %rax 3390; SCALAR-NEXT: movq 8(%rdi), %rcx 3391; SCALAR-NEXT: movq 16(%rdi), %rdi 3392; SCALAR-NEXT: notq %rdi 3393; SCALAR-NEXT: notq %rcx 3394; SCALAR-NEXT: notq %rax 3395; SCALAR-NEXT: movq %rax, (%rsi) 3396; SCALAR-NEXT: movq %rcx, 8(%rsi) 3397; SCALAR-NEXT: movq %rdi, 16(%rsi) 3398; SCALAR-NEXT: movq %rax, (%rdx) 3399; SCALAR-NEXT: movq %rcx, 8(%rdx) 3400; SCALAR-NEXT: movq %rdi, 16(%rdx) 3401; SCALAR-NEXT: movq %rdi, 48(%rdx) 3402; SCALAR-NEXT: movq %rcx, 40(%rdx) 3403; SCALAR-NEXT: movq %rax, 32(%rdx) 3404; SCALAR-NEXT: retq 3405; 3406; 
SSE2-LABEL: vec384_v3f64: 3407; SSE2: # %bb.0: 3408; SSE2-NEXT: movq 16(%rdi), %rax 3409; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 3410; SSE2-NEXT: pxor (%rdi), %xmm0 3411; SSE2-NEXT: movdqa %xmm0, (%rsi) 3412; SSE2-NEXT: notq %rax 3413; SSE2-NEXT: movq %rax, 16(%rsi) 3414; SSE2-NEXT: movq %rax, 16(%rdx) 3415; SSE2-NEXT: movdqa %xmm0, (%rdx) 3416; SSE2-NEXT: movq %rax, 48(%rdx) 3417; SSE2-NEXT: movdqu %xmm0, 32(%rdx) 3418; SSE2-NEXT: retq 3419; 3420; AVX1-LABEL: vec384_v3f64: 3421; AVX1: # %bb.0: 3422; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 3423; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 3424; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 3425; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3426; AVX1-NEXT: vmovlps %xmm1, 16(%rsi) 3427; AVX1-NEXT: vmovaps %xmm0, (%rsi) 3428; AVX1-NEXT: vmovlps %xmm1, 16(%rdx) 3429; AVX1-NEXT: vmovaps %xmm0, (%rdx) 3430; AVX1-NEXT: vmovlps %xmm1, 48(%rdx) 3431; AVX1-NEXT: vmovups %xmm0, 32(%rdx) 3432; AVX1-NEXT: vzeroupper 3433; AVX1-NEXT: retq 3434; 3435; AVX2-LABEL: vec384_v3f64: 3436; AVX2: # %bb.0: 3437; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 3438; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 3439; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3440; AVX2-NEXT: vmovq %xmm1, 16(%rsi) 3441; AVX2-NEXT: vmovdqa %xmm0, (%rsi) 3442; AVX2-NEXT: vmovq %xmm1, 16(%rdx) 3443; AVX2-NEXT: vmovdqa %xmm0, (%rdx) 3444; AVX2-NEXT: vmovq %xmm1, 48(%rdx) 3445; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx) 3446; AVX2-NEXT: vzeroupper 3447; AVX2-NEXT: retq 3448 %in.subvec.not = load <3 x i64>, ptr %in.subvec.ptr, align 64 3449 %in.subvec.int = xor <3 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1> 3450 %in.subvec = bitcast <3 x i64> %in.subvec.int to <3 x double> 3451 store <3 x double> %in.subvec, ptr %out.subvec.ptr, align 64 3452 %out.subvec0.ptr = getelementptr <3 x double>, ptr %out.vec.ptr, i64 0 3453 store <3 x double> %in.subvec, ptr %out.subvec0.ptr, align 64 3454 %out.subvec1.ptr = getelementptr <3 x double>, ptr %out.vec.ptr, i64 1 3455 store <3 x double> %in.subvec, ptr %out.subvec1.ptr, align 8 3456 ret void 3457} 3458 3459define void @vec384_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 3460; SCALAR-LABEL: vec384_v4i8: 3461; SCALAR: # %bb.0: 3462; SCALAR-NEXT: movzbl 3(%rdi), %r8d 3463; SCALAR-NEXT: movzbl 2(%rdi), %ecx 3464; SCALAR-NEXT: movzbl (%rdi), %eax 3465; SCALAR-NEXT: movzbl 1(%rdi), %edi 3466; SCALAR-NEXT: notb %al 3467; SCALAR-NEXT: notb %dil 3468; SCALAR-NEXT: notb %cl 3469; SCALAR-NEXT: notb %r8b 3470; SCALAR-NEXT: movb %r8b, 3(%rsi) 3471; SCALAR-NEXT: movb %cl, 2(%rsi) 3472; SCALAR-NEXT: movb %dil, 1(%rsi) 3473; SCALAR-NEXT: movb %al, (%rsi) 3474; SCALAR-NEXT: movb %r8b, 3(%rdx) 3475; SCALAR-NEXT: movb %cl, 2(%rdx) 3476; SCALAR-NEXT: movb %dil, 1(%rdx) 3477; SCALAR-NEXT: movb %al, (%rdx) 3478; SCALAR-NEXT: movb %r8b, 7(%rdx) 3479; SCALAR-NEXT: movb %cl, 6(%rdx) 3480; SCALAR-NEXT: movb %dil, 5(%rdx) 3481; SCALAR-NEXT: movb %al, 4(%rdx) 3482; SCALAR-NEXT: movb %r8b, 11(%rdx) 3483; SCALAR-NEXT: movb %cl, 10(%rdx) 3484; SCALAR-NEXT: movb %dil, 9(%rdx) 3485; SCALAR-NEXT: movb %al, 8(%rdx) 3486; SCALAR-NEXT: movb %r8b, 15(%rdx) 3487; SCALAR-NEXT: movb %cl, 14(%rdx) 3488; SCALAR-NEXT: movb %dil, 13(%rdx) 3489; SCALAR-NEXT: movb %al, 12(%rdx) 3490; SCALAR-NEXT: movb %r8b, 19(%rdx) 3491; SCALAR-NEXT: movb %cl, 18(%rdx) 3492; SCALAR-NEXT: movb %dil, 17(%rdx) 3493; SCALAR-NEXT: movb %al, 16(%rdx) 3494; SCALAR-NEXT: movb %r8b, 23(%rdx) 3495; SCALAR-NEXT: movb %cl, 22(%rdx) 3496; SCALAR-NEXT: movb %dil, 21(%rdx) 3497; SCALAR-NEXT: movb %al, 20(%rdx) 3498; SCALAR-NEXT: movb 
%r8b, 27(%rdx) 3499; SCALAR-NEXT: movb %cl, 26(%rdx) 3500; SCALAR-NEXT: movb %dil, 25(%rdx) 3501; SCALAR-NEXT: movb %al, 24(%rdx) 3502; SCALAR-NEXT: movb %r8b, 31(%rdx) 3503; SCALAR-NEXT: movb %cl, 30(%rdx) 3504; SCALAR-NEXT: movb %dil, 29(%rdx) 3505; SCALAR-NEXT: movb %al, 28(%rdx) 3506; SCALAR-NEXT: movb %r8b, 35(%rdx) 3507; SCALAR-NEXT: movb %cl, 34(%rdx) 3508; SCALAR-NEXT: movb %dil, 33(%rdx) 3509; SCALAR-NEXT: movb %al, 32(%rdx) 3510; SCALAR-NEXT: movb %r8b, 39(%rdx) 3511; SCALAR-NEXT: movb %cl, 38(%rdx) 3512; SCALAR-NEXT: movb %dil, 37(%rdx) 3513; SCALAR-NEXT: movb %al, 36(%rdx) 3514; SCALAR-NEXT: movb %r8b, 43(%rdx) 3515; SCALAR-NEXT: movb %cl, 42(%rdx) 3516; SCALAR-NEXT: movb %dil, 41(%rdx) 3517; SCALAR-NEXT: movb %al, 40(%rdx) 3518; SCALAR-NEXT: movb %r8b, 47(%rdx) 3519; SCALAR-NEXT: movb %cl, 46(%rdx) 3520; SCALAR-NEXT: movb %dil, 45(%rdx) 3521; SCALAR-NEXT: movb %al, 44(%rdx) 3522; SCALAR-NEXT: retq 3523; 3524; SSE2-LABEL: vec384_v4i8: 3525; SSE2: # %bb.0: 3526; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 3527; SSE2-NEXT: pxor (%rdi), %xmm0 3528; SSE2-NEXT: movd %xmm0, (%rsi) 3529; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 3530; SSE2-NEXT: movdqa %xmm0, (%rdx) 3531; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 3532; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 3533; SSE2-NEXT: retq 3534; 3535; AVX1-LABEL: vec384_v4i8: 3536; AVX1: # %bb.0: 3537; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 3538; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 3539; AVX1-NEXT: vmovd %xmm0, (%rsi) 3540; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 3541; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx) 3542; AVX1-NEXT: vmovdqa %xmm0, (%rdx) 3543; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) 3544; AVX1-NEXT: retq 3545; 3546; AVX2-LABEL: vec384_v4i8: 3547; AVX2: # %bb.0: 3548; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 3549; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 3550; AVX2-NEXT: vmovd %xmm0, (%rsi) 3551; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 3552; AVX2-NEXT: vmovdqa %ymm0, (%rdx) 3553; AVX2-NEXT: vmovdqa %xmm0, 32(%rdx) 3554; AVX2-NEXT: vzeroupper 3555; AVX2-NEXT: retq 3556 %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64 3557 %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1> 3558 store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 3559 %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0 3560 store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 3561 %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1 3562 store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4 3563 %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2 3564 store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8 3565 %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3 3566 store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4 3567 %out.subvec4.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 4 3568 store <4 x i8> %in.subvec, ptr %out.subvec4.ptr, align 16 3569 %out.subvec5.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 5 3570 store <4 x i8> %in.subvec, ptr %out.subvec5.ptr, align 4 3571 %out.subvec6.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 6 3572 store <4 x i8> %in.subvec, ptr %out.subvec6.ptr, align 8 3573 %out.subvec7.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 7 3574 store <4 x i8> %in.subvec, ptr %out.subvec7.ptr, align 4 3575 %out.subvec8.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 8 3576 store <4 x i8> %in.subvec, ptr %out.subvec8.ptr, align 32 3577 %out.subvec9.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 9 3578 store <4 x i8> %in.subvec, ptr 
%out.subvec9.ptr, align 4 3579 %out.subvec10.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 10 3580 store <4 x i8> %in.subvec, ptr %out.subvec10.ptr, align 8 3581 %out.subvec11.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 11 3582 store <4 x i8> %in.subvec, ptr %out.subvec11.ptr, align 4 3583 ret void 3584} 3585 3586define void @vec384_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 3587; SCALAR-LABEL: vec384_v4i16: 3588; SCALAR: # %bb.0: 3589; SCALAR-NEXT: movzwl 6(%rdi), %r8d 3590; SCALAR-NEXT: movzwl 2(%rdi), %ecx 3591; SCALAR-NEXT: movl (%rdi), %eax 3592; SCALAR-NEXT: movl 4(%rdi), %edi 3593; SCALAR-NEXT: notl %eax 3594; SCALAR-NEXT: notl %ecx 3595; SCALAR-NEXT: notl %edi 3596; SCALAR-NEXT: notl %r8d 3597; SCALAR-NEXT: movw %r8w, 6(%rsi) 3598; SCALAR-NEXT: movw %di, 4(%rsi) 3599; SCALAR-NEXT: movw %cx, 2(%rsi) 3600; SCALAR-NEXT: movw %ax, (%rsi) 3601; SCALAR-NEXT: movw %r8w, 6(%rdx) 3602; SCALAR-NEXT: movw %di, 4(%rdx) 3603; SCALAR-NEXT: movw %cx, 2(%rdx) 3604; SCALAR-NEXT: movw %ax, (%rdx) 3605; SCALAR-NEXT: movw %r8w, 14(%rdx) 3606; SCALAR-NEXT: movw %di, 12(%rdx) 3607; SCALAR-NEXT: movw %cx, 10(%rdx) 3608; SCALAR-NEXT: movw %ax, 8(%rdx) 3609; SCALAR-NEXT: movw %r8w, 22(%rdx) 3610; SCALAR-NEXT: movw %di, 20(%rdx) 3611; SCALAR-NEXT: movw %cx, 18(%rdx) 3612; SCALAR-NEXT: movw %ax, 16(%rdx) 3613; SCALAR-NEXT: movw %r8w, 30(%rdx) 3614; SCALAR-NEXT: movw %di, 28(%rdx) 3615; SCALAR-NEXT: movw %cx, 26(%rdx) 3616; SCALAR-NEXT: movw %ax, 24(%rdx) 3617; SCALAR-NEXT: movw %r8w, 38(%rdx) 3618; SCALAR-NEXT: movw %di, 36(%rdx) 3619; SCALAR-NEXT: movw %cx, 34(%rdx) 3620; SCALAR-NEXT: movw %ax, 32(%rdx) 3621; SCALAR-NEXT: movw %r8w, 46(%rdx) 3622; SCALAR-NEXT: movw %di, 44(%rdx) 3623; SCALAR-NEXT: movw %cx, 42(%rdx) 3624; SCALAR-NEXT: movw %ax, 40(%rdx) 3625; SCALAR-NEXT: retq 3626; 3627; SSE2-LABEL: vec384_v4i16: 3628; SSE2: # %bb.0: 3629; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 3630; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 3631; SSE2-NEXT: pxor %xmm0, %xmm1 3632; SSE2-NEXT: movq %xmm1, (%rsi) 3633; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] 3634; SSE2-NEXT: movdqa %xmm0, (%rdx) 3635; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 3636; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 3637; SSE2-NEXT: retq 3638; 3639; AVX1-LABEL: vec384_v4i16: 3640; AVX1: # %bb.0: 3641; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 3642; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 3643; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 3644; AVX1-NEXT: vmovq %xmm0, (%rsi) 3645; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 3646; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 3647; AVX1-NEXT: vmovaps %ymm1, (%rdx) 3648; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) 3649; AVX1-NEXT: vzeroupper 3650; AVX1-NEXT: retq 3651; 3652; AVX2-ONLY-LABEL: vec384_v4i16: 3653; AVX2-ONLY: # %bb.0: 3654; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 3655; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 3656; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 3657; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) 3658; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 3659; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) 3660; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) 3661; AVX2-ONLY-NEXT: vzeroupper 3662; AVX2-ONLY-NEXT: retq 3663; 3664; AVX512-LABEL: vec384_v4i16: 3665; AVX512: # %bb.0: 3666; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 3667; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 3668; AVX512-NEXT: vmovq %xmm0, (%rsi) 3669; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 3670; AVX512-NEXT: vmovdqa %ymm0, (%rdx) 3671; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx) 3672; AVX512-NEXT: vzeroupper 
3673; AVX512-NEXT: retq 3674 %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64 3675 %in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1> 3676 store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 3677 %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0 3678 store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 3679 %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1 3680 store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8 3681 %out.subvec2.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 2 3682 store <4 x i16> %in.subvec, ptr %out.subvec2.ptr, align 16 3683 %out.subvec3.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 3 3684 store <4 x i16> %in.subvec, ptr %out.subvec3.ptr, align 8 3685 %out.subvec4.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 4 3686 store <4 x i16> %in.subvec, ptr %out.subvec4.ptr, align 32 3687 %out.subvec5.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 5 3688 store <4 x i16> %in.subvec, ptr %out.subvec5.ptr, align 8 3689 ret void 3690} 3691 3692define void @vec384_v4i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 3693; SCALAR-LABEL: vec384_v4i32: 3694; SCALAR: # %bb.0: 3695; SCALAR-NEXT: movaps (%rdi), %xmm0 3696; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3697; SCALAR-NEXT: movaps %xmm0, (%rsi) 3698; SCALAR-NEXT: movaps %xmm0, (%rdx) 3699; SCALAR-NEXT: movaps %xmm0, 16(%rdx) 3700; SCALAR-NEXT: movaps %xmm0, 32(%rdx) 3701; SCALAR-NEXT: retq 3702; 3703; SSE2-LABEL: vec384_v4i32: 3704; SSE2: # %bb.0: 3705; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 3706; SSE2-NEXT: pxor (%rdi), %xmm0 3707; SSE2-NEXT: movdqa %xmm0, (%rsi) 3708; SSE2-NEXT: movdqa %xmm0, (%rdx) 3709; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 3710; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 3711; SSE2-NEXT: retq 3712; 3713; AVX-LABEL: vec384_v4i32: 3714; AVX: # %bb.0: 3715; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 3716; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 3717; AVX-NEXT: vmovdqa %xmm0, (%rsi) 3718; AVX-NEXT: vmovdqa %xmm0, (%rdx) 3719; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 3720; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) 3721; AVX-NEXT: retq 3722 %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64 3723 %in.subvec = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1> 3724 store <4 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 3725 %out.subvec0.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 0 3726 store <4 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 3727 %out.subvec1.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 1 3728 store <4 x i32> %in.subvec, ptr %out.subvec1.ptr, align 16 3729 %out.subvec2.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 2 3730 store <4 x i32> %in.subvec, ptr %out.subvec2.ptr, align 32 3731 ret void 3732} 3733 3734define void @vec384_v4f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 3735; SCALAR-LABEL: vec384_v4f32: 3736; SCALAR: # %bb.0: 3737; SCALAR-NEXT: movaps (%rdi), %xmm0 3738; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3739; SCALAR-NEXT: movaps %xmm0, (%rsi) 3740; SCALAR-NEXT: movaps %xmm0, (%rdx) 3741; SCALAR-NEXT: movaps %xmm0, 16(%rdx) 3742; SCALAR-NEXT: movaps %xmm0, 32(%rdx) 3743; SCALAR-NEXT: retq 3744; 3745; SSE2-LABEL: vec384_v4f32: 3746; SSE2: # %bb.0: 3747; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 3748; SSE2-NEXT: pxor (%rdi), %xmm0 3749; SSE2-NEXT: movdqa %xmm0, (%rsi) 3750; SSE2-NEXT: movdqa %xmm0, (%rdx) 3751; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 3752; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 3753; 
SSE2-NEXT: retq 3754; 3755; AVX-LABEL: vec384_v4f32: 3756; AVX: # %bb.0: 3757; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 3758; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 3759; AVX-NEXT: vmovdqa %xmm0, (%rsi) 3760; AVX-NEXT: vmovdqa %xmm0, (%rdx) 3761; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 3762; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) 3763; AVX-NEXT: retq 3764 %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64 3765 %in.subvec.int = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1> 3766 %in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float> 3767 store <4 x float> %in.subvec, ptr %out.subvec.ptr, align 64 3768 %out.subvec0.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 0 3769 store <4 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 3770 %out.subvec1.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 1 3771 store <4 x float> %in.subvec, ptr %out.subvec1.ptr, align 16 3772 %out.subvec2.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 2 3773 store <4 x float> %in.subvec, ptr %out.subvec2.ptr, align 32 3774 ret void 3775} 3776 3777define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 3778; SCALAR-LABEL: vec384_v6i8: 3779; SCALAR: # %bb.0: 3780; SCALAR-NEXT: movq (%rdi), %rax 3781; SCALAR-NEXT: movq %rax, %rcx 3782; SCALAR-NEXT: shrq $32, %rcx 3783; SCALAR-NEXT: notl %ecx 3784; SCALAR-NEXT: notl %eax 3785; SCALAR-NEXT: movl %eax, (%rsi) 3786; SCALAR-NEXT: movw %cx, 4(%rsi) 3787; SCALAR-NEXT: movw %cx, 4(%rdx) 3788; SCALAR-NEXT: movl %eax, (%rdx) 3789; SCALAR-NEXT: movw %cx, 12(%rdx) 3790; SCALAR-NEXT: movl %eax, 8(%rdx) 3791; SCALAR-NEXT: movw %cx, 20(%rdx) 3792; SCALAR-NEXT: movl %eax, 16(%rdx) 3793; SCALAR-NEXT: movw %cx, 28(%rdx) 3794; SCALAR-NEXT: movl %eax, 24(%rdx) 3795; SCALAR-NEXT: movw %cx, 36(%rdx) 3796; SCALAR-NEXT: movl %eax, 32(%rdx) 3797; SCALAR-NEXT: movw %cx, 44(%rdx) 3798; SCALAR-NEXT: movl %eax, 40(%rdx) 3799; SCALAR-NEXT: movw %cx, 52(%rdx) 3800; SCALAR-NEXT: movl %eax, 48(%rdx) 3801; SCALAR-NEXT: movw %cx, 60(%rdx) 3802; SCALAR-NEXT: movl %eax, 56(%rdx) 3803; SCALAR-NEXT: retq 3804; 3805; SSE2-ONLY-LABEL: vec384_v6i8: 3806; SSE2-ONLY: # %bb.0: 3807; SSE2-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 3808; SSE2-ONLY-NEXT: pcmpeqd %xmm1, %xmm1 3809; SSE2-ONLY-NEXT: pxor %xmm0, %xmm1 3810; SSE2-ONLY-NEXT: movd %xmm1, (%rsi) 3811; SSE2-ONLY-NEXT: pextrw $2, %xmm1, %eax 3812; SSE2-ONLY-NEXT: movw %ax, 4(%rsi) 3813; SSE2-ONLY-NEXT: movw %ax, 4(%rdx) 3814; SSE2-ONLY-NEXT: movd %xmm1, (%rdx) 3815; SSE2-ONLY-NEXT: movw %ax, 12(%rdx) 3816; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx) 3817; SSE2-ONLY-NEXT: movw %ax, 20(%rdx) 3818; SSE2-ONLY-NEXT: movd %xmm1, 16(%rdx) 3819; SSE2-ONLY-NEXT: movw %ax, 28(%rdx) 3820; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx) 3821; SSE2-ONLY-NEXT: movw %ax, 36(%rdx) 3822; SSE2-ONLY-NEXT: movd %xmm1, 32(%rdx) 3823; SSE2-ONLY-NEXT: movw %ax, 44(%rdx) 3824; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx) 3825; SSE2-ONLY-NEXT: movw %ax, 52(%rdx) 3826; SSE2-ONLY-NEXT: movd %xmm1, 48(%rdx) 3827; SSE2-ONLY-NEXT: movw %ax, 60(%rdx) 3828; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx) 3829; SSE2-ONLY-NEXT: retq 3830; 3831; SSE3-LABEL: vec384_v6i8: 3832; SSE3: # %bb.0: 3833; SSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 3834; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 3835; SSE3-NEXT: pxor %xmm0, %xmm1 3836; SSE3-NEXT: movd %xmm1, (%rsi) 3837; SSE3-NEXT: pextrw $2, %xmm1, %eax 3838; SSE3-NEXT: movw %ax, 4(%rsi) 3839; SSE3-NEXT: movw %ax, 4(%rdx) 3840; SSE3-NEXT: movd %xmm1, (%rdx) 3841; SSE3-NEXT: movw %ax, 12(%rdx) 3842; SSE3-NEXT: movd %xmm1, 
8(%rdx) 3843; SSE3-NEXT: movw %ax, 20(%rdx) 3844; SSE3-NEXT: movd %xmm1, 16(%rdx) 3845; SSE3-NEXT: movw %ax, 28(%rdx) 3846; SSE3-NEXT: movd %xmm1, 24(%rdx) 3847; SSE3-NEXT: movw %ax, 36(%rdx) 3848; SSE3-NEXT: movd %xmm1, 32(%rdx) 3849; SSE3-NEXT: movw %ax, 44(%rdx) 3850; SSE3-NEXT: movd %xmm1, 40(%rdx) 3851; SSE3-NEXT: movw %ax, 52(%rdx) 3852; SSE3-NEXT: movd %xmm1, 48(%rdx) 3853; SSE3-NEXT: movw %ax, 60(%rdx) 3854; SSE3-NEXT: movd %xmm1, 56(%rdx) 3855; SSE3-NEXT: retq 3856; 3857; SSSE3-ONLY-LABEL: vec384_v6i8: 3858; SSSE3-ONLY: # %bb.0: 3859; SSSE3-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 3860; SSSE3-ONLY-NEXT: pcmpeqd %xmm1, %xmm1 3861; SSSE3-ONLY-NEXT: pxor %xmm0, %xmm1 3862; SSSE3-ONLY-NEXT: movd %xmm1, (%rsi) 3863; SSSE3-ONLY-NEXT: pextrw $2, %xmm1, %eax 3864; SSSE3-ONLY-NEXT: movw %ax, 4(%rsi) 3865; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx) 3866; SSSE3-ONLY-NEXT: movd %xmm1, (%rdx) 3867; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx) 3868; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx) 3869; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx) 3870; SSSE3-ONLY-NEXT: movd %xmm1, 16(%rdx) 3871; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx) 3872; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx) 3873; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx) 3874; SSSE3-ONLY-NEXT: movd %xmm1, 32(%rdx) 3875; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx) 3876; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx) 3877; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx) 3878; SSSE3-ONLY-NEXT: movd %xmm1, 48(%rdx) 3879; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx) 3880; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx) 3881; SSSE3-ONLY-NEXT: retq 3882; 3883; SSE41-LABEL: vec384_v6i8: 3884; SSE41: # %bb.0: 3885; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 3886; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 3887; SSE41-NEXT: pxor %xmm0, %xmm1 3888; SSE41-NEXT: pextrw $2, %xmm1, 4(%rsi) 3889; SSE41-NEXT: movd %xmm1, (%rsi) 3890; SSE41-NEXT: pextrw $2, %xmm1, 4(%rdx) 3891; SSE41-NEXT: movd %xmm1, (%rdx) 3892; SSE41-NEXT: pextrw $2, %xmm1, 12(%rdx) 3893; SSE41-NEXT: movd %xmm1, 8(%rdx) 3894; SSE41-NEXT: pextrw $2, %xmm1, 20(%rdx) 3895; SSE41-NEXT: movd %xmm1, 16(%rdx) 3896; SSE41-NEXT: pextrw $2, %xmm1, 28(%rdx) 3897; SSE41-NEXT: movd %xmm1, 24(%rdx) 3898; SSE41-NEXT: pextrw $2, %xmm1, 36(%rdx) 3899; SSE41-NEXT: movd %xmm1, 32(%rdx) 3900; SSE41-NEXT: pextrw $2, %xmm1, 44(%rdx) 3901; SSE41-NEXT: movd %xmm1, 40(%rdx) 3902; SSE41-NEXT: pextrw $2, %xmm1, 52(%rdx) 3903; SSE41-NEXT: movd %xmm1, 48(%rdx) 3904; SSE41-NEXT: pextrw $2, %xmm1, 60(%rdx) 3905; SSE41-NEXT: movd %xmm1, 56(%rdx) 3906; SSE41-NEXT: retq 3907; 3908; SSE42-LABEL: vec384_v6i8: 3909; SSE42: # %bb.0: 3910; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 3911; SSE42-NEXT: pcmpeqd %xmm1, %xmm1 3912; SSE42-NEXT: pxor %xmm0, %xmm1 3913; SSE42-NEXT: pextrw $2, %xmm1, 4(%rsi) 3914; SSE42-NEXT: movd %xmm1, (%rsi) 3915; SSE42-NEXT: pextrw $2, %xmm1, 4(%rdx) 3916; SSE42-NEXT: movd %xmm1, (%rdx) 3917; SSE42-NEXT: pextrw $2, %xmm1, 12(%rdx) 3918; SSE42-NEXT: movd %xmm1, 8(%rdx) 3919; SSE42-NEXT: pextrw $2, %xmm1, 20(%rdx) 3920; SSE42-NEXT: movd %xmm1, 16(%rdx) 3921; SSE42-NEXT: pextrw $2, %xmm1, 28(%rdx) 3922; SSE42-NEXT: movd %xmm1, 24(%rdx) 3923; SSE42-NEXT: pextrw $2, %xmm1, 36(%rdx) 3924; SSE42-NEXT: movd %xmm1, 32(%rdx) 3925; SSE42-NEXT: pextrw $2, %xmm1, 44(%rdx) 3926; SSE42-NEXT: movd %xmm1, 40(%rdx) 3927; SSE42-NEXT: pextrw $2, %xmm1, 52(%rdx) 3928; SSE42-NEXT: movd %xmm1, 48(%rdx) 3929; SSE42-NEXT: pextrw $2, %xmm1, 60(%rdx) 3930; SSE42-NEXT: movd %xmm1, 56(%rdx) 3931; SSE42-NEXT: retq 3932; 3933; AVX1-LABEL: vec384_v6i8: 3934; AVX1: # %bb.0: 3935; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 
3936; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 3937; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 3938; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rsi) 3939; AVX1-NEXT: vmovd %xmm0, (%rsi) 3940; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdx) 3941; AVX1-NEXT: vmovd %xmm0, (%rdx) 3942; AVX1-NEXT: vpextrw $2, %xmm0, 12(%rdx) 3943; AVX1-NEXT: vmovd %xmm0, 8(%rdx) 3944; AVX1-NEXT: vpextrw $2, %xmm0, 20(%rdx) 3945; AVX1-NEXT: vmovd %xmm0, 16(%rdx) 3946; AVX1-NEXT: vpextrw $2, %xmm0, 28(%rdx) 3947; AVX1-NEXT: vmovd %xmm0, 24(%rdx) 3948; AVX1-NEXT: vpextrw $2, %xmm0, 36(%rdx) 3949; AVX1-NEXT: vmovd %xmm0, 32(%rdx) 3950; AVX1-NEXT: vpextrw $2, %xmm0, 44(%rdx) 3951; AVX1-NEXT: vmovd %xmm0, 40(%rdx) 3952; AVX1-NEXT: vpextrw $2, %xmm0, 52(%rdx) 3953; AVX1-NEXT: vmovd %xmm0, 48(%rdx) 3954; AVX1-NEXT: vpextrw $2, %xmm0, 60(%rdx) 3955; AVX1-NEXT: vmovd %xmm0, 56(%rdx) 3956; AVX1-NEXT: retq 3957; 3958; AVX2-ONLY-LABEL: vec384_v6i8: 3959; AVX2-ONLY: # %bb.0: 3960; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 3961; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 3962; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 3963; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 4(%rsi) 3964; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi) 3965; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 4(%rdx) 3966; AVX2-ONLY-NEXT: vmovd %xmm0, (%rdx) 3967; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 12(%rdx) 3968; AVX2-ONLY-NEXT: vmovd %xmm0, 8(%rdx) 3969; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 20(%rdx) 3970; AVX2-ONLY-NEXT: vmovd %xmm0, 16(%rdx) 3971; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 28(%rdx) 3972; AVX2-ONLY-NEXT: vmovd %xmm0, 24(%rdx) 3973; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 36(%rdx) 3974; AVX2-ONLY-NEXT: vmovd %xmm0, 32(%rdx) 3975; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 44(%rdx) 3976; AVX2-ONLY-NEXT: vmovd %xmm0, 40(%rdx) 3977; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 52(%rdx) 3978; AVX2-ONLY-NEXT: vmovd %xmm0, 48(%rdx) 3979; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 60(%rdx) 3980; AVX2-ONLY-NEXT: vmovd %xmm0, 56(%rdx) 3981; AVX2-ONLY-NEXT: retq 3982; 3983; AVX512-LABEL: vec384_v6i8: 3984; AVX512: # %bb.0: 3985; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 3986; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 3987; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi) 3988; AVX512-NEXT: vmovd %xmm0, (%rsi) 3989; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx) 3990; AVX512-NEXT: vmovd %xmm0, (%rdx) 3991; AVX512-NEXT: vpextrw $2, %xmm0, 12(%rdx) 3992; AVX512-NEXT: vmovd %xmm0, 8(%rdx) 3993; AVX512-NEXT: vpextrw $2, %xmm0, 20(%rdx) 3994; AVX512-NEXT: vmovd %xmm0, 16(%rdx) 3995; AVX512-NEXT: vpextrw $2, %xmm0, 28(%rdx) 3996; AVX512-NEXT: vmovd %xmm0, 24(%rdx) 3997; AVX512-NEXT: vpextrw $2, %xmm0, 36(%rdx) 3998; AVX512-NEXT: vmovd %xmm0, 32(%rdx) 3999; AVX512-NEXT: vpextrw $2, %xmm0, 44(%rdx) 4000; AVX512-NEXT: vmovd %xmm0, 40(%rdx) 4001; AVX512-NEXT: vpextrw $2, %xmm0, 52(%rdx) 4002; AVX512-NEXT: vmovd %xmm0, 48(%rdx) 4003; AVX512-NEXT: vpextrw $2, %xmm0, 60(%rdx) 4004; AVX512-NEXT: vmovd %xmm0, 56(%rdx) 4005; AVX512-NEXT: retq 4006 %in.subvec.not = load <6 x i8>, ptr %in.subvec.ptr, align 64 4007 %in.subvec = xor <6 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> 4008 store <6 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 4009 %out.subvec0.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 0 4010 store <6 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 4011 %out.subvec1.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 1 4012 store <6 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 4013 %out.subvec2.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 2 4014 store <6 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4 4015 
%out.subvec3.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 3 4016 store <6 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2 4017 %out.subvec4.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 4 4018 store <6 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8 4019 %out.subvec5.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 5 4020 store <6 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2 4021 %out.subvec6.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 6 4022 store <6 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4 4023 %out.subvec7.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 7 4024 store <6 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2 4025 ret void 4026} 4027 4028define void @vec384_v6i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 4029; SCALAR-LABEL: vec384_v6i16: 4030; SCALAR: # %bb.0: 4031; SCALAR-NEXT: movq (%rdi), %rax 4032; SCALAR-NEXT: movl 8(%rdi), %ecx 4033; SCALAR-NEXT: notl %ecx 4034; SCALAR-NEXT: notq %rax 4035; SCALAR-NEXT: movq %rax, (%rsi) 4036; SCALAR-NEXT: movl %ecx, 8(%rsi) 4037; SCALAR-NEXT: movl %ecx, 8(%rdx) 4038; SCALAR-NEXT: movq %rax, (%rdx) 4039; SCALAR-NEXT: movl %ecx, 24(%rdx) 4040; SCALAR-NEXT: movq %rax, 16(%rdx) 4041; SCALAR-NEXT: movl %ecx, 40(%rdx) 4042; SCALAR-NEXT: movq %rax, 32(%rdx) 4043; SCALAR-NEXT: movl %ecx, 56(%rdx) 4044; SCALAR-NEXT: movq %rax, 48(%rdx) 4045; SCALAR-NEXT: retq 4046; 4047; SSE2-ONLY-LABEL: vec384_v6i16: 4048; SSE2-ONLY: # %bb.0: 4049; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 4050; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0 4051; SSE2-ONLY-NEXT: movq %xmm0, (%rsi) 4052; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 4053; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi) 4054; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx) 4055; SSE2-ONLY-NEXT: movq %xmm0, (%rdx) 4056; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx) 4057; SSE2-ONLY-NEXT: movq %xmm0, 16(%rdx) 4058; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx) 4059; SSE2-ONLY-NEXT: movq %xmm0, 32(%rdx) 4060; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx) 4061; SSE2-ONLY-NEXT: movq %xmm0, 48(%rdx) 4062; SSE2-ONLY-NEXT: retq 4063; 4064; SSE3-LABEL: vec384_v6i16: 4065; SSE3: # %bb.0: 4066; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 4067; SSE3-NEXT: pxor (%rdi), %xmm0 4068; SSE3-NEXT: movq %xmm0, (%rsi) 4069; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 4070; SSE3-NEXT: movd %xmm1, 8(%rsi) 4071; SSE3-NEXT: movd %xmm1, 8(%rdx) 4072; SSE3-NEXT: movq %xmm0, (%rdx) 4073; SSE3-NEXT: movd %xmm1, 24(%rdx) 4074; SSE3-NEXT: movq %xmm0, 16(%rdx) 4075; SSE3-NEXT: movd %xmm1, 40(%rdx) 4076; SSE3-NEXT: movq %xmm0, 32(%rdx) 4077; SSE3-NEXT: movd %xmm1, 56(%rdx) 4078; SSE3-NEXT: movq %xmm0, 48(%rdx) 4079; SSE3-NEXT: retq 4080; 4081; SSSE3-ONLY-LABEL: vec384_v6i16: 4082; SSSE3-ONLY: # %bb.0: 4083; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 4084; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0 4085; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi) 4086; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 4087; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi) 4088; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx) 4089; SSSE3-ONLY-NEXT: movq %xmm0, (%rdx) 4090; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx) 4091; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rdx) 4092; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx) 4093; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rdx) 4094; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx) 4095; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rdx) 4096; SSSE3-ONLY-NEXT: retq 4097; 4098; SSE41-LABEL: vec384_v6i16: 4099; SSE41: # %bb.0: 4100; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 4101; SSE41-NEXT: pxor (%rdi), %xmm0 4102; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi) 4103; SSE41-NEXT: movq %xmm0, (%rsi) 4104; SSE41-NEXT: 
pextrd $2, %xmm0, 8(%rdx) 4105; SSE41-NEXT: movq %xmm0, (%rdx) 4106; SSE41-NEXT: pextrd $2, %xmm0, 24(%rdx) 4107; SSE41-NEXT: movq %xmm0, 16(%rdx) 4108; SSE41-NEXT: pextrd $2, %xmm0, 40(%rdx) 4109; SSE41-NEXT: movq %xmm0, 32(%rdx) 4110; SSE41-NEXT: pextrd $2, %xmm0, 56(%rdx) 4111; SSE41-NEXT: movq %xmm0, 48(%rdx) 4112; SSE41-NEXT: retq 4113; 4114; SSE42-LABEL: vec384_v6i16: 4115; SSE42: # %bb.0: 4116; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 4117; SSE42-NEXT: pxor (%rdi), %xmm0 4118; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi) 4119; SSE42-NEXT: movq %xmm0, (%rsi) 4120; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdx) 4121; SSE42-NEXT: movq %xmm0, (%rdx) 4122; SSE42-NEXT: pextrd $2, %xmm0, 24(%rdx) 4123; SSE42-NEXT: movq %xmm0, 16(%rdx) 4124; SSE42-NEXT: pextrd $2, %xmm0, 40(%rdx) 4125; SSE42-NEXT: movq %xmm0, 32(%rdx) 4126; SSE42-NEXT: pextrd $2, %xmm0, 56(%rdx) 4127; SSE42-NEXT: movq %xmm0, 48(%rdx) 4128; SSE42-NEXT: retq 4129; 4130; AVX-LABEL: vec384_v6i16: 4131; AVX: # %bb.0: 4132; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 4133; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 4134; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi) 4135; AVX-NEXT: vmovq %xmm0, (%rsi) 4136; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdx) 4137; AVX-NEXT: vmovq %xmm0, (%rdx) 4138; AVX-NEXT: vpextrd $2, %xmm0, 24(%rdx) 4139; AVX-NEXT: vmovq %xmm0, 16(%rdx) 4140; AVX-NEXT: vpextrd $2, %xmm0, 40(%rdx) 4141; AVX-NEXT: vmovq %xmm0, 32(%rdx) 4142; AVX-NEXT: vpextrd $2, %xmm0, 56(%rdx) 4143; AVX-NEXT: vmovq %xmm0, 48(%rdx) 4144; AVX-NEXT: retq 4145 %in.subvec.not = load <6 x i16>, ptr %in.subvec.ptr, align 64 4146 %in.subvec = xor <6 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> 4147 store <6 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 4148 %out.subvec0.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 0 4149 store <6 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 4150 %out.subvec1.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 1 4151 store <6 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4 4152 %out.subvec2.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 2 4153 store <6 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8 4154 %out.subvec3.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 3 4155 store <6 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4 4156 ret void 4157} 4158 4159define void @vec384_v6i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 4160; SCALAR-LABEL: vec384_v6i32: 4161; SCALAR: # %bb.0: 4162; SCALAR-NEXT: movq (%rdi), %rax 4163; SCALAR-NEXT: movq 8(%rdi), %rcx 4164; SCALAR-NEXT: movq 16(%rdi), %rdi 4165; SCALAR-NEXT: notq %rdi 4166; SCALAR-NEXT: notq %rcx 4167; SCALAR-NEXT: notq %rax 4168; SCALAR-NEXT: movq %rax, (%rsi) 4169; SCALAR-NEXT: movq %rcx, 8(%rsi) 4170; SCALAR-NEXT: movq %rdi, 16(%rsi) 4171; SCALAR-NEXT: movq %rax, (%rdx) 4172; SCALAR-NEXT: movq %rcx, 8(%rdx) 4173; SCALAR-NEXT: movq %rdi, 16(%rdx) 4174; SCALAR-NEXT: movq %rdi, 48(%rdx) 4175; SCALAR-NEXT: movq %rcx, 40(%rdx) 4176; SCALAR-NEXT: movq %rax, 32(%rdx) 4177; SCALAR-NEXT: retq 4178; 4179; SSE2-LABEL: vec384_v6i32: 4180; SSE2: # %bb.0: 4181; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 4182; SSE2-NEXT: movdqa 16(%rdi), %xmm1 4183; SSE2-NEXT: pxor %xmm0, %xmm1 4184; SSE2-NEXT: pxor (%rdi), %xmm0 4185; SSE2-NEXT: movdqa %xmm0, (%rsi) 4186; SSE2-NEXT: movq %xmm1, 16(%rsi) 4187; SSE2-NEXT: movq %xmm1, 16(%rdx) 4188; SSE2-NEXT: movdqa %xmm0, (%rdx) 4189; SSE2-NEXT: movq %xmm1, 48(%rdx) 4190; SSE2-NEXT: movdqu %xmm0, 32(%rdx) 4191; SSE2-NEXT: retq 4192; 4193; AVX1-LABEL: vec384_v6i32: 4194; AVX1: # %bb.0: 4195; AVX1-NEXT: vxorps 
%xmm0, %xmm0, %xmm0 4196; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 4197; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 4198; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4199; AVX1-NEXT: vmovlps %xmm1, 16(%rsi) 4200; AVX1-NEXT: vmovaps %xmm0, (%rsi) 4201; AVX1-NEXT: vmovlps %xmm1, 16(%rdx) 4202; AVX1-NEXT: vmovaps %xmm0, (%rdx) 4203; AVX1-NEXT: vmovlps %xmm1, 48(%rdx) 4204; AVX1-NEXT: vmovups %xmm0, 32(%rdx) 4205; AVX1-NEXT: vzeroupper 4206; AVX1-NEXT: retq 4207; 4208; AVX2-LABEL: vec384_v6i32: 4209; AVX2: # %bb.0: 4210; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 4211; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 4212; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 4213; AVX2-NEXT: vmovq %xmm1, 16(%rsi) 4214; AVX2-NEXT: vmovdqa %xmm0, (%rsi) 4215; AVX2-NEXT: vmovq %xmm1, 16(%rdx) 4216; AVX2-NEXT: vmovdqa %xmm0, (%rdx) 4217; AVX2-NEXT: vmovq %xmm1, 48(%rdx) 4218; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx) 4219; AVX2-NEXT: vzeroupper 4220; AVX2-NEXT: retq 4221 %in.subvec.not = load <6 x i32>, ptr %in.subvec.ptr, align 64 4222 %in.subvec = xor <6 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> 4223 store <6 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 4224 %out.subvec0.ptr = getelementptr <6 x i32>, ptr %out.vec.ptr, i64 0 4225 store <6 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 4226 %out.subvec1.ptr = getelementptr <6 x i32>, ptr %out.vec.ptr, i64 1 4227 store <6 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8 4228 ret void 4229} 4230 4231define void @vec384_v6f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 4232; SCALAR-LABEL: vec384_v6f32: 4233; SCALAR: # %bb.0: 4234; SCALAR-NEXT: movq (%rdi), %rax 4235; SCALAR-NEXT: movq 8(%rdi), %rcx 4236; SCALAR-NEXT: movq 16(%rdi), %rdi 4237; SCALAR-NEXT: notq %rdi 4238; SCALAR-NEXT: notq %rcx 4239; SCALAR-NEXT: notq %rax 4240; SCALAR-NEXT: movq %rax, (%rsi) 4241; SCALAR-NEXT: movq %rcx, 8(%rsi) 4242; SCALAR-NEXT: movq %rdi, 16(%rsi) 4243; SCALAR-NEXT: movq %rax, (%rdx) 4244; SCALAR-NEXT: movq %rcx, 8(%rdx) 4245; SCALAR-NEXT: movq %rdi, 16(%rdx) 4246; SCALAR-NEXT: movq %rdi, 48(%rdx) 4247; SCALAR-NEXT: movq %rcx, 40(%rdx) 4248; SCALAR-NEXT: movq %rax, 32(%rdx) 4249; SCALAR-NEXT: retq 4250; 4251; SSE2-LABEL: vec384_v6f32: 4252; SSE2: # %bb.0: 4253; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 4254; SSE2-NEXT: movdqa 16(%rdi), %xmm1 4255; SSE2-NEXT: pxor %xmm0, %xmm1 4256; SSE2-NEXT: pxor (%rdi), %xmm0 4257; SSE2-NEXT: movdqa %xmm0, (%rsi) 4258; SSE2-NEXT: movq %xmm1, 16(%rsi) 4259; SSE2-NEXT: movq %xmm1, 16(%rdx) 4260; SSE2-NEXT: movdqa %xmm0, (%rdx) 4261; SSE2-NEXT: movq %xmm1, 48(%rdx) 4262; SSE2-NEXT: movdqu %xmm0, 32(%rdx) 4263; SSE2-NEXT: retq 4264; 4265; AVX1-LABEL: vec384_v6f32: 4266; AVX1: # %bb.0: 4267; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 4268; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 4269; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 4270; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4271; AVX1-NEXT: vmovlps %xmm1, 16(%rsi) 4272; AVX1-NEXT: vmovaps %xmm0, (%rsi) 4273; AVX1-NEXT: vmovlps %xmm1, 16(%rdx) 4274; AVX1-NEXT: vmovaps %xmm0, (%rdx) 4275; AVX1-NEXT: vmovlps %xmm1, 48(%rdx) 4276; AVX1-NEXT: vmovups %xmm0, 32(%rdx) 4277; AVX1-NEXT: vzeroupper 4278; AVX1-NEXT: retq 4279; 4280; AVX2-LABEL: vec384_v6f32: 4281; AVX2: # %bb.0: 4282; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 4283; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 4284; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 4285; AVX2-NEXT: vmovq %xmm1, 16(%rsi) 4286; AVX2-NEXT: vmovdqa %xmm0, (%rsi) 4287; AVX2-NEXT: vmovq %xmm1, 16(%rdx) 4288; AVX2-NEXT: vmovdqa %xmm0, (%rdx) 4289; AVX2-NEXT: 
vmovq %xmm1, 48(%rdx) 4290; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx) 4291; AVX2-NEXT: vzeroupper 4292; AVX2-NEXT: retq 4293 %in.subvec.not = load <6 x i32>, ptr %in.subvec.ptr, align 64 4294 %in.subvec.int = xor <6 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> 4295 %in.subvec = bitcast <6 x i32> %in.subvec.int to <6 x float> 4296 store <6 x float> %in.subvec, ptr %out.subvec.ptr, align 64 4297 %out.subvec0.ptr = getelementptr <6 x float>, ptr %out.vec.ptr, i64 0 4298 store <6 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 4299 %out.subvec1.ptr = getelementptr <6 x float>, ptr %out.vec.ptr, i64 1 4300 store <6 x float> %in.subvec, ptr %out.subvec1.ptr, align 8 4301 ret void 4302} 4303 4304define void @vec384_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 4305; SCALAR-LABEL: vec384_v8i8: 4306; SCALAR: # %bb.0: 4307; SCALAR-NEXT: pushq %rbx 4308; SCALAR-NEXT: movzbl 7(%rdi), %ebx 4309; SCALAR-NEXT: movzbl 6(%rdi), %r11d 4310; SCALAR-NEXT: movzbl 5(%rdi), %r10d 4311; SCALAR-NEXT: movzbl 4(%rdi), %r9d 4312; SCALAR-NEXT: movzbl 3(%rdi), %r8d 4313; SCALAR-NEXT: movzbl 2(%rdi), %ecx 4314; SCALAR-NEXT: movzbl (%rdi), %eax 4315; SCALAR-NEXT: movzbl 1(%rdi), %edi 4316; SCALAR-NEXT: notb %al 4317; SCALAR-NEXT: notb %dil 4318; SCALAR-NEXT: notb %cl 4319; SCALAR-NEXT: notb %r8b 4320; SCALAR-NEXT: notb %r9b 4321; SCALAR-NEXT: notb %r10b 4322; SCALAR-NEXT: notb %r11b 4323; SCALAR-NEXT: notb %bl 4324; SCALAR-NEXT: movb %bl, 7(%rsi) 4325; SCALAR-NEXT: movb %r11b, 6(%rsi) 4326; SCALAR-NEXT: movb %r10b, 5(%rsi) 4327; SCALAR-NEXT: movb %r9b, 4(%rsi) 4328; SCALAR-NEXT: movb %r8b, 3(%rsi) 4329; SCALAR-NEXT: movb %cl, 2(%rsi) 4330; SCALAR-NEXT: movb %dil, 1(%rsi) 4331; SCALAR-NEXT: movb %al, (%rsi) 4332; SCALAR-NEXT: movb %bl, 7(%rdx) 4333; SCALAR-NEXT: movb %r11b, 6(%rdx) 4334; SCALAR-NEXT: movb %r10b, 5(%rdx) 4335; SCALAR-NEXT: movb %r9b, 4(%rdx) 4336; SCALAR-NEXT: movb %r8b, 3(%rdx) 4337; SCALAR-NEXT: movb %cl, 2(%rdx) 4338; SCALAR-NEXT: movb %dil, 1(%rdx) 4339; SCALAR-NEXT: movb %al, (%rdx) 4340; SCALAR-NEXT: movb %bl, 15(%rdx) 4341; SCALAR-NEXT: movb %r11b, 14(%rdx) 4342; SCALAR-NEXT: movb %r10b, 13(%rdx) 4343; SCALAR-NEXT: movb %r9b, 12(%rdx) 4344; SCALAR-NEXT: movb %r8b, 11(%rdx) 4345; SCALAR-NEXT: movb %cl, 10(%rdx) 4346; SCALAR-NEXT: movb %dil, 9(%rdx) 4347; SCALAR-NEXT: movb %al, 8(%rdx) 4348; SCALAR-NEXT: movb %bl, 23(%rdx) 4349; SCALAR-NEXT: movb %r11b, 22(%rdx) 4350; SCALAR-NEXT: movb %r10b, 21(%rdx) 4351; SCALAR-NEXT: movb %r9b, 20(%rdx) 4352; SCALAR-NEXT: movb %r8b, 19(%rdx) 4353; SCALAR-NEXT: movb %cl, 18(%rdx) 4354; SCALAR-NEXT: movb %dil, 17(%rdx) 4355; SCALAR-NEXT: movb %al, 16(%rdx) 4356; SCALAR-NEXT: movb %bl, 31(%rdx) 4357; SCALAR-NEXT: movb %r11b, 30(%rdx) 4358; SCALAR-NEXT: movb %r10b, 29(%rdx) 4359; SCALAR-NEXT: movb %r9b, 28(%rdx) 4360; SCALAR-NEXT: movb %r8b, 27(%rdx) 4361; SCALAR-NEXT: movb %cl, 26(%rdx) 4362; SCALAR-NEXT: movb %dil, 25(%rdx) 4363; SCALAR-NEXT: movb %al, 24(%rdx) 4364; SCALAR-NEXT: movb %bl, 39(%rdx) 4365; SCALAR-NEXT: movb %r11b, 38(%rdx) 4366; SCALAR-NEXT: movb %r10b, 37(%rdx) 4367; SCALAR-NEXT: movb %r9b, 36(%rdx) 4368; SCALAR-NEXT: movb %r8b, 35(%rdx) 4369; SCALAR-NEXT: movb %cl, 34(%rdx) 4370; SCALAR-NEXT: movb %dil, 33(%rdx) 4371; SCALAR-NEXT: movb %al, 32(%rdx) 4372; SCALAR-NEXT: movb %bl, 47(%rdx) 4373; SCALAR-NEXT: movb %r11b, 46(%rdx) 4374; SCALAR-NEXT: movb %r10b, 45(%rdx) 4375; SCALAR-NEXT: movb %r9b, 44(%rdx) 4376; SCALAR-NEXT: movb %r8b, 43(%rdx) 4377; SCALAR-NEXT: movb %cl, 42(%rdx) 4378; SCALAR-NEXT: 
movb %dil, 41(%rdx) 4379; SCALAR-NEXT: movb %al, 40(%rdx) 4380; SCALAR-NEXT: popq %rbx 4381; SCALAR-NEXT: retq 4382; 4383; SSE2-LABEL: vec384_v8i8: 4384; SSE2: # %bb.0: 4385; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 4386; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 4387; SSE2-NEXT: pxor %xmm0, %xmm1 4388; SSE2-NEXT: movq %xmm1, (%rsi) 4389; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] 4390; SSE2-NEXT: movdqa %xmm0, (%rdx) 4391; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 4392; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 4393; SSE2-NEXT: retq 4394; 4395; AVX1-LABEL: vec384_v8i8: 4396; AVX1: # %bb.0: 4397; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 4398; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 4399; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 4400; AVX1-NEXT: vmovq %xmm0, (%rsi) 4401; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 4402; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 4403; AVX1-NEXT: vmovaps %ymm1, (%rdx) 4404; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) 4405; AVX1-NEXT: vzeroupper 4406; AVX1-NEXT: retq 4407; 4408; AVX2-ONLY-LABEL: vec384_v8i8: 4409; AVX2-ONLY: # %bb.0: 4410; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 4411; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 4412; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 4413; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) 4414; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 4415; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) 4416; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) 4417; AVX2-ONLY-NEXT: vzeroupper 4418; AVX2-ONLY-NEXT: retq 4419; 4420; AVX512-LABEL: vec384_v8i8: 4421; AVX512: # %bb.0: 4422; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 4423; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 4424; AVX512-NEXT: vmovq %xmm0, (%rsi) 4425; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 4426; AVX512-NEXT: vmovdqa %ymm0, (%rdx) 4427; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx) 4428; AVX512-NEXT: vzeroupper 4429; AVX512-NEXT: retq 4430 %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64 4431 %in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> 4432 store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 4433 %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0 4434 store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 4435 %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1 4436 store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8 4437 %out.subvec2.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 2 4438 store <8 x i8> %in.subvec, ptr %out.subvec2.ptr, align 16 4439 %out.subvec3.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 3 4440 store <8 x i8> %in.subvec, ptr %out.subvec3.ptr, align 8 4441 %out.subvec4.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 4 4442 store <8 x i8> %in.subvec, ptr %out.subvec4.ptr, align 32 4443 %out.subvec5.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 5 4444 store <8 x i8> %in.subvec, ptr %out.subvec5.ptr, align 8 4445 ret void 4446} 4447 4448define void @vec384_v8i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 4449; SCALAR-LABEL: vec384_v8i16: 4450; SCALAR: # %bb.0: 4451; SCALAR-NEXT: pushq %rbx 4452; SCALAR-NEXT: movzwl 14(%rdi), %ebx 4453; SCALAR-NEXT: movl 12(%rdi), %r11d 4454; SCALAR-NEXT: movzwl 10(%rdi), %r10d 4455; SCALAR-NEXT: movl 8(%rdi), %r9d 4456; SCALAR-NEXT: movzwl 6(%rdi), %r8d 4457; SCALAR-NEXT: movzwl 2(%rdi), %ecx 4458; SCALAR-NEXT: movl (%rdi), %eax 4459; SCALAR-NEXT: movl 4(%rdi), %edi 4460; SCALAR-NEXT: notl %eax 4461; SCALAR-NEXT: notl %ecx 4462; SCALAR-NEXT: notl %edi 4463; SCALAR-NEXT: notl %r8d 4464; SCALAR-NEXT: notl 
%r9d 4465; SCALAR-NEXT: notl %r10d 4466; SCALAR-NEXT: notl %r11d 4467; SCALAR-NEXT: notl %ebx 4468; SCALAR-NEXT: movw %bx, 14(%rsi) 4469; SCALAR-NEXT: movw %r11w, 12(%rsi) 4470; SCALAR-NEXT: movw %r10w, 10(%rsi) 4471; SCALAR-NEXT: movw %r9w, 8(%rsi) 4472; SCALAR-NEXT: movw %r8w, 6(%rsi) 4473; SCALAR-NEXT: movw %di, 4(%rsi) 4474; SCALAR-NEXT: movw %cx, 2(%rsi) 4475; SCALAR-NEXT: movw %ax, (%rsi) 4476; SCALAR-NEXT: movw %bx, 14(%rdx) 4477; SCALAR-NEXT: movw %r11w, 12(%rdx) 4478; SCALAR-NEXT: movw %r10w, 10(%rdx) 4479; SCALAR-NEXT: movw %r9w, 8(%rdx) 4480; SCALAR-NEXT: movw %r8w, 6(%rdx) 4481; SCALAR-NEXT: movw %di, 4(%rdx) 4482; SCALAR-NEXT: movw %cx, 2(%rdx) 4483; SCALAR-NEXT: movw %ax, (%rdx) 4484; SCALAR-NEXT: movw %bx, 30(%rdx) 4485; SCALAR-NEXT: movw %r11w, 28(%rdx) 4486; SCALAR-NEXT: movw %r10w, 26(%rdx) 4487; SCALAR-NEXT: movw %r9w, 24(%rdx) 4488; SCALAR-NEXT: movw %r8w, 22(%rdx) 4489; SCALAR-NEXT: movw %di, 20(%rdx) 4490; SCALAR-NEXT: movw %cx, 18(%rdx) 4491; SCALAR-NEXT: movw %ax, 16(%rdx) 4492; SCALAR-NEXT: movw %bx, 46(%rdx) 4493; SCALAR-NEXT: movw %r11w, 44(%rdx) 4494; SCALAR-NEXT: movw %r10w, 42(%rdx) 4495; SCALAR-NEXT: movw %r9w, 40(%rdx) 4496; SCALAR-NEXT: movw %r8w, 38(%rdx) 4497; SCALAR-NEXT: movw %di, 36(%rdx) 4498; SCALAR-NEXT: movw %cx, 34(%rdx) 4499; SCALAR-NEXT: movw %ax, 32(%rdx) 4500; SCALAR-NEXT: popq %rbx 4501; SCALAR-NEXT: retq 4502; 4503; SSE2-LABEL: vec384_v8i16: 4504; SSE2: # %bb.0: 4505; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 4506; SSE2-NEXT: pxor (%rdi), %xmm0 4507; SSE2-NEXT: movdqa %xmm0, (%rsi) 4508; SSE2-NEXT: movdqa %xmm0, (%rdx) 4509; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 4510; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 4511; SSE2-NEXT: retq 4512; 4513; AVX-LABEL: vec384_v8i16: 4514; AVX: # %bb.0: 4515; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 4516; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 4517; AVX-NEXT: vmovdqa %xmm0, (%rsi) 4518; AVX-NEXT: vmovdqa %xmm0, (%rdx) 4519; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 4520; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) 4521; AVX-NEXT: retq 4522 %in.subvec.not = load <8 x i16>, ptr %in.subvec.ptr, align 64 4523 %in.subvec = xor <8 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> 4524 store <8 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 4525 %out.subvec0.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 0 4526 store <8 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 4527 %out.subvec1.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 1 4528 store <8 x i16> %in.subvec, ptr %out.subvec1.ptr, align 16 4529 %out.subvec2.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 2 4530 store <8 x i16> %in.subvec, ptr %out.subvec2.ptr, align 32 4531 ret void 4532} 4533 4534define void @vec384_v12i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 4535; SCALAR-LABEL: vec384_v12i8: 4536; SCALAR: # %bb.0: 4537; SCALAR-NEXT: movq (%rdi), %rax 4538; SCALAR-NEXT: movl 8(%rdi), %ecx 4539; SCALAR-NEXT: notl %ecx 4540; SCALAR-NEXT: notq %rax 4541; SCALAR-NEXT: movq %rax, (%rsi) 4542; SCALAR-NEXT: movl %ecx, 8(%rsi) 4543; SCALAR-NEXT: movl %ecx, 8(%rdx) 4544; SCALAR-NEXT: movq %rax, (%rdx) 4545; SCALAR-NEXT: movl %ecx, 24(%rdx) 4546; SCALAR-NEXT: movq %rax, 16(%rdx) 4547; SCALAR-NEXT: movl %ecx, 40(%rdx) 4548; SCALAR-NEXT: movq %rax, 32(%rdx) 4549; SCALAR-NEXT: movl %ecx, 56(%rdx) 4550; SCALAR-NEXT: movq %rax, 48(%rdx) 4551; SCALAR-NEXT: retq 4552; 4553; SSE2-ONLY-LABEL: vec384_v12i8: 4554; SSE2-ONLY: # %bb.0: 4555; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 4556; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0 4557; 
SSE2-ONLY-NEXT: movq %xmm0, (%rsi) 4558; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 4559; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi) 4560; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx) 4561; SSE2-ONLY-NEXT: movq %xmm0, (%rdx) 4562; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx) 4563; SSE2-ONLY-NEXT: movq %xmm0, 16(%rdx) 4564; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx) 4565; SSE2-ONLY-NEXT: movq %xmm0, 32(%rdx) 4566; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx) 4567; SSE2-ONLY-NEXT: movq %xmm0, 48(%rdx) 4568; SSE2-ONLY-NEXT: retq 4569; 4570; SSE3-LABEL: vec384_v12i8: 4571; SSE3: # %bb.0: 4572; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 4573; SSE3-NEXT: pxor (%rdi), %xmm0 4574; SSE3-NEXT: movq %xmm0, (%rsi) 4575; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 4576; SSE3-NEXT: movd %xmm1, 8(%rsi) 4577; SSE3-NEXT: movd %xmm1, 8(%rdx) 4578; SSE3-NEXT: movq %xmm0, (%rdx) 4579; SSE3-NEXT: movd %xmm1, 24(%rdx) 4580; SSE3-NEXT: movq %xmm0, 16(%rdx) 4581; SSE3-NEXT: movd %xmm1, 40(%rdx) 4582; SSE3-NEXT: movq %xmm0, 32(%rdx) 4583; SSE3-NEXT: movd %xmm1, 56(%rdx) 4584; SSE3-NEXT: movq %xmm0, 48(%rdx) 4585; SSE3-NEXT: retq 4586; 4587; SSSE3-ONLY-LABEL: vec384_v12i8: 4588; SSSE3-ONLY: # %bb.0: 4589; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 4590; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0 4591; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi) 4592; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 4593; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi) 4594; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx) 4595; SSSE3-ONLY-NEXT: movq %xmm0, (%rdx) 4596; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx) 4597; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rdx) 4598; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx) 4599; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rdx) 4600; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx) 4601; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rdx) 4602; SSSE3-ONLY-NEXT: retq 4603; 4604; SSE41-LABEL: vec384_v12i8: 4605; SSE41: # %bb.0: 4606; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 4607; SSE41-NEXT: pxor (%rdi), %xmm0 4608; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi) 4609; SSE41-NEXT: movq %xmm0, (%rsi) 4610; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdx) 4611; SSE41-NEXT: movq %xmm0, (%rdx) 4612; SSE41-NEXT: pextrd $2, %xmm0, 24(%rdx) 4613; SSE41-NEXT: movq %xmm0, 16(%rdx) 4614; SSE41-NEXT: pextrd $2, %xmm0, 40(%rdx) 4615; SSE41-NEXT: movq %xmm0, 32(%rdx) 4616; SSE41-NEXT: pextrd $2, %xmm0, 56(%rdx) 4617; SSE41-NEXT: movq %xmm0, 48(%rdx) 4618; SSE41-NEXT: retq 4619; 4620; SSE42-LABEL: vec384_v12i8: 4621; SSE42: # %bb.0: 4622; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 4623; SSE42-NEXT: pxor (%rdi), %xmm0 4624; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi) 4625; SSE42-NEXT: movq %xmm0, (%rsi) 4626; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdx) 4627; SSE42-NEXT: movq %xmm0, (%rdx) 4628; SSE42-NEXT: pextrd $2, %xmm0, 24(%rdx) 4629; SSE42-NEXT: movq %xmm0, 16(%rdx) 4630; SSE42-NEXT: pextrd $2, %xmm0, 40(%rdx) 4631; SSE42-NEXT: movq %xmm0, 32(%rdx) 4632; SSE42-NEXT: pextrd $2, %xmm0, 56(%rdx) 4633; SSE42-NEXT: movq %xmm0, 48(%rdx) 4634; SSE42-NEXT: retq 4635; 4636; AVX-LABEL: vec384_v12i8: 4637; AVX: # %bb.0: 4638; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 4639; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 4640; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi) 4641; AVX-NEXT: vmovq %xmm0, (%rsi) 4642; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdx) 4643; AVX-NEXT: vmovq %xmm0, (%rdx) 4644; AVX-NEXT: vpextrd $2, %xmm0, 24(%rdx) 4645; AVX-NEXT: vmovq %xmm0, 16(%rdx) 4646; AVX-NEXT: vpextrd $2, %xmm0, 40(%rdx) 4647; AVX-NEXT: vmovq %xmm0, 32(%rdx) 4648; AVX-NEXT: vpextrd $2, %xmm0, 56(%rdx) 4649; AVX-NEXT: vmovq %xmm0, 48(%rdx) 4650; AVX-NEXT: retq 4651 %in.subvec.not = load <12 x i8>, ptr %in.subvec.ptr, align 64 
4652 %in.subvec = xor <12 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> 4653 store <12 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 4654 %out.subvec0.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 0 4655 store <12 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 4656 %out.subvec1.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 1 4657 store <12 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4 4658 %out.subvec2.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 2 4659 store <12 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8 4660 %out.subvec3.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 3 4661 store <12 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4 4662 ret void 4663} 4664 4665define void @vec384_v12i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 4666; SCALAR-LABEL: vec384_v12i16: 4667; SCALAR: # %bb.0: 4668; SCALAR-NEXT: movq (%rdi), %rax 4669; SCALAR-NEXT: movq 8(%rdi), %rcx 4670; SCALAR-NEXT: movq 16(%rdi), %rdi 4671; SCALAR-NEXT: notq %rdi 4672; SCALAR-NEXT: notq %rcx 4673; SCALAR-NEXT: notq %rax 4674; SCALAR-NEXT: movq %rax, (%rsi) 4675; SCALAR-NEXT: movq %rcx, 8(%rsi) 4676; SCALAR-NEXT: movq %rdi, 16(%rsi) 4677; SCALAR-NEXT: movq %rax, (%rdx) 4678; SCALAR-NEXT: movq %rcx, 8(%rdx) 4679; SCALAR-NEXT: movq %rdi, 16(%rdx) 4680; SCALAR-NEXT: movq %rdi, 48(%rdx) 4681; SCALAR-NEXT: movq %rcx, 40(%rdx) 4682; SCALAR-NEXT: movq %rax, 32(%rdx) 4683; SCALAR-NEXT: retq 4684; 4685; SSE2-LABEL: vec384_v12i16: 4686; SSE2: # %bb.0: 4687; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 4688; SSE2-NEXT: movdqa 16(%rdi), %xmm1 4689; SSE2-NEXT: pxor %xmm0, %xmm1 4690; SSE2-NEXT: pxor (%rdi), %xmm0 4691; SSE2-NEXT: movdqa %xmm0, (%rsi) 4692; SSE2-NEXT: movq %xmm1, 16(%rsi) 4693; SSE2-NEXT: movq %xmm1, 16(%rdx) 4694; SSE2-NEXT: movdqa %xmm0, (%rdx) 4695; SSE2-NEXT: movq %xmm1, 48(%rdx) 4696; SSE2-NEXT: movdqu %xmm0, 32(%rdx) 4697; SSE2-NEXT: retq 4698; 4699; AVX1-LABEL: vec384_v12i16: 4700; AVX1: # %bb.0: 4701; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 4702; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 4703; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 4704; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4705; AVX1-NEXT: vmovlps %xmm1, 16(%rsi) 4706; AVX1-NEXT: vmovaps %xmm0, (%rsi) 4707; AVX1-NEXT: vmovlps %xmm1, 16(%rdx) 4708; AVX1-NEXT: vmovaps %xmm0, (%rdx) 4709; AVX1-NEXT: vmovlps %xmm1, 48(%rdx) 4710; AVX1-NEXT: vmovups %xmm0, 32(%rdx) 4711; AVX1-NEXT: vzeroupper 4712; AVX1-NEXT: retq 4713; 4714; AVX2-LABEL: vec384_v12i16: 4715; AVX2: # %bb.0: 4716; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 4717; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 4718; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 4719; AVX2-NEXT: vmovq %xmm1, 16(%rsi) 4720; AVX2-NEXT: vmovdqa %xmm0, (%rsi) 4721; AVX2-NEXT: vmovq %xmm1, 16(%rdx) 4722; AVX2-NEXT: vmovdqa %xmm0, (%rdx) 4723; AVX2-NEXT: vmovq %xmm1, 48(%rdx) 4724; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx) 4725; AVX2-NEXT: vzeroupper 4726; AVX2-NEXT: retq 4727 %in.subvec.not = load <12 x i16>, ptr %in.subvec.ptr, align 64 4728 %in.subvec = xor <12 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> 4729 store <12 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 4730 %out.subvec0.ptr = getelementptr <12 x i16>, ptr %out.vec.ptr, i64 0 4731 store <12 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 4732 %out.subvec1.ptr = getelementptr <12 x i16>, ptr %out.vec.ptr, i64 1 4733 store <12 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8 4734 ret void 4735} 4736 
4737define void @vec384_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 4738; SCALAR-LABEL: vec384_v16i8: 4739; SCALAR: # %bb.0: 4740; SCALAR-NEXT: pushq %rbp 4741; SCALAR-NEXT: pushq %r15 4742; SCALAR-NEXT: pushq %r14 4743; SCALAR-NEXT: pushq %r13 4744; SCALAR-NEXT: pushq %r12 4745; SCALAR-NEXT: pushq %rbx 4746; SCALAR-NEXT: movzbl 15(%rdi), %eax 4747; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 4748; SCALAR-NEXT: movzbl 14(%rdi), %eax 4749; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 4750; SCALAR-NEXT: movzbl 13(%rdi), %eax 4751; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 4752; SCALAR-NEXT: movzbl 12(%rdi), %r11d 4753; SCALAR-NEXT: movzbl 11(%rdi), %r13d 4754; SCALAR-NEXT: movzbl 10(%rdi), %r12d 4755; SCALAR-NEXT: movzbl 9(%rdi), %ebp 4756; SCALAR-NEXT: movzbl 8(%rdi), %r14d 4757; SCALAR-NEXT: movzbl 7(%rdi), %ebx 4758; SCALAR-NEXT: movzbl 6(%rdi), %r10d 4759; SCALAR-NEXT: movzbl 5(%rdi), %r15d 4760; SCALAR-NEXT: movzbl 4(%rdi), %r9d 4761; SCALAR-NEXT: movzbl 3(%rdi), %r8d 4762; SCALAR-NEXT: movzbl 2(%rdi), %ecx 4763; SCALAR-NEXT: movzbl (%rdi), %eax 4764; SCALAR-NEXT: movzbl 1(%rdi), %edi 4765; SCALAR-NEXT: notb %al 4766; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 4767; SCALAR-NEXT: notb %dil 4768; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 4769; SCALAR-NEXT: notb %cl 4770; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 4771; SCALAR-NEXT: notb %r8b 4772; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 4773; SCALAR-NEXT: notb %r9b 4774; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 4775; SCALAR-NEXT: movl %r15d, %r9d 4776; SCALAR-NEXT: notb %r9b 4777; SCALAR-NEXT: notb %r10b 4778; SCALAR-NEXT: notb %bl 4779; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 4780; SCALAR-NEXT: notb %r14b 4781; SCALAR-NEXT: notb %bpl 4782; SCALAR-NEXT: movl %ebp, %r15d 4783; SCALAR-NEXT: notb %r12b 4784; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 4785; SCALAR-NEXT: notb %r13b 4786; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 4787; SCALAR-NEXT: notb %r11b 4788; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload 4789; SCALAR-NEXT: notb %dil 4790; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload 4791; SCALAR-NEXT: notb %cl 4792; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 4793; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload 4794; SCALAR-NEXT: notb %r8b 4795; SCALAR-NEXT: movb %r8b, 15(%rsi) 4796; SCALAR-NEXT: movb %cl, 14(%rsi) 4797; SCALAR-NEXT: movl %edi, %eax 4798; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 4799; SCALAR-NEXT: movb %dil, 13(%rsi) 4800; SCALAR-NEXT: movb %r11b, 12(%rsi) 4801; SCALAR-NEXT: movl %r11d, %ebp 4802; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 4803; SCALAR-NEXT: movb %r13b, 11(%rsi) 4804; SCALAR-NEXT: movb %r12b, 10(%rsi) 4805; SCALAR-NEXT: movb %r15b, 9(%rsi) 4806; SCALAR-NEXT: movb %r14b, 8(%rsi) 4807; SCALAR-NEXT: movb %bl, 7(%rsi) 4808; SCALAR-NEXT: movb %r10b, 6(%rsi) 4809; SCALAR-NEXT: movl %r10d, %ebx 4810; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 4811; SCALAR-NEXT: movb %r9b, 5(%rsi) 4812; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload 4813; SCALAR-NEXT: movb %r11b, 4(%rsi) 4814; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload 4815; 
SCALAR-NEXT: movb %r12b, 3(%rsi) 4816; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload 4817; SCALAR-NEXT: movb %cl, 2(%rsi) 4818; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload 4819; SCALAR-NEXT: movb %r13b, 1(%rsi) 4820; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload 4821; SCALAR-NEXT: movb %r10b, (%rsi) 4822; SCALAR-NEXT: movb %r8b, 15(%rdx) 4823; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload 4824; SCALAR-NEXT: movb %dil, 14(%rdx) 4825; SCALAR-NEXT: movb %al, 13(%rdx) 4826; SCALAR-NEXT: movb %bpl, 12(%rdx) 4827; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload 4828; SCALAR-NEXT: movb %al, 11(%rdx) 4829; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload 4830; SCALAR-NEXT: movb %al, 10(%rdx) 4831; SCALAR-NEXT: movb %r15b, 9(%rdx) 4832; SCALAR-NEXT: movb %r14b, 8(%rdx) 4833; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload 4834; SCALAR-NEXT: movb %bpl, 7(%rdx) 4835; SCALAR-NEXT: movb %bl, 6(%rdx) 4836; SCALAR-NEXT: movb %r9b, 5(%rdx) 4837; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 4838; SCALAR-NEXT: movb %r11b, 4(%rdx) 4839; SCALAR-NEXT: movb %r12b, 3(%rdx) 4840; SCALAR-NEXT: movb %cl, 2(%rdx) 4841; SCALAR-NEXT: movl %r13d, %ebx 4842; SCALAR-NEXT: movb %r13b, 1(%rdx) 4843; SCALAR-NEXT: movl %r10d, %esi 4844; SCALAR-NEXT: movb %r10b, (%rdx) 4845; SCALAR-NEXT: movb %r8b, 31(%rdx) 4846; SCALAR-NEXT: movb %dil, 30(%rdx) 4847; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload 4848; SCALAR-NEXT: movb %al, 29(%rdx) 4849; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload 4850; SCALAR-NEXT: movb %r11b, 28(%rdx) 4851; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload 4852; SCALAR-NEXT: movb %r13b, 27(%rdx) 4853; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload 4854; SCALAR-NEXT: movb %r12b, 26(%rdx) 4855; SCALAR-NEXT: movb %r15b, 25(%rdx) 4856; SCALAR-NEXT: movb %r14b, 24(%rdx) 4857; SCALAR-NEXT: movb %bpl, 23(%rdx) 4858; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload 4859; SCALAR-NEXT: movb %r10b, 22(%rdx) 4860; SCALAR-NEXT: movb %r9b, 21(%rdx) 4861; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload 4862; SCALAR-NEXT: movb %r9b, 20(%rdx) 4863; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload 4864; SCALAR-NEXT: movb %dil, 19(%rdx) 4865; SCALAR-NEXT: movb %cl, 18(%rdx) 4866; SCALAR-NEXT: movb %bl, 17(%rdx) 4867; SCALAR-NEXT: movb %sil, 16(%rdx) 4868; SCALAR-NEXT: movb %r8b, 47(%rdx) 4869; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload 4870; SCALAR-NEXT: movb %r8b, 46(%rdx) 4871; SCALAR-NEXT: movb %al, 45(%rdx) 4872; SCALAR-NEXT: movb %r11b, 44(%rdx) 4873; SCALAR-NEXT: movb %r13b, 43(%rdx) 4874; SCALAR-NEXT: movb %r12b, 42(%rdx) 4875; SCALAR-NEXT: movb %r15b, 41(%rdx) 4876; SCALAR-NEXT: movb %r14b, 40(%rdx) 4877; SCALAR-NEXT: movb %bpl, 39(%rdx) 4878; SCALAR-NEXT: movb %r10b, 38(%rdx) 4879; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload 4880; SCALAR-NEXT: movb %al, 37(%rdx) 4881; SCALAR-NEXT: movb %r9b, 36(%rdx) 4882; SCALAR-NEXT: movb %dil, 35(%rdx) 4883; SCALAR-NEXT: movb %cl, 34(%rdx) 4884; SCALAR-NEXT: movb %bl, 33(%rdx) 4885; SCALAR-NEXT: movb %sil, 32(%rdx) 4886; SCALAR-NEXT: popq %rbx 4887; SCALAR-NEXT: popq %r12 4888; SCALAR-NEXT: popq %r13 4889; SCALAR-NEXT: popq 
%r14 4890; SCALAR-NEXT: popq %r15 4891; SCALAR-NEXT: popq %rbp 4892; SCALAR-NEXT: retq 4893; 4894; SSE2-LABEL: vec384_v16i8: 4895; SSE2: # %bb.0: 4896; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 4897; SSE2-NEXT: pxor (%rdi), %xmm0 4898; SSE2-NEXT: movdqa %xmm0, (%rsi) 4899; SSE2-NEXT: movdqa %xmm0, (%rdx) 4900; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 4901; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 4902; SSE2-NEXT: retq 4903; 4904; AVX-LABEL: vec384_v16i8: 4905; AVX: # %bb.0: 4906; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 4907; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 4908; AVX-NEXT: vmovdqa %xmm0, (%rsi) 4909; AVX-NEXT: vmovdqa %xmm0, (%rdx) 4910; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 4911; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) 4912; AVX-NEXT: retq 4913 %in.subvec.not = load <16 x i8>, ptr %in.subvec.ptr, align 64 4914 %in.subvec = xor <16 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> 4915 store <16 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 4916 %out.subvec0.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 0 4917 store <16 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 4918 %out.subvec1.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 1 4919 store <16 x i8> %in.subvec, ptr %out.subvec1.ptr, align 16 4920 %out.subvec2.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 2 4921 store <16 x i8> %in.subvec, ptr %out.subvec2.ptr, align 32 4922 ret void 4923} 4924 4925define void @vec384_v24i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 4926; SCALAR-LABEL: vec384_v24i8: 4927; SCALAR: # %bb.0: 4928; SCALAR-NEXT: movq (%rdi), %rax 4929; SCALAR-NEXT: movq 8(%rdi), %rcx 4930; SCALAR-NEXT: movq 16(%rdi), %rdi 4931; SCALAR-NEXT: notq %rdi 4932; SCALAR-NEXT: notq %rcx 4933; SCALAR-NEXT: notq %rax 4934; SCALAR-NEXT: movq %rax, (%rsi) 4935; SCALAR-NEXT: movq %rcx, 8(%rsi) 4936; SCALAR-NEXT: movq %rdi, 16(%rsi) 4937; SCALAR-NEXT: movq %rax, (%rdx) 4938; SCALAR-NEXT: movq %rcx, 8(%rdx) 4939; SCALAR-NEXT: movq %rdi, 16(%rdx) 4940; SCALAR-NEXT: movq %rdi, 48(%rdx) 4941; SCALAR-NEXT: movq %rcx, 40(%rdx) 4942; SCALAR-NEXT: movq %rax, 32(%rdx) 4943; SCALAR-NEXT: retq 4944; 4945; SSE2-LABEL: vec384_v24i8: 4946; SSE2: # %bb.0: 4947; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 4948; SSE2-NEXT: movdqa 16(%rdi), %xmm1 4949; SSE2-NEXT: pxor %xmm0, %xmm1 4950; SSE2-NEXT: pxor (%rdi), %xmm0 4951; SSE2-NEXT: movdqa %xmm0, (%rsi) 4952; SSE2-NEXT: movq %xmm1, 16(%rsi) 4953; SSE2-NEXT: movq %xmm1, 16(%rdx) 4954; SSE2-NEXT: movdqa %xmm0, (%rdx) 4955; SSE2-NEXT: movq %xmm1, 48(%rdx) 4956; SSE2-NEXT: movdqu %xmm0, 32(%rdx) 4957; SSE2-NEXT: retq 4958; 4959; AVX1-LABEL: vec384_v24i8: 4960; AVX1: # %bb.0: 4961; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 4962; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 4963; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 4964; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4965; AVX1-NEXT: vmovlps %xmm1, 16(%rsi) 4966; AVX1-NEXT: vmovaps %xmm0, (%rsi) 4967; AVX1-NEXT: vmovlps %xmm1, 16(%rdx) 4968; AVX1-NEXT: vmovaps %xmm0, (%rdx) 4969; AVX1-NEXT: vmovlps %xmm1, 48(%rdx) 4970; AVX1-NEXT: vmovups %xmm0, 32(%rdx) 4971; AVX1-NEXT: vzeroupper 4972; AVX1-NEXT: retq 4973; 4974; AVX2-LABEL: vec384_v24i8: 4975; AVX2: # %bb.0: 4976; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 4977; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 4978; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 4979; AVX2-NEXT: vmovq %xmm1, 16(%rsi) 4980; AVX2-NEXT: vmovdqa %xmm0, (%rsi) 4981; AVX2-NEXT: vmovq %xmm1, 16(%rdx) 4982; AVX2-NEXT: vmovdqa %xmm0, (%rdx) 4983; AVX2-NEXT: vmovq %xmm1, 
48(%rdx) 4984; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx) 4985; AVX2-NEXT: vzeroupper 4986; AVX2-NEXT: retq 4987 %in.subvec.not = load <24 x i8>, ptr %in.subvec.ptr, align 64 4988 %in.subvec = xor <24 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> 4989 store <24 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 4990 %out.subvec0.ptr = getelementptr <24 x i8>, ptr %out.vec.ptr, i64 0 4991 store <24 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 4992 %out.subvec1.ptr = getelementptr <24 x i8>, ptr %out.vec.ptr, i64 1 4993 store <24 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8 4994 ret void 4995} 4996 4997define void @vec512_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 4998; SCALAR-LABEL: vec512_v2i8: 4999; SCALAR: # %bb.0: 5000; SCALAR-NEXT: movzbl (%rdi), %eax 5001; SCALAR-NEXT: movzbl 1(%rdi), %ecx 5002; SCALAR-NEXT: notb %al 5003; SCALAR-NEXT: notb %cl 5004; SCALAR-NEXT: movb %cl, 1(%rsi) 5005; SCALAR-NEXT: movb %al, (%rsi) 5006; SCALAR-NEXT: movb %cl, 1(%rdx) 5007; SCALAR-NEXT: movb %al, (%rdx) 5008; SCALAR-NEXT: movb %cl, 3(%rdx) 5009; SCALAR-NEXT: movb %al, 2(%rdx) 5010; SCALAR-NEXT: movb %cl, 5(%rdx) 5011; SCALAR-NEXT: movb %al, 4(%rdx) 5012; SCALAR-NEXT: movb %cl, 7(%rdx) 5013; SCALAR-NEXT: movb %al, 6(%rdx) 5014; SCALAR-NEXT: movb %cl, 9(%rdx) 5015; SCALAR-NEXT: movb %al, 8(%rdx) 5016; SCALAR-NEXT: movb %cl, 11(%rdx) 5017; SCALAR-NEXT: movb %al, 10(%rdx) 5018; SCALAR-NEXT: movb %cl, 13(%rdx) 5019; SCALAR-NEXT: movb %al, 12(%rdx) 5020; SCALAR-NEXT: movb %cl, 15(%rdx) 5021; SCALAR-NEXT: movb %al, 14(%rdx) 5022; SCALAR-NEXT: movb %cl, 17(%rdx) 5023; SCALAR-NEXT: movb %al, 16(%rdx) 5024; SCALAR-NEXT: movb %cl, 19(%rdx) 5025; SCALAR-NEXT: movb %al, 18(%rdx) 5026; SCALAR-NEXT: movb %cl, 21(%rdx) 5027; SCALAR-NEXT: movb %al, 20(%rdx) 5028; SCALAR-NEXT: movb %cl, 23(%rdx) 5029; SCALAR-NEXT: movb %al, 22(%rdx) 5030; SCALAR-NEXT: movb %cl, 25(%rdx) 5031; SCALAR-NEXT: movb %al, 24(%rdx) 5032; SCALAR-NEXT: movb %cl, 27(%rdx) 5033; SCALAR-NEXT: movb %al, 26(%rdx) 5034; SCALAR-NEXT: movb %cl, 29(%rdx) 5035; SCALAR-NEXT: movb %al, 28(%rdx) 5036; SCALAR-NEXT: movb %cl, 31(%rdx) 5037; SCALAR-NEXT: movb %al, 30(%rdx) 5038; SCALAR-NEXT: movb %cl, 33(%rdx) 5039; SCALAR-NEXT: movb %al, 32(%rdx) 5040; SCALAR-NEXT: movb %cl, 35(%rdx) 5041; SCALAR-NEXT: movb %al, 34(%rdx) 5042; SCALAR-NEXT: movb %cl, 37(%rdx) 5043; SCALAR-NEXT: movb %al, 36(%rdx) 5044; SCALAR-NEXT: movb %cl, 39(%rdx) 5045; SCALAR-NEXT: movb %al, 38(%rdx) 5046; SCALAR-NEXT: movb %cl, 41(%rdx) 5047; SCALAR-NEXT: movb %al, 40(%rdx) 5048; SCALAR-NEXT: movb %cl, 43(%rdx) 5049; SCALAR-NEXT: movb %al, 42(%rdx) 5050; SCALAR-NEXT: movb %cl, 45(%rdx) 5051; SCALAR-NEXT: movb %al, 44(%rdx) 5052; SCALAR-NEXT: movb %cl, 47(%rdx) 5053; SCALAR-NEXT: movb %al, 46(%rdx) 5054; SCALAR-NEXT: movb %cl, 49(%rdx) 5055; SCALAR-NEXT: movb %al, 48(%rdx) 5056; SCALAR-NEXT: movb %cl, 51(%rdx) 5057; SCALAR-NEXT: movb %al, 50(%rdx) 5058; SCALAR-NEXT: movb %cl, 53(%rdx) 5059; SCALAR-NEXT: movb %al, 52(%rdx) 5060; SCALAR-NEXT: movb %cl, 55(%rdx) 5061; SCALAR-NEXT: movb %al, 54(%rdx) 5062; SCALAR-NEXT: movb %cl, 57(%rdx) 5063; SCALAR-NEXT: movb %al, 56(%rdx) 5064; SCALAR-NEXT: movb %cl, 59(%rdx) 5065; SCALAR-NEXT: movb %al, 58(%rdx) 5066; SCALAR-NEXT: movb %cl, 61(%rdx) 5067; SCALAR-NEXT: movb %al, 60(%rdx) 5068; SCALAR-NEXT: movb %cl, 63(%rdx) 5069; SCALAR-NEXT: movb %al, 62(%rdx) 5070; SCALAR-NEXT: retq 
5071; 5072; SSE2-ONLY-LABEL: vec512_v2i8: 5073; SSE2-ONLY: # %bb.0: 5074; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 5075; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0 5076; SSE2-ONLY-NEXT: movd %xmm0, %eax 5077; SSE2-ONLY-NEXT: movw %ax, (%rsi) 5078; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 5079; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 5080; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx) 5081; SSE2-ONLY-NEXT: movdqa %xmm0, 16(%rdx) 5082; SSE2-ONLY-NEXT: movdqa %xmm0, 32(%rdx) 5083; SSE2-ONLY-NEXT: movdqa %xmm0, 48(%rdx) 5084; SSE2-ONLY-NEXT: retq 5085; 5086; SSE3-LABEL: vec512_v2i8: 5087; SSE3: # %bb.0: 5088; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 5089; SSE3-NEXT: pxor (%rdi), %xmm0 5090; SSE3-NEXT: movd %xmm0, %eax 5091; SSE3-NEXT: movw %ax, (%rsi) 5092; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 5093; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 5094; SSE3-NEXT: movdqa %xmm0, (%rdx) 5095; SSE3-NEXT: movdqa %xmm0, 16(%rdx) 5096; SSE3-NEXT: movdqa %xmm0, 32(%rdx) 5097; SSE3-NEXT: movdqa %xmm0, 48(%rdx) 5098; SSE3-NEXT: retq 5099; 5100; SSSE3-ONLY-LABEL: vec512_v2i8: 5101; SSSE3-ONLY: # %bb.0: 5102; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0 5103; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0 5104; SSSE3-ONLY-NEXT: movd %xmm0, %eax 5105; SSSE3-ONLY-NEXT: movw %ax, (%rsi) 5106; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 5107; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 5108; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx) 5109; SSSE3-ONLY-NEXT: movdqa %xmm0, 16(%rdx) 5110; SSSE3-ONLY-NEXT: movdqa %xmm0, 32(%rdx) 5111; SSSE3-ONLY-NEXT: movdqa %xmm0, 48(%rdx) 5112; SSSE3-ONLY-NEXT: retq 5113; 5114; SSE41-LABEL: vec512_v2i8: 5115; SSE41: # %bb.0: 5116; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 5117; SSE41-NEXT: pxor (%rdi), %xmm0 5118; SSE41-NEXT: pextrw $0, %xmm0, (%rsi) 5119; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 5120; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 5121; SSE41-NEXT: movdqa %xmm0, (%rdx) 5122; SSE41-NEXT: movdqa %xmm0, 16(%rdx) 5123; SSE41-NEXT: movdqa %xmm0, 32(%rdx) 5124; SSE41-NEXT: movdqa %xmm0, 48(%rdx) 5125; SSE41-NEXT: retq 5126; 5127; SSE42-LABEL: vec512_v2i8: 5128; SSE42: # %bb.0: 5129; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 5130; SSE42-NEXT: pxor (%rdi), %xmm0 5131; SSE42-NEXT: pextrw $0, %xmm0, (%rsi) 5132; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 5133; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 5134; SSE42-NEXT: movdqa %xmm0, (%rdx) 5135; SSE42-NEXT: movdqa %xmm0, 16(%rdx) 5136; SSE42-NEXT: movdqa %xmm0, 32(%rdx) 5137; SSE42-NEXT: movdqa %xmm0, 48(%rdx) 5138; SSE42-NEXT: retq 5139; 5140; AVX1-LABEL: vec512_v2i8: 5141; AVX1: # %bb.0: 5142; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 5143; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 5144; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi) 5145; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 5146; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 5147; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 5148; AVX1-NEXT: vmovaps %ymm0, (%rdx) 5149; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) 5150; AVX1-NEXT: vzeroupper 5151; AVX1-NEXT: retq 5152; 5153; AVX2-ONLY-LABEL: vec512_v2i8: 5154; AVX2-ONLY: # %bb.0: 5155; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 5156; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0 5157; AVX2-ONLY-NEXT: vpextrw $0, %xmm0, (%rsi) 5158; AVX2-ONLY-NEXT: vpbroadcastw %xmm0, %ymm0 5159; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) 5160; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) 5161; AVX2-ONLY-NEXT: vzeroupper 5162; AVX2-ONLY-NEXT: retq 5163; 5164; AVX512F-LABEL: vec512_v2i8: 5165; 
AVX512F: # %bb.0: 5166; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 5167; AVX512F-NEXT: vpxor (%rdi), %xmm0, %xmm0 5168; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) 5169; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 5170; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 5171; AVX512F-NEXT: vmovdqa64 %zmm0, (%rdx) 5172; AVX512F-NEXT: vzeroupper 5173; AVX512F-NEXT: retq 5174; 5175; AVX512BW-LABEL: vec512_v2i8: 5176; AVX512BW: # %bb.0: 5177; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 5178; AVX512BW-NEXT: vpxor (%rdi), %xmm0, %xmm0 5179; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) 5180; AVX512BW-NEXT: vpbroadcastw %xmm0, %zmm0 5181; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) 5182; AVX512BW-NEXT: vzeroupper 5183; AVX512BW-NEXT: retq 5184 %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64 5185 %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1> 5186 store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 5187 %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0 5188 store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 5189 %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1 5190 store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 5191 %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2 5192 store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4 5193 %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3 5194 store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2 5195 %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4 5196 store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8 5197 %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5 5198 store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2 5199 %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6 5200 store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4 5201 %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7 5202 store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2 5203 %out.subvec8.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 8 5204 store <2 x i8> %in.subvec, ptr %out.subvec8.ptr, align 16 5205 %out.subvec9.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 9 5206 store <2 x i8> %in.subvec, ptr %out.subvec9.ptr, align 2 5207 %out.subvec10.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 10 5208 store <2 x i8> %in.subvec, ptr %out.subvec10.ptr, align 4 5209 %out.subvec11.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 11 5210 store <2 x i8> %in.subvec, ptr %out.subvec11.ptr, align 2 5211 %out.subvec12.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 12 5212 store <2 x i8> %in.subvec, ptr %out.subvec12.ptr, align 8 5213 %out.subvec13.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 13 5214 store <2 x i8> %in.subvec, ptr %out.subvec13.ptr, align 2 5215 %out.subvec14.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 14 5216 store <2 x i8> %in.subvec, ptr %out.subvec14.ptr, align 4 5217 %out.subvec15.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 15 5218 store <2 x i8> %in.subvec, ptr %out.subvec15.ptr, align 2 5219 %out.subvec16.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 16 5220 store <2 x i8> %in.subvec, ptr %out.subvec16.ptr, align 32 5221 %out.subvec17.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 17 5222 store <2 x i8> %in.subvec, ptr %out.subvec17.ptr, align 2 5223 %out.subvec18.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 18 5224 store <2 x i8> %in.subvec, ptr %out.subvec18.ptr, align 4 5225 %out.subvec19.ptr = getelementptr <2 x i8>, ptr 
%out.vec.ptr, i64 19 5226 store <2 x i8> %in.subvec, ptr %out.subvec19.ptr, align 2 5227 %out.subvec20.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 20 5228 store <2 x i8> %in.subvec, ptr %out.subvec20.ptr, align 8 5229 %out.subvec21.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 21 5230 store <2 x i8> %in.subvec, ptr %out.subvec21.ptr, align 2 5231 %out.subvec22.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 22 5232 store <2 x i8> %in.subvec, ptr %out.subvec22.ptr, align 4 5233 %out.subvec23.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 23 5234 store <2 x i8> %in.subvec, ptr %out.subvec23.ptr, align 2 5235 %out.subvec24.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 24 5236 store <2 x i8> %in.subvec, ptr %out.subvec24.ptr, align 16 5237 %out.subvec25.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 25 5238 store <2 x i8> %in.subvec, ptr %out.subvec25.ptr, align 2 5239 %out.subvec26.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 26 5240 store <2 x i8> %in.subvec, ptr %out.subvec26.ptr, align 4 5241 %out.subvec27.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 27 5242 store <2 x i8> %in.subvec, ptr %out.subvec27.ptr, align 2 5243 %out.subvec28.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 28 5244 store <2 x i8> %in.subvec, ptr %out.subvec28.ptr, align 8 5245 %out.subvec29.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 29 5246 store <2 x i8> %in.subvec, ptr %out.subvec29.ptr, align 2 5247 %out.subvec30.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 30 5248 store <2 x i8> %in.subvec, ptr %out.subvec30.ptr, align 4 5249 %out.subvec31.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 31 5250 store <2 x i8> %in.subvec, ptr %out.subvec31.ptr, align 2 5251 ret void 5252} 5253 5254define void @vec512_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 5255; SCALAR-LABEL: vec512_v2i16: 5256; SCALAR: # %bb.0: 5257; SCALAR-NEXT: movzwl 2(%rdi), %ecx 5258; SCALAR-NEXT: movl (%rdi), %eax 5259; SCALAR-NEXT: notl %eax 5260; SCALAR-NEXT: notl %ecx 5261; SCALAR-NEXT: movw %cx, 2(%rsi) 5262; SCALAR-NEXT: movw %ax, (%rsi) 5263; SCALAR-NEXT: movw %cx, 2(%rdx) 5264; SCALAR-NEXT: movw %ax, (%rdx) 5265; SCALAR-NEXT: movw %cx, 6(%rdx) 5266; SCALAR-NEXT: movw %ax, 4(%rdx) 5267; SCALAR-NEXT: movw %cx, 10(%rdx) 5268; SCALAR-NEXT: movw %ax, 8(%rdx) 5269; SCALAR-NEXT: movw %cx, 14(%rdx) 5270; SCALAR-NEXT: movw %ax, 12(%rdx) 5271; SCALAR-NEXT: movw %cx, 18(%rdx) 5272; SCALAR-NEXT: movw %ax, 16(%rdx) 5273; SCALAR-NEXT: movw %cx, 22(%rdx) 5274; SCALAR-NEXT: movw %ax, 20(%rdx) 5275; SCALAR-NEXT: movw %cx, 26(%rdx) 5276; SCALAR-NEXT: movw %ax, 24(%rdx) 5277; SCALAR-NEXT: movw %cx, 30(%rdx) 5278; SCALAR-NEXT: movw %ax, 28(%rdx) 5279; SCALAR-NEXT: movw %cx, 34(%rdx) 5280; SCALAR-NEXT: movw %ax, 32(%rdx) 5281; SCALAR-NEXT: movw %cx, 38(%rdx) 5282; SCALAR-NEXT: movw %ax, 36(%rdx) 5283; SCALAR-NEXT: movw %cx, 42(%rdx) 5284; SCALAR-NEXT: movw %ax, 40(%rdx) 5285; SCALAR-NEXT: movw %cx, 46(%rdx) 5286; SCALAR-NEXT: movw %ax, 44(%rdx) 5287; SCALAR-NEXT: movw %cx, 50(%rdx) 5288; SCALAR-NEXT: movw %ax, 48(%rdx) 5289; SCALAR-NEXT: movw %cx, 54(%rdx) 5290; SCALAR-NEXT: movw %ax, 52(%rdx) 5291; SCALAR-NEXT: movw %cx, 58(%rdx) 5292; SCALAR-NEXT: movw %ax, 56(%rdx) 5293; SCALAR-NEXT: movw %cx, 62(%rdx) 5294; SCALAR-NEXT: movw %ax, 60(%rdx) 5295; SCALAR-NEXT: retq 5296; 5297; SSE2-LABEL: vec512_v2i16: 5298; SSE2: # %bb.0: 5299; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 5300; SSE2-NEXT: pxor (%rdi), %xmm0 5301; SSE2-NEXT: movd %xmm0, (%rsi) 5302; SSE2-NEXT: pshufd {{.*#+}} 
xmm0 = xmm0[0,0,0,0] 5303; SSE2-NEXT: movdqa %xmm0, (%rdx) 5304; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 5305; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 5306; SSE2-NEXT: movdqa %xmm0, 48(%rdx) 5307; SSE2-NEXT: retq 5308; 5309; AVX1-LABEL: vec512_v2i16: 5310; AVX1: # %bb.0: 5311; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 5312; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 5313; AVX1-NEXT: vmovd %xmm0, (%rsi) 5314; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 5315; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 5316; AVX1-NEXT: vmovaps %ymm0, (%rdx) 5317; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) 5318; AVX1-NEXT: vzeroupper 5319; AVX1-NEXT: retq 5320; 5321; AVX2-ONLY-LABEL: vec512_v2i16: 5322; AVX2-ONLY: # %bb.0: 5323; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 5324; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0 5325; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi) 5326; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %ymm0 5327; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) 5328; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) 5329; AVX2-ONLY-NEXT: vzeroupper 5330; AVX2-ONLY-NEXT: retq 5331; 5332; AVX512-LABEL: vec512_v2i16: 5333; AVX512: # %bb.0: 5334; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 5335; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0 5336; AVX512-NEXT: vmovd %xmm0, (%rsi) 5337; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0 5338; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) 5339; AVX512-NEXT: vzeroupper 5340; AVX512-NEXT: retq 5341 %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64 5342 %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1> 5343 store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 5344 %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0 5345 store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 5346 %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1 5347 store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4 5348 %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2 5349 store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8 5350 %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3 5351 store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4 5352 %out.subvec4.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 4 5353 store <2 x i16> %in.subvec, ptr %out.subvec4.ptr, align 16 5354 %out.subvec5.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 5 5355 store <2 x i16> %in.subvec, ptr %out.subvec5.ptr, align 4 5356 %out.subvec6.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 6 5357 store <2 x i16> %in.subvec, ptr %out.subvec6.ptr, align 8 5358 %out.subvec7.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 7 5359 store <2 x i16> %in.subvec, ptr %out.subvec7.ptr, align 4 5360 %out.subvec8.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 8 5361 store <2 x i16> %in.subvec, ptr %out.subvec8.ptr, align 32 5362 %out.subvec9.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 9 5363 store <2 x i16> %in.subvec, ptr %out.subvec9.ptr, align 4 5364 %out.subvec10.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 10 5365 store <2 x i16> %in.subvec, ptr %out.subvec10.ptr, align 8 5366 %out.subvec11.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 11 5367 store <2 x i16> %in.subvec, ptr %out.subvec11.ptr, align 4 5368 %out.subvec12.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 12 5369 store <2 x i16> %in.subvec, ptr %out.subvec12.ptr, align 16 5370 %out.subvec13.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 13 5371 store <2 x i16> %in.subvec, ptr %out.subvec13.ptr, align 4 5372 %out.subvec14.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, 
i64 14 5373 store <2 x i16> %in.subvec, ptr %out.subvec14.ptr, align 8 5374 %out.subvec15.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 15 5375 store <2 x i16> %in.subvec, ptr %out.subvec15.ptr, align 4 5376 ret void 5377} 5378 5379define void @vec512_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 5380; SCALAR-LABEL: vec512_v2i32: 5381; SCALAR: # %bb.0: 5382; SCALAR-NEXT: movl (%rdi), %eax 5383; SCALAR-NEXT: movl 4(%rdi), %ecx 5384; SCALAR-NEXT: notl %eax 5385; SCALAR-NEXT: notl %ecx 5386; SCALAR-NEXT: movl %ecx, 4(%rsi) 5387; SCALAR-NEXT: movl %eax, (%rsi) 5388; SCALAR-NEXT: movl %ecx, 4(%rdx) 5389; SCALAR-NEXT: movl %eax, (%rdx) 5390; SCALAR-NEXT: movl %ecx, 12(%rdx) 5391; SCALAR-NEXT: movl %eax, 8(%rdx) 5392; SCALAR-NEXT: movl %ecx, 20(%rdx) 5393; SCALAR-NEXT: movl %eax, 16(%rdx) 5394; SCALAR-NEXT: movl %ecx, 28(%rdx) 5395; SCALAR-NEXT: movl %eax, 24(%rdx) 5396; SCALAR-NEXT: movl %ecx, 36(%rdx) 5397; SCALAR-NEXT: movl %eax, 32(%rdx) 5398; SCALAR-NEXT: movl %ecx, 44(%rdx) 5399; SCALAR-NEXT: movl %eax, 40(%rdx) 5400; SCALAR-NEXT: movl %ecx, 52(%rdx) 5401; SCALAR-NEXT: movl %eax, 48(%rdx) 5402; SCALAR-NEXT: movl %ecx, 60(%rdx) 5403; SCALAR-NEXT: movl %eax, 56(%rdx) 5404; SCALAR-NEXT: retq 5405; 5406; SSE2-LABEL: vec512_v2i32: 5407; SSE2: # %bb.0: 5408; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 5409; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 5410; SSE2-NEXT: pxor %xmm0, %xmm1 5411; SSE2-NEXT: movq %xmm1, (%rsi) 5412; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] 5413; SSE2-NEXT: movdqa %xmm0, (%rdx) 5414; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 5415; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 5416; SSE2-NEXT: movdqa %xmm0, 48(%rdx) 5417; SSE2-NEXT: retq 5418; 5419; AVX1-LABEL: vec512_v2i32: 5420; AVX1: # %bb.0: 5421; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 5422; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 5423; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 5424; AVX1-NEXT: vmovq %xmm0, (%rsi) 5425; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 5426; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 5427; AVX1-NEXT: vmovaps %ymm0, (%rdx) 5428; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) 5429; AVX1-NEXT: vzeroupper 5430; AVX1-NEXT: retq 5431; 5432; AVX2-ONLY-LABEL: vec512_v2i32: 5433; AVX2-ONLY: # %bb.0: 5434; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 5435; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 5436; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 5437; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) 5438; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 5439; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) 5440; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) 5441; AVX2-ONLY-NEXT: vzeroupper 5442; AVX2-ONLY-NEXT: retq 5443; 5444; AVX512-LABEL: vec512_v2i32: 5445; AVX512: # %bb.0: 5446; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 5447; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 5448; AVX512-NEXT: vmovq %xmm0, (%rsi) 5449; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 5450; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) 5451; AVX512-NEXT: vzeroupper 5452; AVX512-NEXT: retq 5453 %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 5454 %in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1> 5455 store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 5456 %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0 5457 store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 5458 %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1 5459 store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8 5460 %out.subvec2.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 2 5461 store <2 x i32> 
%in.subvec, ptr %out.subvec2.ptr, align 16 5462 %out.subvec3.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 3 5463 store <2 x i32> %in.subvec, ptr %out.subvec3.ptr, align 8 5464 %out.subvec4.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 4 5465 store <2 x i32> %in.subvec, ptr %out.subvec4.ptr, align 32 5466 %out.subvec5.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 5 5467 store <2 x i32> %in.subvec, ptr %out.subvec5.ptr, align 8 5468 %out.subvec6.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 6 5469 store <2 x i32> %in.subvec, ptr %out.subvec6.ptr, align 16 5470 %out.subvec7.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 7 5471 store <2 x i32> %in.subvec, ptr %out.subvec7.ptr, align 8 5472 ret void 5473} 5474 5475define void @vec512_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 5476; SCALAR-LABEL: vec512_v2f32: 5477; SCALAR: # %bb.0: 5478; SCALAR-NEXT: movl (%rdi), %eax 5479; SCALAR-NEXT: movl 4(%rdi), %ecx 5480; SCALAR-NEXT: notl %eax 5481; SCALAR-NEXT: notl %ecx 5482; SCALAR-NEXT: movl %ecx, 4(%rsi) 5483; SCALAR-NEXT: movl %eax, (%rsi) 5484; SCALAR-NEXT: movl %ecx, 4(%rdx) 5485; SCALAR-NEXT: movl %eax, (%rdx) 5486; SCALAR-NEXT: movl %ecx, 12(%rdx) 5487; SCALAR-NEXT: movl %eax, 8(%rdx) 5488; SCALAR-NEXT: movl %ecx, 20(%rdx) 5489; SCALAR-NEXT: movl %eax, 16(%rdx) 5490; SCALAR-NEXT: movl %ecx, 28(%rdx) 5491; SCALAR-NEXT: movl %eax, 24(%rdx) 5492; SCALAR-NEXT: movl %ecx, 36(%rdx) 5493; SCALAR-NEXT: movl %eax, 32(%rdx) 5494; SCALAR-NEXT: movl %ecx, 44(%rdx) 5495; SCALAR-NEXT: movl %eax, 40(%rdx) 5496; SCALAR-NEXT: movl %ecx, 52(%rdx) 5497; SCALAR-NEXT: movl %eax, 48(%rdx) 5498; SCALAR-NEXT: movl %ecx, 60(%rdx) 5499; SCALAR-NEXT: movl %eax, 56(%rdx) 5500; SCALAR-NEXT: retq 5501; 5502; SSE2-LABEL: vec512_v2f32: 5503; SSE2: # %bb.0: 5504; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 5505; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 5506; SSE2-NEXT: pxor %xmm0, %xmm1 5507; SSE2-NEXT: movq %xmm1, (%rsi) 5508; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] 5509; SSE2-NEXT: movdqa %xmm0, (%rdx) 5510; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 5511; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 5512; SSE2-NEXT: movdqa %xmm0, 48(%rdx) 5513; SSE2-NEXT: retq 5514; 5515; AVX1-LABEL: vec512_v2f32: 5516; AVX1: # %bb.0: 5517; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 5518; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 5519; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 5520; AVX1-NEXT: vmovq %xmm0, (%rsi) 5521; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 5522; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 5523; AVX1-NEXT: vmovaps %ymm0, (%rdx) 5524; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) 5525; AVX1-NEXT: vzeroupper 5526; AVX1-NEXT: retq 5527; 5528; AVX2-ONLY-LABEL: vec512_v2f32: 5529; AVX2-ONLY: # %bb.0: 5530; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 5531; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 5532; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 5533; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) 5534; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 5535; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) 5536; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) 5537; AVX2-ONLY-NEXT: vzeroupper 5538; AVX2-ONLY-NEXT: retq 5539; 5540; AVX512-LABEL: vec512_v2f32: 5541; AVX512: # %bb.0: 5542; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 5543; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 5544; AVX512-NEXT: vmovq %xmm0, (%rsi) 5545; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 5546; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) 5547; AVX512-NEXT: vzeroupper 5548; AVX512-NEXT: retq 5549 %in.subvec.not = load <2 x i32>, ptr 
%in.subvec.ptr, align 64 5550 %in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1> 5551 %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> 5552 store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64 5553 %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0 5554 store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 5555 %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1 5556 store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8 5557 %out.subvec2.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 2 5558 store <2 x float> %in.subvec, ptr %out.subvec2.ptr, align 16 5559 %out.subvec3.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 3 5560 store <2 x float> %in.subvec, ptr %out.subvec3.ptr, align 8 5561 %out.subvec4.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 4 5562 store <2 x float> %in.subvec, ptr %out.subvec4.ptr, align 32 5563 %out.subvec5.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 5 5564 store <2 x float> %in.subvec, ptr %out.subvec5.ptr, align 8 5565 %out.subvec6.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 6 5566 store <2 x float> %in.subvec, ptr %out.subvec6.ptr, align 16 5567 %out.subvec7.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 7 5568 store <2 x float> %in.subvec, ptr %out.subvec7.ptr, align 8 5569 ret void 5570} 5571 5572define void @vec512_v2i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 5573; SCALAR-LABEL: vec512_v2i64: 5574; SCALAR: # %bb.0: 5575; SCALAR-NEXT: movq (%rdi), %rax 5576; SCALAR-NEXT: movq 8(%rdi), %rcx 5577; SCALAR-NEXT: notq %rax 5578; SCALAR-NEXT: notq %rcx 5579; SCALAR-NEXT: movq %rcx, 8(%rsi) 5580; SCALAR-NEXT: movq %rax, (%rsi) 5581; SCALAR-NEXT: movq %rcx, 8(%rdx) 5582; SCALAR-NEXT: movq %rax, (%rdx) 5583; SCALAR-NEXT: movq %rcx, 24(%rdx) 5584; SCALAR-NEXT: movq %rax, 16(%rdx) 5585; SCALAR-NEXT: movq %rcx, 40(%rdx) 5586; SCALAR-NEXT: movq %rax, 32(%rdx) 5587; SCALAR-NEXT: movq %rcx, 56(%rdx) 5588; SCALAR-NEXT: movq %rax, 48(%rdx) 5589; SCALAR-NEXT: retq 5590; 5591; SSE2-LABEL: vec512_v2i64: 5592; SSE2: # %bb.0: 5593; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 5594; SSE2-NEXT: pxor (%rdi), %xmm0 5595; SSE2-NEXT: movdqa %xmm0, (%rsi) 5596; SSE2-NEXT: movdqa %xmm0, (%rdx) 5597; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 5598; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 5599; SSE2-NEXT: movdqa %xmm0, 48(%rdx) 5600; SSE2-NEXT: retq 5601; 5602; AVX-LABEL: vec512_v2i64: 5603; AVX: # %bb.0: 5604; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 5605; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 5606; AVX-NEXT: vmovdqa %xmm0, (%rsi) 5607; AVX-NEXT: vmovdqa %xmm0, (%rdx) 5608; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 5609; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) 5610; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) 5611; AVX-NEXT: retq 5612 %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64 5613 %in.subvec = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1> 5614 store <2 x i64> %in.subvec, ptr %out.subvec.ptr, align 64 5615 %out.subvec0.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 0 5616 store <2 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64 5617 %out.subvec1.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 1 5618 store <2 x i64> %in.subvec, ptr %out.subvec1.ptr, align 16 5619 %out.subvec2.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 2 5620 store <2 x i64> %in.subvec, ptr %out.subvec2.ptr, align 32 5621 %out.subvec3.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 3 5622 store <2 x i64> %in.subvec, ptr %out.subvec3.ptr, align 16 5623 ret void 5624} 5625 
5626define void @vec512_v2f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 5627; SCALAR-LABEL: vec512_v2f64: 5628; SCALAR: # %bb.0: 5629; SCALAR-NEXT: movq (%rdi), %rax 5630; SCALAR-NEXT: movq 8(%rdi), %rcx 5631; SCALAR-NEXT: notq %rax 5632; SCALAR-NEXT: notq %rcx 5633; SCALAR-NEXT: movq %rcx, 8(%rsi) 5634; SCALAR-NEXT: movq %rax, (%rsi) 5635; SCALAR-NEXT: movq %rcx, 8(%rdx) 5636; SCALAR-NEXT: movq %rax, (%rdx) 5637; SCALAR-NEXT: movq %rcx, 24(%rdx) 5638; SCALAR-NEXT: movq %rax, 16(%rdx) 5639; SCALAR-NEXT: movq %rcx, 40(%rdx) 5640; SCALAR-NEXT: movq %rax, 32(%rdx) 5641; SCALAR-NEXT: movq %rcx, 56(%rdx) 5642; SCALAR-NEXT: movq %rax, 48(%rdx) 5643; SCALAR-NEXT: retq 5644; 5645; SSE2-LABEL: vec512_v2f64: 5646; SSE2: # %bb.0: 5647; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 5648; SSE2-NEXT: pxor (%rdi), %xmm0 5649; SSE2-NEXT: movdqa %xmm0, (%rsi) 5650; SSE2-NEXT: movdqa %xmm0, (%rdx) 5651; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 5652; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 5653; SSE2-NEXT: movdqa %xmm0, 48(%rdx) 5654; SSE2-NEXT: retq 5655; 5656; AVX-LABEL: vec512_v2f64: 5657; AVX: # %bb.0: 5658; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 5659; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 5660; AVX-NEXT: vmovdqa %xmm0, (%rsi) 5661; AVX-NEXT: vmovdqa %xmm0, (%rdx) 5662; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 5663; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) 5664; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) 5665; AVX-NEXT: retq 5666 %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64 5667 %in.subvec.int = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1> 5668 %in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double> 5669 store <2 x double> %in.subvec, ptr %out.subvec.ptr, align 64 5670 %out.subvec0.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 0 5671 store <2 x double> %in.subvec, ptr %out.subvec0.ptr, align 64 5672 %out.subvec1.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 1 5673 store <2 x double> %in.subvec, ptr %out.subvec1.ptr, align 16 5674 %out.subvec2.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 2 5675 store <2 x double> %in.subvec, ptr %out.subvec2.ptr, align 32 5676 %out.subvec3.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 3 5677 store <2 x double> %in.subvec, ptr %out.subvec3.ptr, align 16 5678 ret void 5679} 5680 5681define void @vec512_v2i128(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 5682; ALL-LABEL: vec512_v2i128: 5683; ALL: # %bb.0: 5684; ALL-NEXT: movq 16(%rdi), %rax 5685; ALL-NEXT: movq 24(%rdi), %rcx 5686; ALL-NEXT: movq (%rdi), %r8 5687; ALL-NEXT: movq 8(%rdi), %rdi 5688; ALL-NEXT: notq %rdi 5689; ALL-NEXT: notq %r8 5690; ALL-NEXT: notq %rcx 5691; ALL-NEXT: notq %rax 5692; ALL-NEXT: movq %rax, 16(%rsi) 5693; ALL-NEXT: movq %rcx, 24(%rsi) 5694; ALL-NEXT: movq %r8, (%rsi) 5695; ALL-NEXT: movq %rdi, 8(%rsi) 5696; ALL-NEXT: movq %rax, 16(%rdx) 5697; ALL-NEXT: movq %rcx, 24(%rdx) 5698; ALL-NEXT: movq %r8, (%rdx) 5699; ALL-NEXT: movq %rdi, 8(%rdx) 5700; ALL-NEXT: movq %rax, 48(%rdx) 5701; ALL-NEXT: movq %rcx, 56(%rdx) 5702; ALL-NEXT: movq %r8, 32(%rdx) 5703; ALL-NEXT: movq %rdi, 40(%rdx) 5704; ALL-NEXT: retq 5705 %in.subvec.not = load <2 x i128>, ptr %in.subvec.ptr, align 64 5706 %in.subvec = xor <2 x i128> %in.subvec.not, <i128 -1, i128 -1> 5707 store <2 x i128> %in.subvec, ptr %out.subvec.ptr, align 64 5708 %out.subvec0.ptr = getelementptr <2 x i128>, ptr %out.vec.ptr, i64 0 5709 store <2 x i128> %in.subvec, ptr %out.subvec0.ptr, align 64 5710 %out.subvec1.ptr = getelementptr <2 x i128>, ptr %out.vec.ptr, i64 1 5711 store <2 x i128> 
%in.subvec, ptr %out.subvec1.ptr, align 32 5712 ret void 5713} 5714 5715define void @vec512_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 5716; SCALAR-LABEL: vec512_v4i8: 5717; SCALAR: # %bb.0: 5718; SCALAR-NEXT: movzbl 3(%rdi), %r8d 5719; SCALAR-NEXT: movzbl 2(%rdi), %ecx 5720; SCALAR-NEXT: movzbl (%rdi), %eax 5721; SCALAR-NEXT: movzbl 1(%rdi), %edi 5722; SCALAR-NEXT: notb %al 5723; SCALAR-NEXT: notb %dil 5724; SCALAR-NEXT: notb %cl 5725; SCALAR-NEXT: notb %r8b 5726; SCALAR-NEXT: movb %r8b, 3(%rsi) 5727; SCALAR-NEXT: movb %cl, 2(%rsi) 5728; SCALAR-NEXT: movb %dil, 1(%rsi) 5729; SCALAR-NEXT: movb %al, (%rsi) 5730; SCALAR-NEXT: movb %r8b, 3(%rdx) 5731; SCALAR-NEXT: movb %cl, 2(%rdx) 5732; SCALAR-NEXT: movb %dil, 1(%rdx) 5733; SCALAR-NEXT: movb %al, (%rdx) 5734; SCALAR-NEXT: movb %r8b, 7(%rdx) 5735; SCALAR-NEXT: movb %cl, 6(%rdx) 5736; SCALAR-NEXT: movb %dil, 5(%rdx) 5737; SCALAR-NEXT: movb %al, 4(%rdx) 5738; SCALAR-NEXT: movb %r8b, 11(%rdx) 5739; SCALAR-NEXT: movb %cl, 10(%rdx) 5740; SCALAR-NEXT: movb %dil, 9(%rdx) 5741; SCALAR-NEXT: movb %al, 8(%rdx) 5742; SCALAR-NEXT: movb %r8b, 15(%rdx) 5743; SCALAR-NEXT: movb %cl, 14(%rdx) 5744; SCALAR-NEXT: movb %dil, 13(%rdx) 5745; SCALAR-NEXT: movb %al, 12(%rdx) 5746; SCALAR-NEXT: movb %r8b, 19(%rdx) 5747; SCALAR-NEXT: movb %cl, 18(%rdx) 5748; SCALAR-NEXT: movb %dil, 17(%rdx) 5749; SCALAR-NEXT: movb %al, 16(%rdx) 5750; SCALAR-NEXT: movb %r8b, 23(%rdx) 5751; SCALAR-NEXT: movb %cl, 22(%rdx) 5752; SCALAR-NEXT: movb %dil, 21(%rdx) 5753; SCALAR-NEXT: movb %al, 20(%rdx) 5754; SCALAR-NEXT: movb %r8b, 27(%rdx) 5755; SCALAR-NEXT: movb %cl, 26(%rdx) 5756; SCALAR-NEXT: movb %dil, 25(%rdx) 5757; SCALAR-NEXT: movb %al, 24(%rdx) 5758; SCALAR-NEXT: movb %r8b, 31(%rdx) 5759; SCALAR-NEXT: movb %cl, 30(%rdx) 5760; SCALAR-NEXT: movb %dil, 29(%rdx) 5761; SCALAR-NEXT: movb %al, 28(%rdx) 5762; SCALAR-NEXT: movb %r8b, 35(%rdx) 5763; SCALAR-NEXT: movb %cl, 34(%rdx) 5764; SCALAR-NEXT: movb %dil, 33(%rdx) 5765; SCALAR-NEXT: movb %al, 32(%rdx) 5766; SCALAR-NEXT: movb %r8b, 39(%rdx) 5767; SCALAR-NEXT: movb %cl, 38(%rdx) 5768; SCALAR-NEXT: movb %dil, 37(%rdx) 5769; SCALAR-NEXT: movb %al, 36(%rdx) 5770; SCALAR-NEXT: movb %r8b, 43(%rdx) 5771; SCALAR-NEXT: movb %cl, 42(%rdx) 5772; SCALAR-NEXT: movb %dil, 41(%rdx) 5773; SCALAR-NEXT: movb %al, 40(%rdx) 5774; SCALAR-NEXT: movb %r8b, 47(%rdx) 5775; SCALAR-NEXT: movb %cl, 46(%rdx) 5776; SCALAR-NEXT: movb %dil, 45(%rdx) 5777; SCALAR-NEXT: movb %al, 44(%rdx) 5778; SCALAR-NEXT: movb %r8b, 51(%rdx) 5779; SCALAR-NEXT: movb %cl, 50(%rdx) 5780; SCALAR-NEXT: movb %dil, 49(%rdx) 5781; SCALAR-NEXT: movb %al, 48(%rdx) 5782; SCALAR-NEXT: movb %r8b, 55(%rdx) 5783; SCALAR-NEXT: movb %cl, 54(%rdx) 5784; SCALAR-NEXT: movb %dil, 53(%rdx) 5785; SCALAR-NEXT: movb %al, 52(%rdx) 5786; SCALAR-NEXT: movb %r8b, 59(%rdx) 5787; SCALAR-NEXT: movb %cl, 58(%rdx) 5788; SCALAR-NEXT: movb %dil, 57(%rdx) 5789; SCALAR-NEXT: movb %al, 56(%rdx) 5790; SCALAR-NEXT: movb %r8b, 63(%rdx) 5791; SCALAR-NEXT: movb %cl, 62(%rdx) 5792; SCALAR-NEXT: movb %dil, 61(%rdx) 5793; SCALAR-NEXT: movb %al, 60(%rdx) 5794; SCALAR-NEXT: retq 5795; 5796; SSE2-LABEL: vec512_v4i8: 5797; SSE2: # %bb.0: 5798; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 5799; SSE2-NEXT: pxor (%rdi), %xmm0 5800; SSE2-NEXT: movd %xmm0, (%rsi) 5801; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 5802; SSE2-NEXT: movdqa %xmm0, (%rdx) 5803; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 5804; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 5805; SSE2-NEXT: movdqa %xmm0, 48(%rdx) 5806; SSE2-NEXT: retq 5807; 5808; AVX1-LABEL: 
vec512_v4i8: 5809; AVX1: # %bb.0: 5810; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 5811; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 5812; AVX1-NEXT: vmovd %xmm0, (%rsi) 5813; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 5814; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 5815; AVX1-NEXT: vmovaps %ymm0, (%rdx) 5816; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) 5817; AVX1-NEXT: vzeroupper 5818; AVX1-NEXT: retq 5819; 5820; AVX2-ONLY-LABEL: vec512_v4i8: 5821; AVX2-ONLY: # %bb.0: 5822; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 5823; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0 5824; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi) 5825; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %ymm0 5826; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) 5827; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) 5828; AVX2-ONLY-NEXT: vzeroupper 5829; AVX2-ONLY-NEXT: retq 5830; 5831; AVX512-LABEL: vec512_v4i8: 5832; AVX512: # %bb.0: 5833; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 5834; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0 5835; AVX512-NEXT: vmovd %xmm0, (%rsi) 5836; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0 5837; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) 5838; AVX512-NEXT: vzeroupper 5839; AVX512-NEXT: retq 5840 %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64 5841 %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1> 5842 store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 5843 %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0 5844 store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 5845 %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1 5846 store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4 5847 %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2 5848 store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8 5849 %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3 5850 store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4 5851 %out.subvec4.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 4 5852 store <4 x i8> %in.subvec, ptr %out.subvec4.ptr, align 16 5853 %out.subvec5.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 5 5854 store <4 x i8> %in.subvec, ptr %out.subvec5.ptr, align 4 5855 %out.subvec6.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 6 5856 store <4 x i8> %in.subvec, ptr %out.subvec6.ptr, align 8 5857 %out.subvec7.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 7 5858 store <4 x i8> %in.subvec, ptr %out.subvec7.ptr, align 4 5859 %out.subvec8.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 8 5860 store <4 x i8> %in.subvec, ptr %out.subvec8.ptr, align 32 5861 %out.subvec9.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 9 5862 store <4 x i8> %in.subvec, ptr %out.subvec9.ptr, align 4 5863 %out.subvec10.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 10 5864 store <4 x i8> %in.subvec, ptr %out.subvec10.ptr, align 8 5865 %out.subvec11.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 11 5866 store <4 x i8> %in.subvec, ptr %out.subvec11.ptr, align 4 5867 %out.subvec12.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 12 5868 store <4 x i8> %in.subvec, ptr %out.subvec12.ptr, align 16 5869 %out.subvec13.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 13 5870 store <4 x i8> %in.subvec, ptr %out.subvec13.ptr, align 4 5871 %out.subvec14.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 14 5872 store <4 x i8> %in.subvec, ptr %out.subvec14.ptr, align 8 5873 %out.subvec15.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 15 5874 store <4 x i8> %in.subvec, ptr %out.subvec15.ptr, align 4 5875 ret void 5876} 5877 5878define void 
@vec512_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 5879; SCALAR-LABEL: vec512_v4i16: 5880; SCALAR: # %bb.0: 5881; SCALAR-NEXT: movzwl 6(%rdi), %r8d 5882; SCALAR-NEXT: movzwl 2(%rdi), %ecx 5883; SCALAR-NEXT: movl (%rdi), %eax 5884; SCALAR-NEXT: movl 4(%rdi), %edi 5885; SCALAR-NEXT: notl %eax 5886; SCALAR-NEXT: notl %ecx 5887; SCALAR-NEXT: notl %edi 5888; SCALAR-NEXT: notl %r8d 5889; SCALAR-NEXT: movw %r8w, 6(%rsi) 5890; SCALAR-NEXT: movw %di, 4(%rsi) 5891; SCALAR-NEXT: movw %cx, 2(%rsi) 5892; SCALAR-NEXT: movw %ax, (%rsi) 5893; SCALAR-NEXT: movw %r8w, 6(%rdx) 5894; SCALAR-NEXT: movw %di, 4(%rdx) 5895; SCALAR-NEXT: movw %cx, 2(%rdx) 5896; SCALAR-NEXT: movw %ax, (%rdx) 5897; SCALAR-NEXT: movw %r8w, 14(%rdx) 5898; SCALAR-NEXT: movw %di, 12(%rdx) 5899; SCALAR-NEXT: movw %cx, 10(%rdx) 5900; SCALAR-NEXT: movw %ax, 8(%rdx) 5901; SCALAR-NEXT: movw %r8w, 22(%rdx) 5902; SCALAR-NEXT: movw %di, 20(%rdx) 5903; SCALAR-NEXT: movw %cx, 18(%rdx) 5904; SCALAR-NEXT: movw %ax, 16(%rdx) 5905; SCALAR-NEXT: movw %r8w, 30(%rdx) 5906; SCALAR-NEXT: movw %di, 28(%rdx) 5907; SCALAR-NEXT: movw %cx, 26(%rdx) 5908; SCALAR-NEXT: movw %ax, 24(%rdx) 5909; SCALAR-NEXT: movw %r8w, 38(%rdx) 5910; SCALAR-NEXT: movw %di, 36(%rdx) 5911; SCALAR-NEXT: movw %cx, 34(%rdx) 5912; SCALAR-NEXT: movw %ax, 32(%rdx) 5913; SCALAR-NEXT: movw %r8w, 46(%rdx) 5914; SCALAR-NEXT: movw %di, 44(%rdx) 5915; SCALAR-NEXT: movw %cx, 42(%rdx) 5916; SCALAR-NEXT: movw %ax, 40(%rdx) 5917; SCALAR-NEXT: movw %r8w, 54(%rdx) 5918; SCALAR-NEXT: movw %di, 52(%rdx) 5919; SCALAR-NEXT: movw %cx, 50(%rdx) 5920; SCALAR-NEXT: movw %ax, 48(%rdx) 5921; SCALAR-NEXT: movw %r8w, 62(%rdx) 5922; SCALAR-NEXT: movw %di, 60(%rdx) 5923; SCALAR-NEXT: movw %cx, 58(%rdx) 5924; SCALAR-NEXT: movw %ax, 56(%rdx) 5925; SCALAR-NEXT: retq 5926; 5927; SSE2-LABEL: vec512_v4i16: 5928; SSE2: # %bb.0: 5929; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 5930; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 5931; SSE2-NEXT: pxor %xmm0, %xmm1 5932; SSE2-NEXT: movq %xmm1, (%rsi) 5933; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] 5934; SSE2-NEXT: movdqa %xmm0, (%rdx) 5935; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 5936; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 5937; SSE2-NEXT: movdqa %xmm0, 48(%rdx) 5938; SSE2-NEXT: retq 5939; 5940; AVX1-LABEL: vec512_v4i16: 5941; AVX1: # %bb.0: 5942; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 5943; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 5944; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 5945; AVX1-NEXT: vmovq %xmm0, (%rsi) 5946; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 5947; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 5948; AVX1-NEXT: vmovaps %ymm0, (%rdx) 5949; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) 5950; AVX1-NEXT: vzeroupper 5951; AVX1-NEXT: retq 5952; 5953; AVX2-ONLY-LABEL: vec512_v4i16: 5954; AVX2-ONLY: # %bb.0: 5955; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 5956; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 5957; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 5958; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) 5959; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 5960; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) 5961; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) 5962; AVX2-ONLY-NEXT: vzeroupper 5963; AVX2-ONLY-NEXT: retq 5964; 5965; AVX512-LABEL: vec512_v4i16: 5966; AVX512: # %bb.0: 5967; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 5968; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 5969; AVX512-NEXT: vmovq %xmm0, (%rsi) 5970; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 5971; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) 5972; AVX512-NEXT: vzeroupper 5973; AVX512-NEXT: retq 5974 
%in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64 5975 %in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1> 5976 store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 5977 %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0 5978 store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 5979 %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1 5980 store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8 5981 %out.subvec2.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 2 5982 store <4 x i16> %in.subvec, ptr %out.subvec2.ptr, align 16 5983 %out.subvec3.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 3 5984 store <4 x i16> %in.subvec, ptr %out.subvec3.ptr, align 8 5985 %out.subvec4.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 4 5986 store <4 x i16> %in.subvec, ptr %out.subvec4.ptr, align 32 5987 %out.subvec5.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 5 5988 store <4 x i16> %in.subvec, ptr %out.subvec5.ptr, align 8 5989 %out.subvec6.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 6 5990 store <4 x i16> %in.subvec, ptr %out.subvec6.ptr, align 16 5991 %out.subvec7.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 7 5992 store <4 x i16> %in.subvec, ptr %out.subvec7.ptr, align 8 5993 ret void 5994} 5995 5996define void @vec512_v4i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 5997; SCALAR-LABEL: vec512_v4i32: 5998; SCALAR: # %bb.0: 5999; SCALAR-NEXT: movaps (%rdi), %xmm0 6000; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 6001; SCALAR-NEXT: movaps %xmm0, (%rsi) 6002; SCALAR-NEXT: movaps %xmm0, (%rdx) 6003; SCALAR-NEXT: movaps %xmm0, 16(%rdx) 6004; SCALAR-NEXT: movaps %xmm0, 32(%rdx) 6005; SCALAR-NEXT: movaps %xmm0, 48(%rdx) 6006; SCALAR-NEXT: retq 6007; 6008; SSE2-LABEL: vec512_v4i32: 6009; SSE2: # %bb.0: 6010; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 6011; SSE2-NEXT: pxor (%rdi), %xmm0 6012; SSE2-NEXT: movdqa %xmm0, (%rsi) 6013; SSE2-NEXT: movdqa %xmm0, (%rdx) 6014; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 6015; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 6016; SSE2-NEXT: movdqa %xmm0, 48(%rdx) 6017; SSE2-NEXT: retq 6018; 6019; AVX-LABEL: vec512_v4i32: 6020; AVX: # %bb.0: 6021; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 6022; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 6023; AVX-NEXT: vmovdqa %xmm0, (%rsi) 6024; AVX-NEXT: vmovdqa %xmm0, (%rdx) 6025; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 6026; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) 6027; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) 6028; AVX-NEXT: retq 6029 %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64 6030 %in.subvec = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1> 6031 store <4 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 6032 %out.subvec0.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 0 6033 store <4 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 6034 %out.subvec1.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 1 6035 store <4 x i32> %in.subvec, ptr %out.subvec1.ptr, align 16 6036 %out.subvec2.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 2 6037 store <4 x i32> %in.subvec, ptr %out.subvec2.ptr, align 32 6038 %out.subvec3.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 3 6039 store <4 x i32> %in.subvec, ptr %out.subvec3.ptr, align 16 6040 ret void 6041} 6042 6043define void @vec512_v4f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 6044; SCALAR-LABEL: vec512_v4f32: 6045; SCALAR: # %bb.0: 6046; SCALAR-NEXT: movaps (%rdi), %xmm0 6047; SCALAR-NEXT: xorps 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 6048; SCALAR-NEXT: movaps %xmm0, (%rsi) 6049; SCALAR-NEXT: movaps %xmm0, (%rdx) 6050; SCALAR-NEXT: movaps %xmm0, 16(%rdx) 6051; SCALAR-NEXT: movaps %xmm0, 32(%rdx) 6052; SCALAR-NEXT: movaps %xmm0, 48(%rdx) 6053; SCALAR-NEXT: retq 6054; 6055; SSE2-LABEL: vec512_v4f32: 6056; SSE2: # %bb.0: 6057; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 6058; SSE2-NEXT: pxor (%rdi), %xmm0 6059; SSE2-NEXT: movdqa %xmm0, (%rsi) 6060; SSE2-NEXT: movdqa %xmm0, (%rdx) 6061; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 6062; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 6063; SSE2-NEXT: movdqa %xmm0, 48(%rdx) 6064; SSE2-NEXT: retq 6065; 6066; AVX-LABEL: vec512_v4f32: 6067; AVX: # %bb.0: 6068; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 6069; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 6070; AVX-NEXT: vmovdqa %xmm0, (%rsi) 6071; AVX-NEXT: vmovdqa %xmm0, (%rdx) 6072; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 6073; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) 6074; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) 6075; AVX-NEXT: retq 6076 %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64 6077 %in.subvec.int = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1> 6078 %in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float> 6079 store <4 x float> %in.subvec, ptr %out.subvec.ptr, align 64 6080 %out.subvec0.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 0 6081 store <4 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 6082 %out.subvec1.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 1 6083 store <4 x float> %in.subvec, ptr %out.subvec1.ptr, align 16 6084 %out.subvec2.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 2 6085 store <4 x float> %in.subvec, ptr %out.subvec2.ptr, align 32 6086 %out.subvec3.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 3 6087 store <4 x float> %in.subvec, ptr %out.subvec3.ptr, align 16 6088 ret void 6089} 6090 6091define void @vec512_v4i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 6092; SCALAR-LABEL: vec512_v4i64: 6093; SCALAR: # %bb.0: 6094; SCALAR-NEXT: movq 24(%rdi), %rax 6095; SCALAR-NEXT: movq 16(%rdi), %rcx 6096; SCALAR-NEXT: movq (%rdi), %r8 6097; SCALAR-NEXT: movq 8(%rdi), %rdi 6098; SCALAR-NEXT: notq %r8 6099; SCALAR-NEXT: notq %rdi 6100; SCALAR-NEXT: notq %rcx 6101; SCALAR-NEXT: notq %rax 6102; SCALAR-NEXT: movq %rax, 24(%rsi) 6103; SCALAR-NEXT: movq %rcx, 16(%rsi) 6104; SCALAR-NEXT: movq %rdi, 8(%rsi) 6105; SCALAR-NEXT: movq %r8, (%rsi) 6106; SCALAR-NEXT: movq %rax, 24(%rdx) 6107; SCALAR-NEXT: movq %rcx, 16(%rdx) 6108; SCALAR-NEXT: movq %rdi, 8(%rdx) 6109; SCALAR-NEXT: movq %r8, (%rdx) 6110; SCALAR-NEXT: movq %rax, 56(%rdx) 6111; SCALAR-NEXT: movq %rcx, 48(%rdx) 6112; SCALAR-NEXT: movq %rdi, 40(%rdx) 6113; SCALAR-NEXT: movq %r8, 32(%rdx) 6114; SCALAR-NEXT: retq 6115; 6116; SSE2-LABEL: vec512_v4i64: 6117; SSE2: # %bb.0: 6118; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 6119; SSE2-NEXT: movdqa 16(%rdi), %xmm1 6120; SSE2-NEXT: pxor %xmm0, %xmm1 6121; SSE2-NEXT: pxor (%rdi), %xmm0 6122; SSE2-NEXT: movdqa %xmm0, (%rsi) 6123; SSE2-NEXT: movdqa %xmm1, 16(%rsi) 6124; SSE2-NEXT: movdqa %xmm0, (%rdx) 6125; SSE2-NEXT: movdqa %xmm1, 16(%rdx) 6126; SSE2-NEXT: movdqa %xmm1, 48(%rdx) 6127; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 6128; SSE2-NEXT: retq 6129; 6130; AVX1-LABEL: vec512_v4i64: 6131; AVX1: # %bb.0: 6132; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 6133; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 6134; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 6135; AVX1-NEXT: vmovaps %ymm0, (%rsi) 6136; AVX1-NEXT: vmovaps %ymm0, (%rdx) 6137; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) 6138; 
AVX1-NEXT: vzeroupper 6139; AVX1-NEXT: retq 6140; 6141; AVX2-LABEL: vec512_v4i64: 6142; AVX2: # %bb.0: 6143; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 6144; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 6145; AVX2-NEXT: vmovdqa %ymm0, (%rsi) 6146; AVX2-NEXT: vmovdqa %ymm0, (%rdx) 6147; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) 6148; AVX2-NEXT: vzeroupper 6149; AVX2-NEXT: retq 6150 %in.subvec.not = load <4 x i64>, ptr %in.subvec.ptr, align 64 6151 %in.subvec = xor <4 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1, i64 -1> 6152 store <4 x i64> %in.subvec, ptr %out.subvec.ptr, align 64 6153 %out.subvec0.ptr = getelementptr <4 x i64>, ptr %out.vec.ptr, i64 0 6154 store <4 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64 6155 %out.subvec1.ptr = getelementptr <4 x i64>, ptr %out.vec.ptr, i64 1 6156 store <4 x i64> %in.subvec, ptr %out.subvec1.ptr, align 32 6157 ret void 6158} 6159 6160define void @vec512_v4f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 6161; SCALAR-LABEL: vec512_v4f64: 6162; SCALAR: # %bb.0: 6163; SCALAR-NEXT: movq 24(%rdi), %rax 6164; SCALAR-NEXT: movq 16(%rdi), %rcx 6165; SCALAR-NEXT: movq (%rdi), %r8 6166; SCALAR-NEXT: movq 8(%rdi), %rdi 6167; SCALAR-NEXT: notq %r8 6168; SCALAR-NEXT: notq %rdi 6169; SCALAR-NEXT: notq %rcx 6170; SCALAR-NEXT: notq %rax 6171; SCALAR-NEXT: movq %rax, 24(%rsi) 6172; SCALAR-NEXT: movq %rcx, 16(%rsi) 6173; SCALAR-NEXT: movq %rdi, 8(%rsi) 6174; SCALAR-NEXT: movq %r8, (%rsi) 6175; SCALAR-NEXT: movq %rax, 24(%rdx) 6176; SCALAR-NEXT: movq %rcx, 16(%rdx) 6177; SCALAR-NEXT: movq %rdi, 8(%rdx) 6178; SCALAR-NEXT: movq %r8, (%rdx) 6179; SCALAR-NEXT: movq %rax, 56(%rdx) 6180; SCALAR-NEXT: movq %rcx, 48(%rdx) 6181; SCALAR-NEXT: movq %rdi, 40(%rdx) 6182; SCALAR-NEXT: movq %r8, 32(%rdx) 6183; SCALAR-NEXT: retq 6184; 6185; SSE2-LABEL: vec512_v4f64: 6186; SSE2: # %bb.0: 6187; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 6188; SSE2-NEXT: movdqa 16(%rdi), %xmm1 6189; SSE2-NEXT: pxor %xmm0, %xmm1 6190; SSE2-NEXT: pxor (%rdi), %xmm0 6191; SSE2-NEXT: movdqa %xmm0, (%rsi) 6192; SSE2-NEXT: movdqa %xmm1, 16(%rsi) 6193; SSE2-NEXT: movdqa %xmm0, (%rdx) 6194; SSE2-NEXT: movdqa %xmm1, 16(%rdx) 6195; SSE2-NEXT: movdqa %xmm1, 48(%rdx) 6196; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 6197; SSE2-NEXT: retq 6198; 6199; AVX1-LABEL: vec512_v4f64: 6200; AVX1: # %bb.0: 6201; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 6202; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 6203; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 6204; AVX1-NEXT: vmovaps %ymm0, (%rsi) 6205; AVX1-NEXT: vmovaps %ymm0, (%rdx) 6206; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) 6207; AVX1-NEXT: vzeroupper 6208; AVX1-NEXT: retq 6209; 6210; AVX2-LABEL: vec512_v4f64: 6211; AVX2: # %bb.0: 6212; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 6213; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 6214; AVX2-NEXT: vmovdqa %ymm0, (%rsi) 6215; AVX2-NEXT: vmovdqa %ymm0, (%rdx) 6216; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) 6217; AVX2-NEXT: vzeroupper 6218; AVX2-NEXT: retq 6219 %in.subvec.not = load <4 x i64>, ptr %in.subvec.ptr, align 64 6220 %in.subvec.int = xor <4 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1, i64 -1> 6221 %in.subvec = bitcast <4 x i64> %in.subvec.int to <4 x double> 6222 store <4 x double> %in.subvec, ptr %out.subvec.ptr, align 64 6223 %out.subvec0.ptr = getelementptr <4 x double>, ptr %out.vec.ptr, i64 0 6224 store <4 x double> %in.subvec, ptr %out.subvec0.ptr, align 64 6225 %out.subvec1.ptr = getelementptr <4 x double>, ptr %out.vec.ptr, i64 1 6226 store <4 x double> %in.subvec, ptr %out.subvec1.ptr, align 32 6227 ret void 6228} 6229 6230define void 
@vec512_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 6231; SCALAR-LABEL: vec512_v8i8: 6232; SCALAR: # %bb.0: 6233; SCALAR-NEXT: pushq %rbx 6234; SCALAR-NEXT: movzbl 7(%rdi), %ebx 6235; SCALAR-NEXT: movzbl 6(%rdi), %r11d 6236; SCALAR-NEXT: movzbl 5(%rdi), %r10d 6237; SCALAR-NEXT: movzbl 4(%rdi), %r9d 6238; SCALAR-NEXT: movzbl 3(%rdi), %r8d 6239; SCALAR-NEXT: movzbl 2(%rdi), %ecx 6240; SCALAR-NEXT: movzbl (%rdi), %eax 6241; SCALAR-NEXT: movzbl 1(%rdi), %edi 6242; SCALAR-NEXT: notb %al 6243; SCALAR-NEXT: notb %dil 6244; SCALAR-NEXT: notb %cl 6245; SCALAR-NEXT: notb %r8b 6246; SCALAR-NEXT: notb %r9b 6247; SCALAR-NEXT: notb %r10b 6248; SCALAR-NEXT: notb %r11b 6249; SCALAR-NEXT: notb %bl 6250; SCALAR-NEXT: movb %bl, 7(%rsi) 6251; SCALAR-NEXT: movb %r11b, 6(%rsi) 6252; SCALAR-NEXT: movb %r10b, 5(%rsi) 6253; SCALAR-NEXT: movb %r9b, 4(%rsi) 6254; SCALAR-NEXT: movb %r8b, 3(%rsi) 6255; SCALAR-NEXT: movb %cl, 2(%rsi) 6256; SCALAR-NEXT: movb %dil, 1(%rsi) 6257; SCALAR-NEXT: movb %al, (%rsi) 6258; SCALAR-NEXT: movb %bl, 7(%rdx) 6259; SCALAR-NEXT: movb %r11b, 6(%rdx) 6260; SCALAR-NEXT: movb %r10b, 5(%rdx) 6261; SCALAR-NEXT: movb %r9b, 4(%rdx) 6262; SCALAR-NEXT: movb %r8b, 3(%rdx) 6263; SCALAR-NEXT: movb %cl, 2(%rdx) 6264; SCALAR-NEXT: movb %dil, 1(%rdx) 6265; SCALAR-NEXT: movb %al, (%rdx) 6266; SCALAR-NEXT: movb %bl, 15(%rdx) 6267; SCALAR-NEXT: movb %r11b, 14(%rdx) 6268; SCALAR-NEXT: movb %r10b, 13(%rdx) 6269; SCALAR-NEXT: movb %r9b, 12(%rdx) 6270; SCALAR-NEXT: movb %r8b, 11(%rdx) 6271; SCALAR-NEXT: movb %cl, 10(%rdx) 6272; SCALAR-NEXT: movb %dil, 9(%rdx) 6273; SCALAR-NEXT: movb %al, 8(%rdx) 6274; SCALAR-NEXT: movb %bl, 23(%rdx) 6275; SCALAR-NEXT: movb %r11b, 22(%rdx) 6276; SCALAR-NEXT: movb %r10b, 21(%rdx) 6277; SCALAR-NEXT: movb %r9b, 20(%rdx) 6278; SCALAR-NEXT: movb %r8b, 19(%rdx) 6279; SCALAR-NEXT: movb %cl, 18(%rdx) 6280; SCALAR-NEXT: movb %dil, 17(%rdx) 6281; SCALAR-NEXT: movb %al, 16(%rdx) 6282; SCALAR-NEXT: movb %bl, 31(%rdx) 6283; SCALAR-NEXT: movb %r11b, 30(%rdx) 6284; SCALAR-NEXT: movb %r10b, 29(%rdx) 6285; SCALAR-NEXT: movb %r9b, 28(%rdx) 6286; SCALAR-NEXT: movb %r8b, 27(%rdx) 6287; SCALAR-NEXT: movb %cl, 26(%rdx) 6288; SCALAR-NEXT: movb %dil, 25(%rdx) 6289; SCALAR-NEXT: movb %al, 24(%rdx) 6290; SCALAR-NEXT: movb %bl, 39(%rdx) 6291; SCALAR-NEXT: movb %r11b, 38(%rdx) 6292; SCALAR-NEXT: movb %r10b, 37(%rdx) 6293; SCALAR-NEXT: movb %r9b, 36(%rdx) 6294; SCALAR-NEXT: movb %r8b, 35(%rdx) 6295; SCALAR-NEXT: movb %cl, 34(%rdx) 6296; SCALAR-NEXT: movb %dil, 33(%rdx) 6297; SCALAR-NEXT: movb %al, 32(%rdx) 6298; SCALAR-NEXT: movb %bl, 47(%rdx) 6299; SCALAR-NEXT: movb %r11b, 46(%rdx) 6300; SCALAR-NEXT: movb %r10b, 45(%rdx) 6301; SCALAR-NEXT: movb %r9b, 44(%rdx) 6302; SCALAR-NEXT: movb %r8b, 43(%rdx) 6303; SCALAR-NEXT: movb %cl, 42(%rdx) 6304; SCALAR-NEXT: movb %dil, 41(%rdx) 6305; SCALAR-NEXT: movb %al, 40(%rdx) 6306; SCALAR-NEXT: movb %bl, 55(%rdx) 6307; SCALAR-NEXT: movb %r11b, 54(%rdx) 6308; SCALAR-NEXT: movb %r10b, 53(%rdx) 6309; SCALAR-NEXT: movb %r9b, 52(%rdx) 6310; SCALAR-NEXT: movb %r8b, 51(%rdx) 6311; SCALAR-NEXT: movb %cl, 50(%rdx) 6312; SCALAR-NEXT: movb %dil, 49(%rdx) 6313; SCALAR-NEXT: movb %al, 48(%rdx) 6314; SCALAR-NEXT: movb %bl, 63(%rdx) 6315; SCALAR-NEXT: movb %r11b, 62(%rdx) 6316; SCALAR-NEXT: movb %r10b, 61(%rdx) 6317; SCALAR-NEXT: movb %r9b, 60(%rdx) 6318; SCALAR-NEXT: movb %r8b, 59(%rdx) 6319; SCALAR-NEXT: movb %cl, 58(%rdx) 6320; SCALAR-NEXT: movb %dil, 57(%rdx) 6321; SCALAR-NEXT: movb %al, 56(%rdx) 6322; SCALAR-NEXT: popq %rbx 6323; SCALAR-NEXT: retq 
6324; 6325; SSE2-LABEL: vec512_v8i8: 6326; SSE2: # %bb.0: 6327; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 6328; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 6329; SSE2-NEXT: pxor %xmm0, %xmm1 6330; SSE2-NEXT: movq %xmm1, (%rsi) 6331; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] 6332; SSE2-NEXT: movdqa %xmm0, (%rdx) 6333; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 6334; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 6335; SSE2-NEXT: movdqa %xmm0, 48(%rdx) 6336; SSE2-NEXT: retq 6337; 6338; AVX1-LABEL: vec512_v8i8: 6339; AVX1: # %bb.0: 6340; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 6341; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 6342; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 6343; AVX1-NEXT: vmovq %xmm0, (%rsi) 6344; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 6345; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 6346; AVX1-NEXT: vmovaps %ymm0, (%rdx) 6347; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) 6348; AVX1-NEXT: vzeroupper 6349; AVX1-NEXT: retq 6350; 6351; AVX2-ONLY-LABEL: vec512_v8i8: 6352; AVX2-ONLY: # %bb.0: 6353; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 6354; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 6355; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 6356; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) 6357; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 6358; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) 6359; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) 6360; AVX2-ONLY-NEXT: vzeroupper 6361; AVX2-ONLY-NEXT: retq 6362; 6363; AVX512-LABEL: vec512_v8i8: 6364; AVX512: # %bb.0: 6365; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 6366; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 6367; AVX512-NEXT: vmovq %xmm0, (%rsi) 6368; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 6369; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) 6370; AVX512-NEXT: vzeroupper 6371; AVX512-NEXT: retq 6372 %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64 6373 %in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> 6374 store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 6375 %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0 6376 store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 6377 %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1 6378 store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8 6379 %out.subvec2.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 2 6380 store <8 x i8> %in.subvec, ptr %out.subvec2.ptr, align 16 6381 %out.subvec3.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 3 6382 store <8 x i8> %in.subvec, ptr %out.subvec3.ptr, align 8 6383 %out.subvec4.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 4 6384 store <8 x i8> %in.subvec, ptr %out.subvec4.ptr, align 32 6385 %out.subvec5.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 5 6386 store <8 x i8> %in.subvec, ptr %out.subvec5.ptr, align 8 6387 %out.subvec6.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 6 6388 store <8 x i8> %in.subvec, ptr %out.subvec6.ptr, align 16 6389 %out.subvec7.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 7 6390 store <8 x i8> %in.subvec, ptr %out.subvec7.ptr, align 8 6391 ret void 6392} 6393 6394define void @vec512_v8i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 6395; SCALAR-LABEL: vec512_v8i16: 6396; SCALAR: # %bb.0: 6397; SCALAR-NEXT: pushq %rbx 6398; SCALAR-NEXT: movzwl 14(%rdi), %ebx 6399; SCALAR-NEXT: movl 12(%rdi), %r11d 6400; SCALAR-NEXT: movzwl 10(%rdi), %r10d 6401; SCALAR-NEXT: movl 8(%rdi), %r9d 6402; SCALAR-NEXT: movzwl 6(%rdi), %r8d 6403; SCALAR-NEXT: movzwl 2(%rdi), %ecx 6404; SCALAR-NEXT: movl (%rdi), %eax 6405; SCALAR-NEXT: 
movl 4(%rdi), %edi 6406; SCALAR-NEXT: notl %eax 6407; SCALAR-NEXT: notl %ecx 6408; SCALAR-NEXT: notl %edi 6409; SCALAR-NEXT: notl %r8d 6410; SCALAR-NEXT: notl %r9d 6411; SCALAR-NEXT: notl %r10d 6412; SCALAR-NEXT: notl %r11d 6413; SCALAR-NEXT: notl %ebx 6414; SCALAR-NEXT: movw %bx, 14(%rsi) 6415; SCALAR-NEXT: movw %r11w, 12(%rsi) 6416; SCALAR-NEXT: movw %r10w, 10(%rsi) 6417; SCALAR-NEXT: movw %r9w, 8(%rsi) 6418; SCALAR-NEXT: movw %r8w, 6(%rsi) 6419; SCALAR-NEXT: movw %di, 4(%rsi) 6420; SCALAR-NEXT: movw %cx, 2(%rsi) 6421; SCALAR-NEXT: movw %ax, (%rsi) 6422; SCALAR-NEXT: movw %bx, 14(%rdx) 6423; SCALAR-NEXT: movw %r11w, 12(%rdx) 6424; SCALAR-NEXT: movw %r10w, 10(%rdx) 6425; SCALAR-NEXT: movw %r9w, 8(%rdx) 6426; SCALAR-NEXT: movw %r8w, 6(%rdx) 6427; SCALAR-NEXT: movw %di, 4(%rdx) 6428; SCALAR-NEXT: movw %cx, 2(%rdx) 6429; SCALAR-NEXT: movw %ax, (%rdx) 6430; SCALAR-NEXT: movw %bx, 30(%rdx) 6431; SCALAR-NEXT: movw %r11w, 28(%rdx) 6432; SCALAR-NEXT: movw %r10w, 26(%rdx) 6433; SCALAR-NEXT: movw %r9w, 24(%rdx) 6434; SCALAR-NEXT: movw %r8w, 22(%rdx) 6435; SCALAR-NEXT: movw %di, 20(%rdx) 6436; SCALAR-NEXT: movw %cx, 18(%rdx) 6437; SCALAR-NEXT: movw %ax, 16(%rdx) 6438; SCALAR-NEXT: movw %bx, 46(%rdx) 6439; SCALAR-NEXT: movw %r11w, 44(%rdx) 6440; SCALAR-NEXT: movw %r10w, 42(%rdx) 6441; SCALAR-NEXT: movw %r9w, 40(%rdx) 6442; SCALAR-NEXT: movw %r8w, 38(%rdx) 6443; SCALAR-NEXT: movw %di, 36(%rdx) 6444; SCALAR-NEXT: movw %cx, 34(%rdx) 6445; SCALAR-NEXT: movw %ax, 32(%rdx) 6446; SCALAR-NEXT: movw %bx, 62(%rdx) 6447; SCALAR-NEXT: movw %r11w, 60(%rdx) 6448; SCALAR-NEXT: movw %r10w, 58(%rdx) 6449; SCALAR-NEXT: movw %r9w, 56(%rdx) 6450; SCALAR-NEXT: movw %r8w, 54(%rdx) 6451; SCALAR-NEXT: movw %di, 52(%rdx) 6452; SCALAR-NEXT: movw %cx, 50(%rdx) 6453; SCALAR-NEXT: movw %ax, 48(%rdx) 6454; SCALAR-NEXT: popq %rbx 6455; SCALAR-NEXT: retq 6456; 6457; SSE2-LABEL: vec512_v8i16: 6458; SSE2: # %bb.0: 6459; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 6460; SSE2-NEXT: pxor (%rdi), %xmm0 6461; SSE2-NEXT: movdqa %xmm0, (%rsi) 6462; SSE2-NEXT: movdqa %xmm0, (%rdx) 6463; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 6464; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 6465; SSE2-NEXT: movdqa %xmm0, 48(%rdx) 6466; SSE2-NEXT: retq 6467; 6468; AVX-LABEL: vec512_v8i16: 6469; AVX: # %bb.0: 6470; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 6471; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 6472; AVX-NEXT: vmovdqa %xmm0, (%rsi) 6473; AVX-NEXT: vmovdqa %xmm0, (%rdx) 6474; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 6475; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) 6476; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) 6477; AVX-NEXT: retq 6478 %in.subvec.not = load <8 x i16>, ptr %in.subvec.ptr, align 64 6479 %in.subvec = xor <8 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> 6480 store <8 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 6481 %out.subvec0.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 0 6482 store <8 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 6483 %out.subvec1.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 1 6484 store <8 x i16> %in.subvec, ptr %out.subvec1.ptr, align 16 6485 %out.subvec2.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 2 6486 store <8 x i16> %in.subvec, ptr %out.subvec2.ptr, align 32 6487 %out.subvec3.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 3 6488 store <8 x i16> %in.subvec, ptr %out.subvec3.ptr, align 16 6489 ret void 6490} 6491 6492define void @vec512_v8i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 6493; SCALAR-LABEL: vec512_v8i32: 6494; SCALAR: # %bb.0: 6495; SCALAR-NEXT: pushq 
%rbx 6496; SCALAR-NEXT: movl 28(%rdi), %ebx 6497; SCALAR-NEXT: movl 24(%rdi), %r11d 6498; SCALAR-NEXT: movl 20(%rdi), %r10d 6499; SCALAR-NEXT: movl 16(%rdi), %r9d 6500; SCALAR-NEXT: movl 12(%rdi), %r8d 6501; SCALAR-NEXT: movl 8(%rdi), %ecx 6502; SCALAR-NEXT: movl (%rdi), %eax 6503; SCALAR-NEXT: movl 4(%rdi), %edi 6504; SCALAR-NEXT: notl %eax 6505; SCALAR-NEXT: notl %edi 6506; SCALAR-NEXT: notl %ecx 6507; SCALAR-NEXT: notl %r8d 6508; SCALAR-NEXT: notl %r9d 6509; SCALAR-NEXT: notl %r10d 6510; SCALAR-NEXT: notl %r11d 6511; SCALAR-NEXT: notl %ebx 6512; SCALAR-NEXT: movl %ebx, 28(%rsi) 6513; SCALAR-NEXT: movl %r11d, 24(%rsi) 6514; SCALAR-NEXT: movl %r10d, 20(%rsi) 6515; SCALAR-NEXT: movl %r9d, 16(%rsi) 6516; SCALAR-NEXT: movl %r8d, 12(%rsi) 6517; SCALAR-NEXT: movl %ecx, 8(%rsi) 6518; SCALAR-NEXT: movl %edi, 4(%rsi) 6519; SCALAR-NEXT: movl %eax, (%rsi) 6520; SCALAR-NEXT: movl %ebx, 28(%rdx) 6521; SCALAR-NEXT: movl %r11d, 24(%rdx) 6522; SCALAR-NEXT: movl %r10d, 20(%rdx) 6523; SCALAR-NEXT: movl %r9d, 16(%rdx) 6524; SCALAR-NEXT: movl %r8d, 12(%rdx) 6525; SCALAR-NEXT: movl %ecx, 8(%rdx) 6526; SCALAR-NEXT: movl %edi, 4(%rdx) 6527; SCALAR-NEXT: movl %eax, (%rdx) 6528; SCALAR-NEXT: movl %ebx, 60(%rdx) 6529; SCALAR-NEXT: movl %r11d, 56(%rdx) 6530; SCALAR-NEXT: movl %r10d, 52(%rdx) 6531; SCALAR-NEXT: movl %r9d, 48(%rdx) 6532; SCALAR-NEXT: movl %r8d, 44(%rdx) 6533; SCALAR-NEXT: movl %ecx, 40(%rdx) 6534; SCALAR-NEXT: movl %edi, 36(%rdx) 6535; SCALAR-NEXT: movl %eax, 32(%rdx) 6536; SCALAR-NEXT: popq %rbx 6537; SCALAR-NEXT: retq 6538; 6539; SSE2-LABEL: vec512_v8i32: 6540; SSE2: # %bb.0: 6541; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 6542; SSE2-NEXT: movdqa 16(%rdi), %xmm1 6543; SSE2-NEXT: pxor %xmm0, %xmm1 6544; SSE2-NEXT: pxor (%rdi), %xmm0 6545; SSE2-NEXT: movdqa %xmm0, (%rsi) 6546; SSE2-NEXT: movdqa %xmm1, 16(%rsi) 6547; SSE2-NEXT: movdqa %xmm0, (%rdx) 6548; SSE2-NEXT: movdqa %xmm1, 16(%rdx) 6549; SSE2-NEXT: movdqa %xmm1, 48(%rdx) 6550; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 6551; SSE2-NEXT: retq 6552; 6553; AVX1-LABEL: vec512_v8i32: 6554; AVX1: # %bb.0: 6555; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 6556; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 6557; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 6558; AVX1-NEXT: vmovaps %ymm0, (%rsi) 6559; AVX1-NEXT: vmovaps %ymm0, (%rdx) 6560; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) 6561; AVX1-NEXT: vzeroupper 6562; AVX1-NEXT: retq 6563; 6564; AVX2-LABEL: vec512_v8i32: 6565; AVX2: # %bb.0: 6566; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 6567; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 6568; AVX2-NEXT: vmovdqa %ymm0, (%rsi) 6569; AVX2-NEXT: vmovdqa %ymm0, (%rdx) 6570; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) 6571; AVX2-NEXT: vzeroupper 6572; AVX2-NEXT: retq 6573 %in.subvec.not = load <8 x i32>, ptr %in.subvec.ptr, align 64 6574 %in.subvec = xor <8 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> 6575 store <8 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 6576 %out.subvec0.ptr = getelementptr <8 x i32>, ptr %out.vec.ptr, i64 0 6577 store <8 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 6578 %out.subvec1.ptr = getelementptr <8 x i32>, ptr %out.vec.ptr, i64 1 6579 store <8 x i32> %in.subvec, ptr %out.subvec1.ptr, align 32 6580 ret void 6581} 6582 6583define void @vec512_v8f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 6584; SCALAR-LABEL: vec512_v8f32: 6585; SCALAR: # %bb.0: 6586; SCALAR-NEXT: pushq %rbx 6587; SCALAR-NEXT: movl 28(%rdi), %ebx 6588; SCALAR-NEXT: movl 24(%rdi), %r11d 6589; SCALAR-NEXT: movl 20(%rdi), %r10d 6590; SCALAR-NEXT: 
movl 16(%rdi), %r9d 6591; SCALAR-NEXT: movl 12(%rdi), %r8d 6592; SCALAR-NEXT: movl 8(%rdi), %ecx 6593; SCALAR-NEXT: movl (%rdi), %eax 6594; SCALAR-NEXT: movl 4(%rdi), %edi 6595; SCALAR-NEXT: notl %eax 6596; SCALAR-NEXT: notl %edi 6597; SCALAR-NEXT: notl %ecx 6598; SCALAR-NEXT: notl %r8d 6599; SCALAR-NEXT: notl %r9d 6600; SCALAR-NEXT: notl %r10d 6601; SCALAR-NEXT: notl %r11d 6602; SCALAR-NEXT: notl %ebx 6603; SCALAR-NEXT: movl %ebx, 28(%rsi) 6604; SCALAR-NEXT: movl %r11d, 24(%rsi) 6605; SCALAR-NEXT: movl %r10d, 20(%rsi) 6606; SCALAR-NEXT: movl %r9d, 16(%rsi) 6607; SCALAR-NEXT: movl %r8d, 12(%rsi) 6608; SCALAR-NEXT: movl %ecx, 8(%rsi) 6609; SCALAR-NEXT: movl %edi, 4(%rsi) 6610; SCALAR-NEXT: movl %eax, (%rsi) 6611; SCALAR-NEXT: movl %ebx, 28(%rdx) 6612; SCALAR-NEXT: movl %r11d, 24(%rdx) 6613; SCALAR-NEXT: movl %r10d, 20(%rdx) 6614; SCALAR-NEXT: movl %r9d, 16(%rdx) 6615; SCALAR-NEXT: movl %r8d, 12(%rdx) 6616; SCALAR-NEXT: movl %ecx, 8(%rdx) 6617; SCALAR-NEXT: movl %edi, 4(%rdx) 6618; SCALAR-NEXT: movl %eax, (%rdx) 6619; SCALAR-NEXT: movl %ebx, 60(%rdx) 6620; SCALAR-NEXT: movl %r11d, 56(%rdx) 6621; SCALAR-NEXT: movl %r10d, 52(%rdx) 6622; SCALAR-NEXT: movl %r9d, 48(%rdx) 6623; SCALAR-NEXT: movl %r8d, 44(%rdx) 6624; SCALAR-NEXT: movl %ecx, 40(%rdx) 6625; SCALAR-NEXT: movl %edi, 36(%rdx) 6626; SCALAR-NEXT: movl %eax, 32(%rdx) 6627; SCALAR-NEXT: popq %rbx 6628; SCALAR-NEXT: retq 6629; 6630; SSE2-LABEL: vec512_v8f32: 6631; SSE2: # %bb.0: 6632; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 6633; SSE2-NEXT: movdqa 16(%rdi), %xmm1 6634; SSE2-NEXT: pxor %xmm0, %xmm1 6635; SSE2-NEXT: pxor (%rdi), %xmm0 6636; SSE2-NEXT: movdqa %xmm0, (%rsi) 6637; SSE2-NEXT: movdqa %xmm1, 16(%rsi) 6638; SSE2-NEXT: movdqa %xmm0, (%rdx) 6639; SSE2-NEXT: movdqa %xmm1, 16(%rdx) 6640; SSE2-NEXT: movdqa %xmm1, 48(%rdx) 6641; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 6642; SSE2-NEXT: retq 6643; 6644; AVX1-LABEL: vec512_v8f32: 6645; AVX1: # %bb.0: 6646; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 6647; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 6648; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 6649; AVX1-NEXT: vmovaps %ymm0, (%rsi) 6650; AVX1-NEXT: vmovaps %ymm0, (%rdx) 6651; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) 6652; AVX1-NEXT: vzeroupper 6653; AVX1-NEXT: retq 6654; 6655; AVX2-LABEL: vec512_v8f32: 6656; AVX2: # %bb.0: 6657; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 6658; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 6659; AVX2-NEXT: vmovdqa %ymm0, (%rsi) 6660; AVX2-NEXT: vmovdqa %ymm0, (%rdx) 6661; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) 6662; AVX2-NEXT: vzeroupper 6663; AVX2-NEXT: retq 6664 %in.subvec.not = load <8 x i32>, ptr %in.subvec.ptr, align 64 6665 %in.subvec.int = xor <8 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> 6666 %in.subvec = bitcast <8 x i32> %in.subvec.int to <8 x float> 6667 store <8 x float> %in.subvec, ptr %out.subvec.ptr, align 64 6668 %out.subvec0.ptr = getelementptr <8 x float>, ptr %out.vec.ptr, i64 0 6669 store <8 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 6670 %out.subvec1.ptr = getelementptr <8 x float>, ptr %out.vec.ptr, i64 1 6671 store <8 x float> %in.subvec, ptr %out.subvec1.ptr, align 32 6672 ret void 6673} 6674 6675define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 6676; SCALAR-LABEL: vec512_v16i8: 6677; SCALAR: # %bb.0: 6678; SCALAR-NEXT: pushq %rbp 6679; SCALAR-NEXT: pushq %r15 6680; SCALAR-NEXT: pushq %r14 6681; SCALAR-NEXT: pushq %r13 6682; SCALAR-NEXT: pushq %r12 6683; SCALAR-NEXT: pushq %rbx 6684; SCALAR-NEXT: movzbl 15(%rdi), %eax 6685; 
SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 6686; SCALAR-NEXT: movzbl 14(%rdi), %eax 6687; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 6688; SCALAR-NEXT: movzbl 13(%rdi), %eax 6689; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 6690; SCALAR-NEXT: movzbl 12(%rdi), %r10d 6691; SCALAR-NEXT: movzbl 11(%rdi), %r13d 6692; SCALAR-NEXT: movzbl 10(%rdi), %r12d 6693; SCALAR-NEXT: movzbl 9(%rdi), %r15d 6694; SCALAR-NEXT: movzbl 8(%rdi), %r14d 6695; SCALAR-NEXT: movzbl 7(%rdi), %ebp 6696; SCALAR-NEXT: movzbl 6(%rdi), %r11d 6697; SCALAR-NEXT: movzbl 5(%rdi), %ebx 6698; SCALAR-NEXT: movzbl 4(%rdi), %r9d 6699; SCALAR-NEXT: movzbl 3(%rdi), %r8d 6700; SCALAR-NEXT: movzbl 2(%rdi), %ecx 6701; SCALAR-NEXT: movzbl (%rdi), %eax 6702; SCALAR-NEXT: movzbl 1(%rdi), %edi 6703; SCALAR-NEXT: notb %al 6704; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 6705; SCALAR-NEXT: notb %dil 6706; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 6707; SCALAR-NEXT: notb %cl 6708; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 6709; SCALAR-NEXT: notb %r8b 6710; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 6711; SCALAR-NEXT: notb %r9b 6712; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 6713; SCALAR-NEXT: movl %ebx, %r9d 6714; SCALAR-NEXT: notb %r9b 6715; SCALAR-NEXT: notb %r11b 6716; SCALAR-NEXT: movl %r11d, %ebx 6717; SCALAR-NEXT: notb %bpl 6718; SCALAR-NEXT: notb %r14b 6719; SCALAR-NEXT: notb %r15b 6720; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 6721; SCALAR-NEXT: notb %r12b 6722; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 6723; SCALAR-NEXT: notb %r13b 6724; SCALAR-NEXT: notb %r10b 6725; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 6726; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload 6727; SCALAR-NEXT: notb %dil 6728; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload 6729; SCALAR-NEXT: notb %r8b 6730; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload 6731; SCALAR-NEXT: notb %r11b 6732; SCALAR-NEXT: movb %r11b, 15(%rsi) 6733; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 6734; SCALAR-NEXT: movb %r8b, 14(%rsi) 6735; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 6736; SCALAR-NEXT: movl %edi, %eax 6737; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 6738; SCALAR-NEXT: movb %dil, 13(%rsi) 6739; SCALAR-NEXT: movb %r10b, 12(%rsi) 6740; SCALAR-NEXT: movb %r13b, 11(%rsi) 6741; SCALAR-NEXT: movb %r12b, 10(%rsi) 6742; SCALAR-NEXT: movb %r15b, 9(%rsi) 6743; SCALAR-NEXT: movb %r14b, 8(%rsi) 6744; SCALAR-NEXT: movl %r14d, %r12d 6745; SCALAR-NEXT: movb %bpl, 7(%rsi) 6746; SCALAR-NEXT: movl %ebp, %r14d 6747; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 6748; SCALAR-NEXT: movb %bl, 6(%rsi) 6749; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 6750; SCALAR-NEXT: movb %r9b, 5(%rsi) 6751; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload 6752; SCALAR-NEXT: movb %cl, 4(%rsi) 6753; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload 6754; SCALAR-NEXT: movb %bpl, 3(%rsi) 6755; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload 6756; SCALAR-NEXT: movb %dil, 2(%rsi) 6757; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload 6758; SCALAR-NEXT: movb %cl, 1(%rsi) 6759; SCALAR-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload 6760; SCALAR-NEXT: movb %r10b, (%rsi) 6761; SCALAR-NEXT: movb %r11b, 15(%rdx) 6762; SCALAR-NEXT: movb %r8b, 14(%rdx) 6763; SCALAR-NEXT: movb %al, 13(%rdx) 6764; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload 6765; SCALAR-NEXT: movb %al, 12(%rdx) 6766; SCALAR-NEXT: movb %r13b, 11(%rdx) 6767; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload 6768; SCALAR-NEXT: movb %r15b, 10(%rdx) 6769; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload 6770; SCALAR-NEXT: movb %sil, 9(%rdx) 6771; SCALAR-NEXT: movb %r12b, 8(%rdx) 6772; SCALAR-NEXT: movb %r14b, 7(%rdx) 6773; SCALAR-NEXT: movb %bl, 6(%rdx) 6774; SCALAR-NEXT: movb %r9b, 5(%rdx) 6775; SCALAR-NEXT: movl %r9d, %r11d 6776; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 6777; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload 6778; SCALAR-NEXT: movb %r8b, 4(%rdx) 6779; SCALAR-NEXT: movb %bpl, 3(%rdx) 6780; SCALAR-NEXT: movb %dil, 2(%rdx) 6781; SCALAR-NEXT: movb %cl, 1(%rdx) 6782; SCALAR-NEXT: movl %ecx, %r14d 6783; SCALAR-NEXT: movl %r10d, %esi 6784; SCALAR-NEXT: movb %r10b, (%rdx) 6785; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload 6786; SCALAR-NEXT: movb %cl, 31(%rdx) 6787; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload 6788; SCALAR-NEXT: movb %r9b, 30(%rdx) 6789; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload 6790; SCALAR-NEXT: movb %dil, 29(%rdx) 6791; SCALAR-NEXT: movb %al, 28(%rdx) 6792; SCALAR-NEXT: movl %eax, %r10d 6793; SCALAR-NEXT: movb %r13b, 27(%rdx) 6794; SCALAR-NEXT: movb %r15b, 26(%rdx) 6795; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload 6796; SCALAR-NEXT: movb %r15b, 25(%rdx) 6797; SCALAR-NEXT: movl %r12d, %ebp 6798; SCALAR-NEXT: movb %r12b, 24(%rdx) 6799; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload 6800; SCALAR-NEXT: movb %bl, 23(%rdx) 6801; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload 6802; SCALAR-NEXT: movb %al, 22(%rdx) 6803; SCALAR-NEXT: movb %r11b, 21(%rdx) 6804; SCALAR-NEXT: movb %r8b, 20(%rdx) 6805; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload 6806; SCALAR-NEXT: movb %r8b, 19(%rdx) 6807; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload 6808; SCALAR-NEXT: movb %r8b, 18(%rdx) 6809; SCALAR-NEXT: movb %r14b, 17(%rdx) 6810; SCALAR-NEXT: movb %sil, 16(%rdx) 6811; SCALAR-NEXT: movl %esi, %r11d 6812; SCALAR-NEXT: movb %cl, 47(%rdx) 6813; SCALAR-NEXT: movb %r9b, 46(%rdx) 6814; SCALAR-NEXT: movb %dil, 45(%rdx) 6815; SCALAR-NEXT: movb %r10b, 44(%rdx) 6816; SCALAR-NEXT: movb %r13b, 43(%rdx) 6817; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload 6818; SCALAR-NEXT: movb %r12b, 42(%rdx) 6819; SCALAR-NEXT: movb %r15b, 41(%rdx) 6820; SCALAR-NEXT: movl %ebp, %r14d 6821; SCALAR-NEXT: movb %bpl, 40(%rdx) 6822; SCALAR-NEXT: movl %ebx, %ebp 6823; SCALAR-NEXT: movb %bl, 39(%rdx) 6824; SCALAR-NEXT: movl %eax, %ebx 6825; SCALAR-NEXT: movb %al, 38(%rdx) 6826; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload 6827; SCALAR-NEXT: movb %cl, 37(%rdx) 6828; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload 6829; SCALAR-NEXT: movb %al, 36(%rdx) 6830; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload 6831; SCALAR-NEXT: movb %sil, 35(%rdx) 6832; SCALAR-NEXT: movb %r8b, 34(%rdx) 
6833; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload 6834; SCALAR-NEXT: movb %r9b, 33(%rdx) 6835; SCALAR-NEXT: movb %r11b, 32(%rdx) 6836; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload 6837; SCALAR-NEXT: movb %r11b, 63(%rdx) 6838; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload 6839; SCALAR-NEXT: movb %r11b, 62(%rdx) 6840; SCALAR-NEXT: movb %dil, 61(%rdx) 6841; SCALAR-NEXT: movb %r10b, 60(%rdx) 6842; SCALAR-NEXT: movb %r13b, 59(%rdx) 6843; SCALAR-NEXT: movb %r12b, 58(%rdx) 6844; SCALAR-NEXT: movb %r15b, 57(%rdx) 6845; SCALAR-NEXT: movb %r14b, 56(%rdx) 6846; SCALAR-NEXT: movb %bpl, 55(%rdx) 6847; SCALAR-NEXT: movb %bl, 54(%rdx) 6848; SCALAR-NEXT: movb %cl, 53(%rdx) 6849; SCALAR-NEXT: movb %al, 52(%rdx) 6850; SCALAR-NEXT: movb %sil, 51(%rdx) 6851; SCALAR-NEXT: movb %r8b, 50(%rdx) 6852; SCALAR-NEXT: movb %r9b, 49(%rdx) 6853; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload 6854; SCALAR-NEXT: movb %al, 48(%rdx) 6855; SCALAR-NEXT: popq %rbx 6856; SCALAR-NEXT: popq %r12 6857; SCALAR-NEXT: popq %r13 6858; SCALAR-NEXT: popq %r14 6859; SCALAR-NEXT: popq %r15 6860; SCALAR-NEXT: popq %rbp 6861; SCALAR-NEXT: retq 6862; 6863; SSE2-LABEL: vec512_v16i8: 6864; SSE2: # %bb.0: 6865; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 6866; SSE2-NEXT: pxor (%rdi), %xmm0 6867; SSE2-NEXT: movdqa %xmm0, (%rsi) 6868; SSE2-NEXT: movdqa %xmm0, (%rdx) 6869; SSE2-NEXT: movdqa %xmm0, 16(%rdx) 6870; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 6871; SSE2-NEXT: movdqa %xmm0, 48(%rdx) 6872; SSE2-NEXT: retq 6873; 6874; AVX-LABEL: vec512_v16i8: 6875; AVX: # %bb.0: 6876; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 6877; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 6878; AVX-NEXT: vmovdqa %xmm0, (%rsi) 6879; AVX-NEXT: vmovdqa %xmm0, (%rdx) 6880; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 6881; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) 6882; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) 6883; AVX-NEXT: retq 6884 %in.subvec.not = load <16 x i8>, ptr %in.subvec.ptr, align 64 6885 %in.subvec = xor <16 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> 6886 store <16 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 6887 %out.subvec0.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 0 6888 store <16 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 6889 %out.subvec1.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 1 6890 store <16 x i8> %in.subvec, ptr %out.subvec1.ptr, align 16 6891 %out.subvec2.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 2 6892 store <16 x i8> %in.subvec, ptr %out.subvec2.ptr, align 32 6893 %out.subvec3.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 3 6894 store <16 x i8> %in.subvec, ptr %out.subvec3.ptr, align 16 6895 ret void 6896} 6897 6898define void @vec512_v16i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 6899; SCALAR-LABEL: vec512_v16i16: 6900; SCALAR: # %bb.0: 6901; SCALAR-NEXT: pushq %rbp 6902; SCALAR-NEXT: pushq %r15 6903; SCALAR-NEXT: pushq %r14 6904; SCALAR-NEXT: pushq %r13 6905; SCALAR-NEXT: pushq %r12 6906; SCALAR-NEXT: pushq %rbx 6907; SCALAR-NEXT: movzwl 30(%rdi), %eax 6908; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6909; SCALAR-NEXT: movl 28(%rdi), %eax 6910; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6911; SCALAR-NEXT: movzwl 26(%rdi), %eax 6912; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6913; SCALAR-NEXT: movl 24(%rdi), %r13d 6914; SCALAR-NEXT: movzwl 22(%rdi), %r12d 
6915; SCALAR-NEXT: movl 20(%rdi), %r15d 6916; SCALAR-NEXT: movzwl 18(%rdi), %r14d 6917; SCALAR-NEXT: movl 16(%rdi), %ebx 6918; SCALAR-NEXT: movzwl 14(%rdi), %r11d 6919; SCALAR-NEXT: movl 12(%rdi), %r10d 6920; SCALAR-NEXT: movzwl 10(%rdi), %r9d 6921; SCALAR-NEXT: movl 8(%rdi), %r8d 6922; SCALAR-NEXT: movzwl 6(%rdi), %ecx 6923; SCALAR-NEXT: movzwl 2(%rdi), %ebp 6924; SCALAR-NEXT: movl (%rdi), %eax 6925; SCALAR-NEXT: movl 4(%rdi), %edi 6926; SCALAR-NEXT: notl %eax 6927; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6928; SCALAR-NEXT: notl %ebp 6929; SCALAR-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6930; SCALAR-NEXT: notl %edi 6931; SCALAR-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6932; SCALAR-NEXT: notl %ecx 6933; SCALAR-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6934; SCALAR-NEXT: notl %r8d 6935; SCALAR-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6936; SCALAR-NEXT: notl %r9d 6937; SCALAR-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6938; SCALAR-NEXT: movl %r10d, %edi 6939; SCALAR-NEXT: notl %edi 6940; SCALAR-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6941; SCALAR-NEXT: notl %r11d 6942; SCALAR-NEXT: movl %r11d, %r9d 6943; SCALAR-NEXT: notl %ebx 6944; SCALAR-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6945; SCALAR-NEXT: notl %r14d 6946; SCALAR-NEXT: notl %r15d 6947; SCALAR-NEXT: notl %r12d 6948; SCALAR-NEXT: notl %r13d 6949; SCALAR-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6950; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Reload 6951; SCALAR-NEXT: notl %r10d 6952; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Reload 6953; SCALAR-NEXT: notl %r11d 6954; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload 6955; SCALAR-NEXT: notl %r8d 6956; SCALAR-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6957; SCALAR-NEXT: movw %r8w, 30(%rsi) 6958; SCALAR-NEXT: movw %r11w, 28(%rsi) 6959; SCALAR-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6960; SCALAR-NEXT: movw %r10w, 26(%rsi) 6961; SCALAR-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6962; SCALAR-NEXT: movw %r13w, 24(%rsi) 6963; SCALAR-NEXT: movw %r12w, 22(%rsi) 6964; SCALAR-NEXT: movw %r15w, 20(%rsi) 6965; SCALAR-NEXT: movw %r14w, 18(%rsi) 6966; SCALAR-NEXT: movw %bx, 16(%rsi) 6967; SCALAR-NEXT: movw %r9w, 14(%rsi) 6968; SCALAR-NEXT: movw %di, 12(%rsi) 6969; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Reload 6970; SCALAR-NEXT: movw %bp, 10(%rsi) 6971; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload 6972; SCALAR-NEXT: movw %di, 8(%rsi) 6973; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload 6974; SCALAR-NEXT: movw %cx, 6(%rsi) 6975; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload 6976; SCALAR-NEXT: movw %r8w, 4(%rsi) 6977; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload 6978; SCALAR-NEXT: movw %ax, 2(%rsi) 6979; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload 6980; SCALAR-NEXT: movw %bx, (%rsi) 6981; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Reload 6982; SCALAR-NEXT: movw %r13w, 30(%rdx) 6983; SCALAR-NEXT: movw %r11w, 28(%rdx) 6984; SCALAR-NEXT: movw %r10w, 26(%rdx) 6985; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload 6986; SCALAR-NEXT: movw %si, 24(%rdx) 6987; SCALAR-NEXT: movw %r12w, 22(%rdx) 6988; SCALAR-NEXT: movw %r15w, 20(%rdx) 6989; SCALAR-NEXT: movw %r14w, 18(%rdx) 6990; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), 
%r11d # 4-byte Reload 6991; SCALAR-NEXT: movw %r11w, 16(%rdx) 6992; SCALAR-NEXT: movw %r9w, 14(%rdx) 6993; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Reload 6994; SCALAR-NEXT: movw %r10w, 12(%rdx) 6995; SCALAR-NEXT: movw %bp, 10(%rdx) 6996; SCALAR-NEXT: movw %di, 8(%rdx) 6997; SCALAR-NEXT: movw %cx, 6(%rdx) 6998; SCALAR-NEXT: movw %r8w, 4(%rdx) 6999; SCALAR-NEXT: movw %ax, 2(%rdx) 7000; SCALAR-NEXT: movl %ebx, %esi 7001; SCALAR-NEXT: movw %si, (%rdx) 7002; SCALAR-NEXT: movw %r13w, 62(%rdx) 7003; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload 7004; SCALAR-NEXT: movw %bx, 60(%rdx) 7005; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload 7006; SCALAR-NEXT: movw %bx, 58(%rdx) 7007; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload 7008; SCALAR-NEXT: movw %bx, 56(%rdx) 7009; SCALAR-NEXT: movw %r12w, 54(%rdx) 7010; SCALAR-NEXT: movw %r15w, 52(%rdx) 7011; SCALAR-NEXT: movw %r14w, 50(%rdx) 7012; SCALAR-NEXT: movw %r11w, 48(%rdx) 7013; SCALAR-NEXT: movw %r9w, 46(%rdx) 7014; SCALAR-NEXT: movw %r10w, 44(%rdx) 7015; SCALAR-NEXT: movw %bp, 42(%rdx) 7016; SCALAR-NEXT: movw %di, 40(%rdx) 7017; SCALAR-NEXT: movw %cx, 38(%rdx) 7018; SCALAR-NEXT: movw %r8w, 36(%rdx) 7019; SCALAR-NEXT: movw %ax, 34(%rdx) 7020; SCALAR-NEXT: movw %si, 32(%rdx) 7021; SCALAR-NEXT: popq %rbx 7022; SCALAR-NEXT: popq %r12 7023; SCALAR-NEXT: popq %r13 7024; SCALAR-NEXT: popq %r14 7025; SCALAR-NEXT: popq %r15 7026; SCALAR-NEXT: popq %rbp 7027; SCALAR-NEXT: retq 7028; 7029; SSE2-LABEL: vec512_v16i16: 7030; SSE2: # %bb.0: 7031; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 7032; SSE2-NEXT: movdqa 16(%rdi), %xmm1 7033; SSE2-NEXT: pxor %xmm0, %xmm1 7034; SSE2-NEXT: pxor (%rdi), %xmm0 7035; SSE2-NEXT: movdqa %xmm0, (%rsi) 7036; SSE2-NEXT: movdqa %xmm1, 16(%rsi) 7037; SSE2-NEXT: movdqa %xmm0, (%rdx) 7038; SSE2-NEXT: movdqa %xmm1, 16(%rdx) 7039; SSE2-NEXT: movdqa %xmm1, 48(%rdx) 7040; SSE2-NEXT: movdqa %xmm0, 32(%rdx) 7041; SSE2-NEXT: retq 7042; 7043; AVX1-LABEL: vec512_v16i16: 7044; AVX1: # %bb.0: 7045; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 7046; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 7047; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0 7048; AVX1-NEXT: vmovaps %ymm0, (%rsi) 7049; AVX1-NEXT: vmovaps %ymm0, (%rdx) 7050; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) 7051; AVX1-NEXT: vzeroupper 7052; AVX1-NEXT: retq 7053; 7054; AVX2-LABEL: vec512_v16i16: 7055; AVX2: # %bb.0: 7056; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 7057; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 7058; AVX2-NEXT: vmovdqa %ymm0, (%rsi) 7059; AVX2-NEXT: vmovdqa %ymm0, (%rdx) 7060; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) 7061; AVX2-NEXT: vzeroupper 7062; AVX2-NEXT: retq 7063 %in.subvec.not = load <16 x i16>, ptr %in.subvec.ptr, align 64 7064 %in.subvec = xor <16 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> 7065 store <16 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 7066 %out.subvec0.ptr = getelementptr <16 x i16>, ptr %out.vec.ptr, i64 0 7067 store <16 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 7068 %out.subvec1.ptr = getelementptr <16 x i16>, ptr %out.vec.ptr, i64 1 7069 store <16 x i16> %in.subvec, ptr %out.subvec1.ptr, align 32 7070 ret void 7071} 7072 7073define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { 7074; SCALAR-LABEL: vec512_v32i8: 7075; SCALAR: # %bb.0: 7076; SCALAR-NEXT: pushq %rbp 7077; SCALAR-NEXT: pushq %r15 7078; SCALAR-NEXT: pushq %r14 7079; SCALAR-NEXT: pushq %r13 7080; 
SCALAR-NEXT: pushq %r12 7081; SCALAR-NEXT: pushq %rbx 7082; SCALAR-NEXT: movzbl 16(%rdi), %eax 7083; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7084; SCALAR-NEXT: movzbl 15(%rdi), %eax 7085; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7086; SCALAR-NEXT: movzbl 14(%rdi), %eax 7087; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7088; SCALAR-NEXT: movzbl 13(%rdi), %eax 7089; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7090; SCALAR-NEXT: movzbl 12(%rdi), %r13d 7091; SCALAR-NEXT: movzbl 11(%rdi), %eax 7092; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7093; SCALAR-NEXT: movzbl 10(%rdi), %r12d 7094; SCALAR-NEXT: movzbl 9(%rdi), %r15d 7095; SCALAR-NEXT: movzbl 8(%rdi), %r14d 7096; SCALAR-NEXT: movzbl 7(%rdi), %ebp 7097; SCALAR-NEXT: movzbl 6(%rdi), %ebx 7098; SCALAR-NEXT: movzbl 5(%rdi), %r11d 7099; SCALAR-NEXT: movzbl 4(%rdi), %r10d 7100; SCALAR-NEXT: movzbl 3(%rdi), %r9d 7101; SCALAR-NEXT: movzbl 2(%rdi), %r8d 7102; SCALAR-NEXT: movzbl (%rdi), %eax 7103; SCALAR-NEXT: movzbl 1(%rdi), %ecx 7104; SCALAR-NEXT: notb %al 7105; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7106; SCALAR-NEXT: notb %cl 7107; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7108; SCALAR-NEXT: notb %r8b 7109; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7110; SCALAR-NEXT: notb %r9b 7111; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7112; SCALAR-NEXT: notb %r10b 7113; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7114; SCALAR-NEXT: notb %r11b 7115; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7116; SCALAR-NEXT: notb %bl 7117; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7118; SCALAR-NEXT: notb %bpl 7119; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7120; SCALAR-NEXT: notb %r14b 7121; SCALAR-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7122; SCALAR-NEXT: notb %r15b 7123; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7124; SCALAR-NEXT: notb %r12b 7125; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7126; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload 7127; SCALAR-NEXT: notb %r11b 7128; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7129; SCALAR-NEXT: notb %r13b 7130; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7131; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill 7132; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill 7133; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload 7134; SCALAR-NEXT: notb %r8b 7135; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill 7136; SCALAR-NEXT: movzbl 17(%rdi), %eax 7137; SCALAR-NEXT: notb %al 7138; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7139; SCALAR-NEXT: movzbl 18(%rdi), %eax 7140; SCALAR-NEXT: notb %al 7141; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7142; SCALAR-NEXT: movzbl 19(%rdi), %eax 7143; SCALAR-NEXT: notb %al 7144; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7145; SCALAR-NEXT: movzbl 20(%rdi), %eax 7146; SCALAR-NEXT: notb %al 7147; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7148; SCALAR-NEXT: movzbl 21(%rdi), %ebp 7149; SCALAR-NEXT: notb %bpl 7150; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill 7151; SCALAR-NEXT: movzbl 22(%rdi), %ebx 7152; SCALAR-NEXT: notb %bl 
; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 23(%rdi), %r10d
; SCALAR-NEXT: notb %r10b
; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 24(%rdi), %r9d
; SCALAR-NEXT: notb %r9b
; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 25(%rdi), %ecx
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 26(%rdi), %r14d
; SCALAR-NEXT: notb %r14b
; SCALAR-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 27(%rdi), %r15d
; SCALAR-NEXT: notb %r15b
; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 28(%rdi), %r12d
; SCALAR-NEXT: notb %r12b
; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 29(%rdi), %r13d
; SCALAR-NEXT: notb %r13b
; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 30(%rdi), %eax
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 31(%rdi), %edi
; SCALAR-NEXT: notb %dil
; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movb %dil, 31(%rsi)
; SCALAR-NEXT: movb %al, 30(%rsi)
; SCALAR-NEXT: movb %r13b, 29(%rsi)
; SCALAR-NEXT: movb %r12b, 28(%rsi)
; SCALAR-NEXT: movb %r15b, 27(%rsi)
; SCALAR-NEXT: movb %r14b, 26(%rsi)
; SCALAR-NEXT: movb %cl, 25(%rsi)
; SCALAR-NEXT: movb %r9b, 24(%rsi)
; SCALAR-NEXT: movb %r10b, 23(%rsi)
; SCALAR-NEXT: movb %bl, 22(%rsi)
; SCALAR-NEXT: movb %bpl, 21(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
; SCALAR-NEXT: movb %bpl, 20(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 19(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 18(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 17(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; SCALAR-NEXT: movb %cl, 16(%rsi)
; SCALAR-NEXT: movb %r8b, 15(%rsi)
; SCALAR-NEXT: movl %r8d, %r14d
; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
; SCALAR-NEXT: movb %bl, 14(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 13(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 12(%rsi)
; SCALAR-NEXT: movb %r11b, 11(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
; SCALAR-NEXT: movb %dil, 10(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
; SCALAR-NEXT: movb %dil, 9(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
; SCALAR-NEXT: movb %dil, 8(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r11b, 7(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r13b, 6(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r10b, 5(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r12b, 4(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r9b, 3(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r15b, 2(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r8b, 1(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
; SCALAR-NEXT: movb %dil, (%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 31(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 30(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 29(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 28(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 27(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 26(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 25(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 24(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 23(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 22(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 21(%rdx)
; SCALAR-NEXT: movb %bpl, 20(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 19(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 18(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 17(%rdx)
; SCALAR-NEXT: movb %cl, 16(%rdx)
; SCALAR-NEXT: movb %r14b, 15(%rdx)
; SCALAR-NEXT: movb %bl, 14(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; SCALAR-NEXT: movb %cl, 13(%rdx)
; SCALAR-NEXT: movb %al, 12(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 11(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
; SCALAR-NEXT: movb %bl, 10(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r14b, 9(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
; SCALAR-NEXT: movb %bpl, 8(%rdx)
; SCALAR-NEXT: movb %r11b, 7(%rdx)
; SCALAR-NEXT: movb %r13b, 6(%rdx)
; SCALAR-NEXT: movb %r10b, 5(%rdx)
; SCALAR-NEXT: movb %r12b, 4(%rdx)
; SCALAR-NEXT: movb %r9b, 3(%rdx)
; SCALAR-NEXT: movb %r15b, 2(%rdx)
; SCALAR-NEXT: movb %r8b, 1(%rdx)
; SCALAR-NEXT: movb %dil, (%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 63(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 62(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 61(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 60(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 59(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 58(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 57(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 56(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 55(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 54(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 53(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 52(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 51(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 50(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 49(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 48(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 47(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 46(%rdx)
; SCALAR-NEXT: movb %cl, 45(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 44(%rdx)
; SCALAR-NEXT: movb %sil, 43(%rdx)
; SCALAR-NEXT: movb %bl, 42(%rdx)
; SCALAR-NEXT: movb %r14b, 41(%rdx)
; SCALAR-NEXT: movb %bpl, 40(%rdx)
; SCALAR-NEXT: movb %r11b, 39(%rdx)
; SCALAR-NEXT: movb %r13b, 38(%rdx)
; SCALAR-NEXT: movb %r10b, 37(%rdx)
; SCALAR-NEXT: movb %r12b, 36(%rdx)
; SCALAR-NEXT: movb %r9b, 35(%rdx)
; SCALAR-NEXT: movb %r15b, 34(%rdx)
; SCALAR-NEXT: movb %r8b, 33(%rdx)
; SCALAR-NEXT: movb %dil, 32(%rdx)
; SCALAR-NEXT: popq %rbx
; SCALAR-NEXT: popq %r12
; SCALAR-NEXT: popq %r13
; SCALAR-NEXT: popq %r14
; SCALAR-NEXT: popq %r15
; SCALAR-NEXT: popq %rbp
; SCALAR-NEXT: retq
;
; SSE2-LABEL: vec512_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm1, 16(%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm1, 16(%rdx)
; SSE2-NEXT: movdqa %xmm1, 48(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; SSE2-NEXT: retq
;
; AVX1-LABEL: vec512_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rsi)
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  %in.subvec.not = load <32 x i8>, ptr %in.subvec.ptr, align 64
  %in.subvec = xor <32 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  store <32 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <32 x i8>, ptr %out.vec.ptr, i64 0
  store <32 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <32 x i8>, ptr %out.vec.ptr, i64 1
  store <32 x i8> %in.subvec, ptr %out.subvec1.ptr, align 32
  ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; SSSE3: {{.*}}