; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,FALLBACK0
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42,FALLBACK1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK7

;; These tests exercise shufflevector on vectors whose elements are illegally
;; wide integers (i128/i256/i512).  Each "load_single_*" test concatenates a
;; one-element vector with a zero vector (widening: copy the payload, zero the
;; upper half of the destination); each "store_single_*" test extracts element
;; zero of a two-element vector (narrowing: copy only the low half).  The %off
;; argument is unused by every function here.

;; <1 x i128> ++ zero -> <2 x i128>: one 16-byte copy plus one zeroed 16-byte
;; store, for every feature level.
define void @load_single_128bit_elt_vector(ptr %in, ptr %off, ptr %out) nounwind {
; SSE-LABEL: load_single_128bit_elt_vector:
; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: movaps %xmm1, 16(%rdx)
; SSE-NEXT: movaps %xmm0, (%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: load_single_128bit_elt_vector:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovaps %xmm1, 16(%rdx)
; AVX-NEXT: vmovaps %xmm0, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: load_single_128bit_elt_vector:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %xmm0
; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovaps %xmm1, 16(%rdx)
; AVX2-NEXT: vmovaps %xmm0, (%rdx)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_single_128bit_elt_vector:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vmovaps %xmm1, 16(%rdx)
; AVX512F-NEXT: vmovaps %xmm0, (%rdx)
; AVX512F-NEXT: retq
  %i0 = load <16 x i8>, ptr %in, align 64
  %i1 = bitcast <16 x i8> %i0 to <1 x i128>
  %i2 = shufflevector <1 x i128> %i1, <1 x i128> zeroinitializer, <2 x i32> <i32 0, i32 1>
  %i3 = bitcast <2 x i128> %i2 to <32 x i8>
  store <32 x i8> %i3, ptr %out, align 64
  ret void
}
;; <2 x i128> -> element 0: reduces to a plain 16-byte copy.
define void @store_single_128bit_elt_vector(ptr %in, ptr %off, ptr %out) nounwind {
; SSE-LABEL: store_single_128bit_elt_vector:
; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps %xmm0, (%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: store_single_128bit_elt_vector:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_single_128bit_elt_vector:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, (%rdx)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: store_single_128bit_elt_vector:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
; AVX512F-NEXT: vmovaps %xmm0, (%rdx)
; AVX512F-NEXT: retq
  %i0 = load <32 x i8>, ptr %in, align 64
  %i1 = bitcast <32 x i8> %i0 to <2 x i128>
  %i2 = shufflevector <2 x i128> %i1, <2 x i128> poison, <1 x i32> <i32 0>
  %i3 = bitcast <1 x i128> %i2 to <16 x i8>
  store <16 x i8> %i3, ptr %out, align 64
  ret void
}

;; <1 x i256> ++ zero -> <2 x i256>: 32-byte copy plus 32 zeroed bytes
;; (two 16-byte chunks on the SSE targets, one ymm op with AVX and later).
define void @load_single_256bit_elt_vector(ptr %in, ptr %off, ptr %out) nounwind {
; SSE-LABEL: load_single_256bit_elt_vector:
; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: movaps %xmm2, 48(%rdx)
; SSE-NEXT: movaps %xmm2, 32(%rdx)
; SSE-NEXT: movaps %xmm0, (%rdx)
; SSE-NEXT: movaps %xmm1, 16(%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: load_single_256bit_elt_vector:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %ymm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovaps %ymm1, 32(%rdx)
; AVX-NEXT: vmovaps %ymm0, (%rdx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: load_single_256bit_elt_vector:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
; AVX2-NEXT: vmovaps %ymm0, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_single_256bit_elt_vector:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vmovaps %ymm1, 32(%rdx)
; AVX512F-NEXT: vmovaps %ymm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
  %i0 = load <32 x i8>, ptr %in, align 64
  %i1 = bitcast <32 x i8> %i0 to <1 x i256>
  %i2 = shufflevector <1 x i256> %i1, <1 x i256> zeroinitializer, <2 x i32> <i32 0, i32 1>
  %i3 = bitcast <2 x i256> %i2 to <64 x i8>
  store <64 x i8> %i3, ptr %out, align 64
  ret void
}
;; <2 x i256> -> element 0: reduces to a plain 32-byte copy.
define void @store_single_256bit_elt_vector(ptr %in, ptr %off, ptr %out) nounwind {
; SSE-LABEL: store_single_256bit_elt_vector:
; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps %xmm0, (%rdx)
; SSE-NEXT: movaps %xmm1, 16(%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: store_single_256bit_elt_vector:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %ymm0
; AVX-NEXT: vmovaps %ymm0, (%rdx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: store_single_256bit_elt_vector:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps %ymm0, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: store_single_256bit_elt_vector:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
; AVX512F-NEXT: vmovaps %ymm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
  %i0 = load <64 x i8>, ptr %in, align 64
  %i1 = bitcast <64 x i8> %i0 to <2 x i256>
  %i2 = shufflevector <2 x i256> %i1, <2 x i256> poison, <1 x i32> <i32 0>
  %i3 = bitcast <1 x i256> %i2 to <32 x i8>
  store <32 x i8> %i3, ptr %out, align 64
  ret void
}

;; <1 x i512> ++ zero -> <2 x i512>: 64-byte copy plus 64 zeroed bytes,
;; chunked per the widest available register (xmm / ymm / zmm).
define void @load_single_512bit_elt_vector(ptr %in, ptr %off, ptr %out) nounwind {
; SSE-LABEL: load_single_512bit_elt_vector:
; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movaps 48(%rdi), %xmm3
; SSE-NEXT: xorps %xmm4, %xmm4
; SSE-NEXT: movaps %xmm4, 112(%rdx)
; SSE-NEXT: movaps %xmm4, 96(%rdx)
; SSE-NEXT: movaps %xmm4, 80(%rdx)
; SSE-NEXT: movaps %xmm4, 64(%rdx)
; SSE-NEXT: movaps %xmm3, 48(%rdx)
; SSE-NEXT: movaps %xmm2, 32(%rdx)
; SSE-NEXT: movaps %xmm1, 16(%rdx)
; SSE-NEXT: movaps %xmm0, (%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: load_single_512bit_elt_vector:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %ymm0
; AVX-NEXT: vmovaps 32(%rdi), %ymm1
; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX-NEXT: vmovaps %ymm2, 96(%rdx)
; AVX-NEXT: vmovaps %ymm2, 64(%rdx)
; AVX-NEXT: vmovaps %ymm0, (%rdx)
; AVX-NEXT: vmovaps %ymm1, 32(%rdx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: load_single_512bit_elt_vector:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vmovaps %ymm2, 96(%rdx)
; AVX2-NEXT: vmovaps %ymm2, 64(%rdx)
; AVX2-NEXT: vmovaps %ymm0, (%rdx)
; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_single_512bit_elt_vector:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %zmm0
; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vmovaps %zmm1, 64(%rdx)
; AVX512F-NEXT: vmovaps %zmm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
  %i0 = load <64 x i8>, ptr %in, align 128
  %i1 = bitcast <64 x i8> %i0 to <1 x i512>
  %i2 = shufflevector <1 x i512> %i1, <1 x i512> zeroinitializer, <2 x i32> <i32 0, i32 1>
  %i3 = bitcast <2 x i512> %i2 to <128 x i8>
  store <128 x i8> %i3, ptr %out, align 128
  ret void
}
;; <2 x i512> -> element 0: reduces to a plain 64-byte copy.
define void @store_single_512bit_elt_vector(ptr %in, ptr %off, ptr %out) nounwind {
; SSE-LABEL: store_single_512bit_elt_vector:
; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movaps 48(%rdi), %xmm3
; SSE-NEXT: movaps %xmm3, 48(%rdx)
; SSE-NEXT: movaps %xmm0, (%rdx)
; SSE-NEXT: movaps %xmm1, 16(%rdx)
; SSE-NEXT: movaps %xmm2, 32(%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: store_single_512bit_elt_vector:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %ymm0
; AVX-NEXT: vmovaps 32(%rdi), %ymm1
; AVX-NEXT: vmovaps %ymm0, (%rdx)
; AVX-NEXT: vmovaps %ymm1, 32(%rdx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: store_single_512bit_elt_vector:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-NEXT: vmovaps %ymm0, (%rdx)
; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: store_single_512bit_elt_vector:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %zmm0
; AVX512F-NEXT: vmovaps %zmm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
  %i0 = load <128 x i8>, ptr %in, align 128
  %i1 = bitcast <128 x i8> %i0 to <2 x i512>
  %i2 = shufflevector <2 x i512> %i1, <2 x i512> poison, <1 x i32> <i32 0>
  %i3 = bitcast <1 x i512> %i2 to <64 x i8>
  store <64 x i8> %i3, ptr %out, align 128
  ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX1-ONLY: {{.*}}
; AVX2-FAST: {{.*}}
; AVX2-FAST-PERLANE: {{.*}}
; AVX2-SLOW: {{.*}}
; AVX512F-FAST: {{.*}}
; AVX512F-SLOW: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK2: {{.*}}
; FALLBACK3: {{.*}}
; FALLBACK4: {{.*}}
; FALLBACK5: {{.*}}
; FALLBACK6: {{.*}}
; FALLBACK7: {{.*}}
; SSE2: {{.*}}
; SSE42: {{.*}}