; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mattr=sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512F

define void @foo(<4 x float> %in, ptr %out) {
; SSE2-LABEL: foo:
; SSE2:       # %bb.0:
; SSE2-NEXT:    cvttps2dq %xmm0, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movd %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: foo:
; SSE42:       # %bb.0:
; SSE42-NEXT:    cvttps2dq %xmm0, %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movl $255, %eax
; SSE42-NEXT:    pinsrb $3, %eax, %xmm0
; SSE42-NEXT:    movd %xmm0, (%rdi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: foo:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    movl $255, %eax
; AVX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, (%rdi)
; AVX-NEXT:    retq
  %t0 = fptosi <4 x float> %in to <4 x i32>
  %t1 = trunc <4 x i32> %t0 to <4 x i16>
  %t2 = shufflevector <4 x i16> %t1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t3 = trunc <8 x i16> %t2 to <8 x i8>
  %t4 = shufflevector <8 x i8> %t3, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t5 = insertelement <4 x i8> %t4, i8 -1, i32 3
  store <4 x i8> %t5, ptr %out
  ret void
}

define <16 x i64> @catcat(<4 x i64> %x) {
; SSE-LABEL: catcat:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT:    movdqa %xmm1, 112(%rdi)
; SSE-NEXT:    movdqa %xmm1, 96(%rdi)
; SSE-NEXT:    movdqa %xmm3, 80(%rdi)
; SSE-NEXT:    movdqa %xmm3, 64(%rdi)
; SSE-NEXT:    movdqa %xmm0, 48(%rdi)
; SSE-NEXT:    movdqa %xmm0, 32(%rdi)
; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: catcat:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm4
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    vmovddup {{.*#+}} ymm2 = ymm0[0,0,2,2]
; AVX1-NEXT:    vshufpd {{.*#+}} ymm3 = ymm0[1,1,3,3]
; AVX1-NEXT:    vmovaps %ymm4, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: catcat:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[1,1,1,1]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[3,3,3,3]
; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: catcat:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1]
; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [2,2,2,2,3,3,3,3]
; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm1
; AVX512F-NEXT:    vmovaps %zmm2, %zmm0
; AVX512F-NEXT:    retq
  %cat1 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %cat2 = shufflevector <8 x i64> %cat1, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %r = shufflevector <16 x i64> %cat2, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  ret <16 x i64> %r
}

define <16 x i64> @load_catcat(ptr %p) {
; SSE-LABEL: load_catcat:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa 16(%rsi), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT:    movdqa %xmm1, 112(%rdi)
; SSE-NEXT:    movdqa %xmm1, 96(%rdi)
; SSE-NEXT:    movdqa %xmm3, 80(%rdi)
; SSE-NEXT:    movdqa %xmm3, 64(%rdi)
; SSE-NEXT:    movdqa %xmm0, 48(%rdi)
; SSE-NEXT:    movdqa %xmm0, 32(%rdi)
; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: load_catcat:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm0
; AVX1-NEXT:    vbroadcastsd 8(%rdi), %ymm1
; AVX1-NEXT:    vbroadcastsd 16(%rdi), %ymm2
; AVX1-NEXT:    vbroadcastsd 24(%rdi), %ymm3
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_catcat:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
; AVX2-NEXT:    vbroadcastsd 8(%rdi), %ymm1
; AVX2-NEXT:    vbroadcastsd 16(%rdi), %ymm2
; AVX2-NEXT:    vbroadcastsd 24(%rdi), %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: load_catcat:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,4,0,4,1,5,1,5]
; AVX512F-NEXT:    vpermq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [2,6,2,6,3,7,3,7]
; AVX512F-NEXT:    vpermq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    retq
  %x = load <4 x i64>, ptr %p
  %cat1 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %cat2 = shufflevector <8 x i64> %cat1, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %r = shufflevector <16 x i64> %cat2, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  ret <16 x i64> %r
}

; Use weird types to make sure we do not miscompile a case where
; the source ops are not an even multiple size of the result.
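; The extract below straddles the two <6 x i32> sources: elements 4-7 of the
; 12-element concat are x[4], x[5], y[0], and y[1].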

define <4 x i32> @cat_ext_straddle(ptr %px, ptr %py) {
; SSE-LABEL: cat_ext_straddle:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps 16(%rdi), %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: cat_ext_straddle:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps 16(%rdi), %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    retq
  %x = load <6 x i32>, ptr %px
  %y = load <6 x i32>, ptr %py
  %cat = shufflevector <6 x i32> %x, <6 x i32> %y, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %ext = shufflevector <12 x i32> %cat, <12 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %ext
}