; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zvl128b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV32-V128
; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zvl128b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV64-V128
; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zvl512b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV32-V512
; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zvl512b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV64-V512

; Test optimizing interleaves to widening arithmetic.
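; A rough sketch of the lowering these tests expect: vwaddu.vv computes
; zext(x) + zext(y) into double-width elements, and vwmaccu.vx with the
; scalar -1 (which reads as 2^SEW - 1) then adds (2^SEW - 1) * zext(y),
; giving zext(x) + 2^SEW * zext(y).  Each double-width lane therefore holds
; x[i] in its low half and y[i] in its high half, which is the interleaved
; vector once the result is reinterpreted as elements of the original width.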

define <4 x i8> @interleave_v2i8(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: interleave_v2i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-NEXT:    vwaddu.vv v10, v8, v9
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
; CHECK-NEXT:    vmv1r.v v8, v10
; CHECK-NEXT:    ret
  %a = shufflevector <2 x i8> %x, <2 x i8> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x i8> %a
}

define <4 x i16> @interleave_v2i16(<2 x i16> %x, <2 x i16> %y) {
; CHECK-LABEL: interleave_v2i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT:    vwaddu.vv v10, v8, v9
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
; CHECK-NEXT:    vmv1r.v v8, v10
; CHECK-NEXT:    ret
  %a = shufflevector <2 x i16> %x, <2 x i16> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x i16> %a
}

; Vector order switched for coverage.
define <4 x i32> @interleave_v2i32(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: interleave_v2i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vwaddu.vv v10, v9, v8
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwmaccu.vx v10, a0, v8
; CHECK-NEXT:    vmv1r.v v8, v10
; CHECK-NEXT:    ret
  %a = shufflevector <2 x i32> %x, <2 x i32> %y, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
  ret <4 x i32> %a
}

; One vXi64 test case to verify that we don't optimize it.
; FIXME: Is there better codegen we can do here?
define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) {
; V128-LABEL: interleave_v2i64:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; V128-NEXT:    vmv1r.v v12, v9
; V128-NEXT:    vid.v v9
; V128-NEXT:    vmv.v.i v0, 10
; V128-NEXT:    vsrl.vi v14, v9, 1
; V128-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
; V128-NEXT:    vrgatherei16.vv v10, v8, v14
; V128-NEXT:    vrgatherei16.vv v10, v12, v14, v0.t
; V128-NEXT:    vmv.v.v v8, v10
; V128-NEXT:    ret
;
; RV32-V512-LABEL: interleave_v2i64:
; RV32-V512:       # %bb.0:
; RV32-V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
; RV32-V512-NEXT:    vid.v v10
; RV32-V512-NEXT:    vsrl.vi v11, v10, 1
; RV32-V512-NEXT:    vmv.v.i v0, 10
; RV32-V512-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
; RV32-V512-NEXT:    vrgatherei16.vv v10, v8, v11
; RV32-V512-NEXT:    vrgatherei16.vv v10, v9, v11, v0.t
; RV32-V512-NEXT:    vmv.v.v v8, v10
; RV32-V512-NEXT:    ret
;
; RV64-V512-LABEL: interleave_v2i64:
; RV64-V512:       # %bb.0:
; RV64-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, mu
; RV64-V512-NEXT:    vid.v v10
; RV64-V512-NEXT:    vsrl.vi v11, v10, 1
; RV64-V512-NEXT:    vmv.v.i v0, 10
; RV64-V512-NEXT:    vrgather.vv v10, v8, v11
; RV64-V512-NEXT:    vrgather.vv v10, v9, v11, v0.t
; RV64-V512-NEXT:    vmv.v.v v8, v10
; RV64-V512-NEXT:    ret
  %a = shufflevector <2 x i64> %x, <2 x i64> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x i64> %a
}

; Vector order switched for coverage.
define <8 x i8> @interleave_v4i8(<4 x i8> %x, <4 x i8> %y) {
; V128-LABEL: interleave_v4i8:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; V128-NEXT:    vwaddu.vv v10, v9, v8
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v10, a0, v8
; V128-NEXT:    vmv1r.v v8, v10
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v4i8:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
; V512-NEXT:    vwaddu.vv v10, v9, v8
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v8
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
  ret <8 x i8> %a
}

; Undef elements for coverage
define <8 x i16> @interleave_v4i16(<4 x i16> %x, <4 x i16> %y) {
; V128-LABEL: interleave_v4i16:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; V128-NEXT:    vwaddu.vv v10, v8, v9
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v10, a0, v9
; V128-NEXT:    vmv1r.v v8, v10
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v4i16:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
; V512-NEXT:    vwaddu.vv v10, v8, v9
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v9
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <4 x i16> %x, <4 x i16> %y, <8 x i32> <i32 0, i32 4, i32 undef, i32 5, i32 2, i32 undef, i32 3, i32 7>
  ret <8 x i16> %a
}

define <8 x i32> @interleave_v4i32(<4 x i32> %x, <4 x i32> %y) {
; V128-LABEL: interleave_v4i32:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; V128-NEXT:    vwaddu.vv v10, v8, v9
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v10, a0, v9
; V128-NEXT:    vmv2r.v v8, v10
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v4i32:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
; V512-NEXT:    vwaddu.vv v10, v8, v9
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v9
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i32> %a
}

; %y should be slid down by 2
define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) {
; V128-LABEL: interleave_v4i32_offset_2:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
; V128-NEXT:    vslidedown.vi v10, v9, 2
; V128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; V128-NEXT:    vwaddu.vv v9, v8, v10
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v9, a0, v10
; V128-NEXT:    vmv1r.v v8, v9
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v4i32_offset_2:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; V512-NEXT:    vslidedown.vi v10, v9, 2
; V512-NEXT:    vwaddu.vv v9, v8, v10
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v9, a0, v10
; V512-NEXT:    vmv1r.v v8, v9
; V512-NEXT:    ret
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 6, i32 1, i32 7>
  ret <4 x i32> %a
}

; %y should be slid down by 1
define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
; V128-LABEL: interleave_v4i32_offset_1:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; V128-NEXT:    vid.v v10
; V128-NEXT:    vmv.v.i v0, 10
; V128-NEXT:    vsrl.vi v10, v10, 1
; V128-NEXT:    vadd.vi v11, v10, 1
; V128-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; V128-NEXT:    vzext.vf2 v10, v8
; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
; V128-NEXT:    vrgather.vv v10, v9, v11, v0.t
; V128-NEXT:    vmv.v.v v8, v10
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v4i32_offset_1:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
; V512-NEXT:    vid.v v10
; V512-NEXT:    vmv.v.i v0, 10
; V512-NEXT:    vsrl.vi v10, v10, 1
; V512-NEXT:    vadd.vi v11, v10, 1
; V512-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; V512-NEXT:    vzext.vf2 v10, v8
; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, mu
; V512-NEXT:    vrgather.vv v10, v9, v11, v0.t
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 5, i32 1, i32 6>
  ret <4 x i32> %a
}

define <16 x i8> @interleave_v8i8(<8 x i8> %x, <8 x i8> %y) {
; V128-LABEL: interleave_v8i8:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; V128-NEXT:    vwaddu.vv v10, v8, v9
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v10, a0, v9
; V128-NEXT:    vmv1r.v v8, v10
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v8i8:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 8, e8, mf8, ta, ma
; V512-NEXT:    vwaddu.vv v10, v8, v9
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v9
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <16 x i8> %a
}

; Vector order switched for coverage.
define <16 x i16> @interleave_v8i16(<8 x i16> %x, <8 x i16> %y) {
; V128-LABEL: interleave_v8i16:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; V128-NEXT:    vwaddu.vv v10, v9, v8
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v10, a0, v8
; V128-NEXT:    vmv2r.v v8, v10
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v8i16:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 8, e16, mf4, ta, ma
; V512-NEXT:    vwaddu.vv v10, v9, v8
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v8
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
  ret <16 x i16> %a
}

define <16 x i32> @interleave_v8i32(<8 x i32> %x, <8 x i32> %y) {
; V128-LABEL: interleave_v8i32:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; V128-NEXT:    vwaddu.vv v12, v8, v10
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v12, a0, v10
; V128-NEXT:    vmv4r.v v8, v12
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v8i32:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 8, e32, mf2, ta, ma
; V512-NEXT:    vwaddu.vv v10, v8, v9
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v9
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <8 x i32> %x, <8 x i32> %y, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <16 x i32> %a
}

define <32 x i8> @interleave_v16i8(<16 x i8> %x, <16 x i8> %y) {
; V128-LABEL: interleave_v16i8:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; V128-NEXT:    vwaddu.vv v10, v8, v9
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v10, a0, v9
; V128-NEXT:    vmv2r.v v8, v10
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v16i8:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 16, e8, mf4, ta, ma
; V512-NEXT:    vwaddu.vv v10, v8, v9
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v9
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <16 x i8> %x, <16 x i8> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <32 x i8> %a
}

define <32 x i16> @interleave_v16i16(<16 x i16> %x, <16 x i16> %y) {
; V128-LABEL: interleave_v16i16:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; V128-NEXT:    vwaddu.vv v12, v8, v10
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v12, a0, v10
; V128-NEXT:    vmv4r.v v8, v12
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v16i16:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 16, e16, mf2, ta, ma
; V512-NEXT:    vwaddu.vv v10, v8, v9
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v9
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <16 x i16> %x, <16 x i16> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <32 x i16> %a
}

define <32 x i32> @interleave_v16i32(<16 x i32> %x, <16 x i32> %y) {
; V128-LABEL: interleave_v16i32:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; V128-NEXT:    vwaddu.vv v16, v8, v12
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v16, a0, v12
; V128-NEXT:    vmv8r.v v8, v16
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v16i32:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
; V512-NEXT:    vwaddu.vv v10, v8, v9
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v9
; V512-NEXT:    vmv2r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <16 x i32> %x, <16 x i32> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <32 x i32> %a
}

define <64 x i8> @interleave_v32i8(<32 x i8> %x, <32 x i8> %y) {
; V128-LABEL: interleave_v32i8:
; V128:       # %bb.0:
; V128-NEXT:    li a0, 32
; V128-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
; V128-NEXT:    vwaddu.vv v12, v8, v10
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v12, a0, v10
; V128-NEXT:    vmv4r.v v8, v12
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v32i8:
; V512:       # %bb.0:
; V512-NEXT:    li a0, 32
; V512-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
; V512-NEXT:    vwaddu.vv v10, v8, v9
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v9
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <32 x i8> %x, <32 x i8> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  ret <64 x i8> %a
}

define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) {
; V128-LABEL: interleave_v32i16:
; V128:       # %bb.0:
; V128-NEXT:    li a0, 32
; V128-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
; V128-NEXT:    vwaddu.vv v16, v8, v12
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v16, a0, v12
; V128-NEXT:    vmv8r.v v8, v16
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v32i16:
; V512:       # %bb.0:
; V512-NEXT:    li a0, 32
; V512-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
; V512-NEXT:    vwaddu.vv v10, v8, v9
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v9
; V512-NEXT:    vmv2r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <32 x i16> %x, <32 x i16> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  ret <64 x i16> %a
}

define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
; V128-LABEL: interleave_v32i32:
; V128:       # %bb.0:
; V128-NEXT:    addi sp, sp, -16
; V128-NEXT:    .cfi_def_cfa_offset 16
; V128-NEXT:    csrr a0, vlenb
; V128-NEXT:    slli a0, a0, 3
; V128-NEXT:    sub sp, sp, a0
; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; V128-NEXT:    addi a0, sp, 16
; V128-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
; V128-NEXT:    vslidedown.vi v24, v16, 16
; V128-NEXT:    li a0, 32
; V128-NEXT:    lui a1, 699051
; V128-NEXT:    vslidedown.vi v0, v8, 16
; V128-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; V128-NEXT:    vzext.vf2 v8, v24
; V128-NEXT:    addi a1, a1, -1366
; V128-NEXT:    vzext.vf2 v24, v0
; V128-NEXT:    vmv.s.x v0, a1
; V128-NEXT:    vsll.vx v8, v8, a0
; V128-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
; V128-NEXT:    vmerge.vvm v24, v24, v8, v0
; V128-NEXT:    addi a0, sp, 16
; V128-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; V128-NEXT:    vwaddu.vv v0, v8, v16
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v0, a0, v16
; V128-NEXT:    vmv8r.v v8, v0
; V128-NEXT:    vmv8r.v v16, v24
; V128-NEXT:    csrr a0, vlenb
; V128-NEXT:    slli a0, a0, 3
; V128-NEXT:    add sp, sp, a0
; V128-NEXT:    .cfi_def_cfa sp, 16
; V128-NEXT:    addi sp, sp, 16
; V128-NEXT:    .cfi_def_cfa_offset 0
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v32i32:
; V512:       # %bb.0:
; V512-NEXT:    li a0, 32
; V512-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
; V512-NEXT:    vwaddu.vv v12, v8, v10
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v12, a0, v10
; V512-NEXT:    vmv4r.v v8, v12
; V512-NEXT:    ret
  %a = shufflevector <32 x i32> %x, <32 x i32> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  ret <64 x i32> %a
}

define <4 x i8> @unary_interleave_v4i8(<4 x i8> %x) {
; V128-LABEL: unary_interleave_v4i8:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
; V128-NEXT:    vslidedown.vi v10, v8, 2
; V128-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
; V128-NEXT:    vwaddu.vv v9, v8, v10
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v9, a0, v10
; V128-NEXT:    vmv1r.v v8, v9
; V128-NEXT:    ret
;
; V512-LABEL: unary_interleave_v4i8:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
; V512-NEXT:    vslidedown.vi v10, v8, 2
; V512-NEXT:    vwaddu.vv v9, v8, v10
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v9, a0, v10
; V512-NEXT:    vmv1r.v v8, v9
; V512-NEXT:    ret
  %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x i8> %a
}

; This shouldn't be interleaved
define <4 x i8> @unary_interleave_v4i8_invalid(<4 x i8> %x) {
; V128-LABEL: unary_interleave_v4i8_invalid:
; V128:       # %bb.0:
; V128-NEXT:    lui a0, 16
; V128-NEXT:    addi a0, a0, 768
; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; V128-NEXT:    vmv.s.x v10, a0
; V128-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
; V128-NEXT:    vrgather.vv v9, v8, v10
; V128-NEXT:    vmv1r.v v8, v9
; V128-NEXT:    ret
;
; V512-LABEL: unary_interleave_v4i8_invalid:
; V512:       # %bb.0:
; V512-NEXT:    lui a0, 16
; V512-NEXT:    addi a0, a0, 768
; V512-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; V512-NEXT:    vmv.s.x v10, a0
; V512-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
; V512-NEXT:    vrgather.vv v9, v8, v10
; V512-NEXT:    vmv1r.v v8, v9
; V512-NEXT:    ret
  %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 4>
  ret <4 x i8> %a
}

define <4 x i16> @unary_interleave_v4i16(<4 x i16> %x) {
; V128-LABEL: unary_interleave_v4i16:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
; V128-NEXT:    vslidedown.vi v10, v8, 2
; V128-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; V128-NEXT:    vwaddu.vv v9, v8, v10
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v9, a0, v10
; V128-NEXT:    vmv1r.v v8, v9
; V128-NEXT:    ret
;
; V512-LABEL: unary_interleave_v4i16:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; V512-NEXT:    vslidedown.vi v10, v8, 2
; V512-NEXT:    vwaddu.vv v9, v8, v10
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v9, a0, v10
; V512-NEXT:    vmv1r.v v8, v9
; V512-NEXT:    ret
  %a = shufflevector <4 x i16> %x, <4 x i16> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x i16> %a
}

define <4 x i32> @unary_interleave_v4i32(<4 x i32> %x) {
; V128-LABEL: unary_interleave_v4i32:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
; V128-NEXT:    vslidedown.vi v10, v8, 2
; V128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; V128-NEXT:    vwaddu.vv v9, v8, v10
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v9, a0, v10
; V128-NEXT:    vmv1r.v v8, v9
; V128-NEXT:    ret
;
; V512-LABEL: unary_interleave_v4i32:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; V512-NEXT:    vslidedown.vi v10, v8, 2
; V512-NEXT:    vwaddu.vv v9, v8, v10
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v9, a0, v10
; V512-NEXT:    vmv1r.v v8, v9
; V512-NEXT:    ret
  %a = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x i32> %a
}

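; In the unary interleave tests above, the high half of the single source is
; first moved into its own register with vslidedown.vi, and the same widening
; vwaddu.vv/vwmaccu.vx pattern is then expected to produce the interleave of
; the low and high halves.
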
; FIXME: Is there better codegen we can do here?
define <4 x i64> @unary_interleave_v4i64(<4 x i64> %x) {
; V128-LABEL: unary_interleave_v4i64:
; V128:       # %bb.0:
; V128-NEXT:    lui a0, 12304
; V128-NEXT:    addi a0, a0, 512
; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; V128-NEXT:    vmv.s.x v10, a0
; V128-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; V128-NEXT:    vsext.vf2 v12, v10
; V128-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
; V128-NEXT:    vrgatherei16.vv v10, v8, v12
; V128-NEXT:    vmv.v.v v8, v10
; V128-NEXT:    ret
;
; RV32-V512-LABEL: unary_interleave_v4i64:
; RV32-V512:       # %bb.0:
; RV32-V512-NEXT:    lui a0, 12304
; RV32-V512-NEXT:    addi a0, a0, 512
; RV32-V512-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-V512-NEXT:    vmv.s.x v9, a0
; RV32-V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
; RV32-V512-NEXT:    vsext.vf2 v10, v9
; RV32-V512-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
; RV32-V512-NEXT:    vrgatherei16.vv v9, v8, v10
; RV32-V512-NEXT:    vmv.v.v v8, v9
; RV32-V512-NEXT:    ret
;
; RV64-V512-LABEL: unary_interleave_v4i64:
; RV64-V512:       # %bb.0:
; RV64-V512-NEXT:    lui a0, 12304
; RV64-V512-NEXT:    addi a0, a0, 512
; RV64-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
; RV64-V512-NEXT:    vmv.s.x v9, a0
; RV64-V512-NEXT:    vsext.vf8 v10, v9
; RV64-V512-NEXT:    vrgather.vv v9, v8, v10
; RV64-V512-NEXT:    vmv.v.v v8, v9
; RV64-V512-NEXT:    ret
  %a = shufflevector <4 x i64> %x, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x i64> %a
}

define <8 x i8> @unary_interleave_v8i8(<8 x i8> %x) {
; V128-LABEL: unary_interleave_v8i8:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
; V128-NEXT:    vslidedown.vi v10, v8, 4
; V128-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; V128-NEXT:    vwaddu.vv v9, v8, v10
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v9, a0, v10
; V128-NEXT:    vmv1r.v v8, v9
; V128-NEXT:    ret
;
; V512-LABEL: unary_interleave_v8i8:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
; V512-NEXT:    vslidedown.vi v10, v8, 4
; V512-NEXT:    vwaddu.vv v9, v8, v10
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v9, a0, v10
; V512-NEXT:    vmv1r.v v8, v9
; V512-NEXT:    ret
  %a = shufflevector <8 x i8> %x, <8 x i8> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 undef, i32 6, i32 3, i32 7>
  ret <8 x i8> %a
}

define <8 x i16> @unary_interleave_v8i16(<8 x i16> %x) {
; V128-LABEL: unary_interleave_v8i16:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
; V128-NEXT:    vslidedown.vi v10, v8, 4
; V128-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; V128-NEXT:    vwaddu.vv v9, v10, v8
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v9, a0, v8
; V128-NEXT:    vmv1r.v v8, v9
; V128-NEXT:    ret
;
; V512-LABEL: unary_interleave_v8i16:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
; V512-NEXT:    vslidedown.vi v10, v8, 4
; V512-NEXT:    vwaddu.vv v9, v10, v8
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v9, a0, v8
; V512-NEXT:    vmv1r.v v8, v9
; V512-NEXT:    ret
  %a = shufflevector <8 x i16> %x, <8 x i16> poison, <8 x i32> <i32 4, i32 undef, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
  ret <8 x i16> %a
}

define <8 x i32> @unary_interleave_v8i32(<8 x i32> %x) {
; V128-LABEL: unary_interleave_v8i32:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
; V128-NEXT:    vslidedown.vi v12, v8, 4
; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; V128-NEXT:    vwaddu.vv v10, v8, v12
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v10, a0, v12
; V128-NEXT:    vmv2r.v v8, v10
; V128-NEXT:    ret
;
; V512-LABEL: unary_interleave_v8i32:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
; V512-NEXT:    vslidedown.vi v10, v8, 4
; V512-NEXT:    vwaddu.vv v9, v8, v10
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v9, a0, v10
; V512-NEXT:    vmv1r.v v8, v9
; V512-NEXT:    ret
  %a = shufflevector <8 x i32> %x, <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i32> %a
}

; This interleaves the first 2 elements of a vector in opposite order, with
; undefs for the remaining elements. We used to miscompile this.
define <4 x i8> @unary_interleave_10uu_v4i8(<4 x i8> %x) {
; CHECK-LABEL: unary_interleave_10uu_v4i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT:    vsrl.vi v9, v8, 8
; CHECK-NEXT:    vsll.vi v8, v8, 8
; CHECK-NEXT:    vor.vv v8, v8, v9
; CHECK-NEXT:    ret
  %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
  ret <4 x i8> %a
}

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; RV32-V128: {{.*}}
; RV64-V128: {{.*}}