; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECKLE
; RUN: llc < %s -mtriple=aarch64_be | FileCheck %s --check-prefixes=CHECKBE

; Each test loads two vectors, builds the "even lanes interleaved" and
; "odd lanes interleaved" shuffles of them, and adds the two results.
; The assertions pin the instruction sequence llc emits for little-endian
; (CHECKLE prefix) and big-endian (CHECKBE prefix) AArch64.

; 64-bit vector of i8: even/odd interleave is expected to select trn1/trn2.
define <8 x i8> @vtrni8(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrni8:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr d0, [x0]
; CHECKLE-NEXT:    ldr d1, [x1]
; CHECKLE-NEXT:    trn1 v2.8b, v0.8b, v1.8b
; CHECKLE-NEXT:    trn2 v0.8b, v0.8b, v1.8b
; CHECKLE-NEXT:    add v0.8b, v2.8b, v0.8b
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrni8:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.8b }, [x0]
; CHECKBE-NEXT:    ld1 { v1.8b }, [x1]
; CHECKBE-NEXT:    trn1 v2.8b, v0.8b, v1.8b
; CHECKBE-NEXT:    trn2 v0.8b, v0.8b, v1.8b
; CHECKBE-NEXT:    add v0.8b, v2.8b, v0.8b
; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
; CHECKBE-NEXT:    ret
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

; 64-bit vector of i16: same even/odd interleave pattern on .4h lanes.
define <4 x i16> @vtrni16(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrni16:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr d0, [x0]
; CHECKLE-NEXT:    ldr d1, [x1]
; CHECKLE-NEXT:    trn1 v2.4h, v0.4h, v1.4h
; CHECKLE-NEXT:    trn2 v0.4h, v0.4h, v1.4h
; CHECKLE-NEXT:    add v0.4h, v2.4h, v0.4h
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrni16:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.4h }, [x0]
; CHECKBE-NEXT:    ld1 { v1.4h }, [x1]
; CHECKBE-NEXT:    trn1 v2.4h, v0.4h, v1.4h
; CHECKBE-NEXT:    trn2 v0.4h, v0.4h, v1.4h
; CHECKBE-NEXT:    add v0.4h, v2.4h, v0.4h
; CHECKBE-NEXT:    rev64 v0.4h, v0.4h
; CHECKBE-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

; The inputs are loaded as <4 x i16> and bitcast to <8 x i8>; the byte-level
; mask moves bytes in adjacent pairs, and the expected output is a single
; trn1 on .4h lanes.
define <8 x i8> @vtrni16_viabitcast(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrni16_viabitcast:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr d0, [x0]
; CHECKLE-NEXT:    ldr d1, [x1]
; CHECKLE-NEXT:    trn1 v0.4h, v0.4h, v1.4h
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrni16_viabitcast:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.4h }, [x0]
; CHECKBE-NEXT:    ld1 { v1.4h }, [x1]
; CHECKBE-NEXT:    trn1 v0.4h, v0.4h, v1.4h
; CHECKBE-NEXT:    rev64 v0.4h, v0.4h
; CHECKBE-NEXT:    ret
  %l1 = load <4 x i16>, ptr %A
  %l2 = load <4 x i16>, ptr %B
  %b1 = bitcast <4 x i16> %l1 to <8 x i8>
  %b2 = bitcast <4 x i16> %l2 to <8 x i8>
  %tmp3 = shufflevector <8 x i8> %b1, <8 x i8> %b2, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
  ret <8 x i8> %tmp3
}

; 2xi32 TRN is redundant with ZIP
define <2 x i32> @vtrni32(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrni32:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr d0, [x0]
; CHECKLE-NEXT:    ldr d1, [x1]
; CHECKLE-NEXT:    zip1 v2.2s, v0.2s, v1.2s
; CHECKLE-NEXT:    zip2 v0.2s, v0.2s, v1.2s
; CHECKLE-NEXT:    add v0.2s, v2.2s, v0.2s
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrni32:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.2s }, [x0]
; CHECKBE-NEXT:    ld1 { v1.2s }, [x1]
; CHECKBE-NEXT:    zip1 v2.2s, v0.2s, v1.2s
; CHECKBE-NEXT:    zip2 v0.2s, v0.2s, v1.2s
; CHECKBE-NEXT:    add v0.2s, v2.2s, v0.2s
; CHECKBE-NEXT:    rev64 v0.2s, v0.2s
; CHECKBE-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 0, i32 2>
  %tmp4 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 3>
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

; 2 x float variant of the two-lane case above; zip1/zip2 are expected here
; as well, combined with fadd.
define <2 x float> @vtrnf(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrnf:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr d0, [x0]
; CHECKLE-NEXT:    ldr d1, [x1]
; CHECKLE-NEXT:    zip1 v2.2s, v0.2s, v1.2s
; CHECKLE-NEXT:    zip2 v0.2s, v0.2s, v1.2s
; CHECKLE-NEXT:    fadd v0.2s, v2.2s, v0.2s
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrnf:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.2s }, [x0]
; CHECKBE-NEXT:    ld1 { v1.2s }, [x1]
; CHECKBE-NEXT:    zip1 v2.2s, v0.2s, v1.2s
; CHECKBE-NEXT:    zip2 v0.2s, v0.2s, v1.2s
; CHECKBE-NEXT:    fadd v0.2s, v2.2s, v0.2s
; CHECKBE-NEXT:    rev64 v0.2s, v0.2s
; CHECKBE-NEXT:    ret
  %tmp1 = load <2 x float>, ptr %A
  %tmp2 = load <2 x float>, ptr %B
  %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> <i32 0, i32 2>
  %tmp4 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> <i32 1, i32 3>
  %tmp5 = fadd <2 x float> %tmp3, %tmp4
  ret <2 x float> %tmp5
}

; 128-bit vector of i8. The big-endian sequence ends with rev64 followed by
; ext #8 before the return.
define <16 x i8> @vtrnQi8(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrnQi8:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr q0, [x0]
; CHECKLE-NEXT:    ldr q1, [x1]
; CHECKLE-NEXT:    trn1 v2.16b, v0.16b, v1.16b
; CHECKLE-NEXT:    trn2 v0.16b, v0.16b, v1.16b
; CHECKLE-NEXT:    add v0.16b, v2.16b, v0.16b
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrnQi8:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.16b }, [x0]
; CHECKBE-NEXT:    ld1 { v1.16b }, [x1]
; CHECKBE-NEXT:    trn1 v2.16b, v0.16b, v1.16b
; CHECKBE-NEXT:    trn2 v0.16b, v0.16b, v1.16b
; CHECKBE-NEXT:    add v0.16b, v2.16b, v0.16b
; CHECKBE-NEXT:    rev64 v0.16b, v0.16b
; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT:    ret
  %tmp1 = load <16 x i8>, ptr %A
  %tmp2 = load <16 x i8>, ptr %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
  %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

; 128-bit vector of i16.
define <8 x i16> @vtrnQi16(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrnQi16:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr q0, [x0]
; CHECKLE-NEXT:    ldr q1, [x1]
; CHECKLE-NEXT:    trn1 v2.8h, v0.8h, v1.8h
; CHECKLE-NEXT:    trn2 v0.8h, v0.8h, v1.8h
; CHECKLE-NEXT:    add v0.8h, v2.8h, v0.8h
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrnQi16:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.8h }, [x0]
; CHECKBE-NEXT:    ld1 { v1.8h }, [x1]
; CHECKBE-NEXT:    trn1 v2.8h, v0.8h, v1.8h
; CHECKBE-NEXT:    trn2 v0.8h, v0.8h, v1.8h
; CHECKBE-NEXT:    add v0.8h, v2.8h, v0.8h
; CHECKBE-NEXT:    rev64 v0.8h, v0.8h
; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = load <8 x i16>, ptr %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

; 128-bit vector of i32: with four lanes the expected selection is trn1/trn2
; (unlike the two-lane <2 x i32> case, which expects zip1/zip2).
define <4 x i32> @vtrnQi32(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrnQi32:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr q0, [x0]
; CHECKLE-NEXT:    ldr q1, [x1]
; CHECKLE-NEXT:    trn1 v2.4s, v0.4s, v1.4s
; CHECKLE-NEXT:    trn2 v0.4s, v0.4s, v1.4s
; CHECKLE-NEXT:    add v0.4s, v2.4s, v0.4s
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrnQi32:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.4s }, [x0]
; CHECKBE-NEXT:    ld1 { v1.4s }, [x1]
; CHECKBE-NEXT:    trn1 v2.4s, v0.4s, v1.4s
; CHECKBE-NEXT:    trn2 v0.4s, v0.4s, v1.4s
; CHECKBE-NEXT:    add v0.4s, v2.4s, v0.4s
; CHECKBE-NEXT:    rev64 v0.4s, v0.4s
; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT:    ret
  %tmp1 = load <4 x i32>, ptr %A
  %tmp2 = load <4 x i32>, ptr %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

; 128-bit vector of float: same masks as the <4 x i32> test, combined with fadd.
define <4 x float> @vtrnQf(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrnQf:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr q0, [x0]
; CHECKLE-NEXT:    ldr q1, [x1]
; CHECKLE-NEXT:    trn1 v2.4s, v0.4s, v1.4s
; CHECKLE-NEXT:    trn2 v0.4s, v0.4s, v1.4s
; CHECKLE-NEXT:    fadd v0.4s, v2.4s, v0.4s
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrnQf:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.4s }, [x0]
; CHECKBE-NEXT:    ld1 { v1.4s }, [x1]
; CHECKBE-NEXT:    trn1 v2.4s, v0.4s, v1.4s
; CHECKBE-NEXT:    trn2 v0.4s, v0.4s, v1.4s
; CHECKBE-NEXT:    fadd v0.4s, v2.4s, v0.4s
; CHECKBE-NEXT:    rev64 v0.4s, v0.4s
; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT:    ret
  %tmp1 = load <4 x float>, ptr %A
  %tmp2 = load <4 x float>, ptr %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}

; Undef shuffle indices should not prevent matching to VTRN:

define <8 x i8> @vtrni8_undef(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrni8_undef:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr d0, [x0]
; CHECKLE-NEXT:    ldr d1, [x1]
; CHECKLE-NEXT:    trn1 v2.8b, v0.8b, v1.8b
; CHECKLE-NEXT:    trn2 v0.8b, v0.8b, v1.8b
; CHECKLE-NEXT:    add v0.8b, v2.8b, v0.8b
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrni8_undef:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.8b }, [x0]
; CHECKBE-NEXT:    ld1 { v1.8b }, [x1]
; CHECKBE-NEXT:    trn1 v2.8b, v0.8b, v1.8b
; CHECKBE-NEXT:    trn2 v0.8b, v0.8b, v1.8b
; CHECKBE-NEXT:    add v0.8b, v2.8b, v0.8b
; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
; CHECKBE-NEXT:    ret
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 undef, i32 2, i32 10, i32 undef, i32 12, i32 6, i32 14>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 undef, i32 undef, i32 15>
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <8 x i16> @vtrnQi16_undef(ptr %A, ptr %B) nounwind {
; CHECKLE-LABEL: vtrnQi16_undef:
; CHECKLE:       // %bb.0:
; CHECKLE-NEXT:    ldr q0, [x0]
; CHECKLE-NEXT:    ldr q1, [x1]
; CHECKLE-NEXT:    trn1 v2.8h, v0.8h, v1.8h
; CHECKLE-NEXT:    trn2 v0.8h, v0.8h, v1.8h
; CHECKLE-NEXT:    add v0.8h, v2.8h, v0.8h
; CHECKLE-NEXT:    ret
;
; CHECKBE-LABEL: vtrnQi16_undef:
; CHECKBE:       // %bb.0:
; CHECKBE-NEXT:    ld1 { v0.8h }, [x0]
; CHECKBE-NEXT:    ld1 { v1.8h }, [x1]
; CHECKBE-NEXT:    trn1 v2.8h, v0.8h, v1.8h
; CHECKBE-NEXT:    trn2 v0.8h, v0.8h, v1.8h
; CHECKBE-NEXT:    add v0.8h, v2.8h, v0.8h
; CHECKBE-NEXT:    rev64 v0.8h, v0.8h
; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = load <8 x i16>, ptr %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 6, i32 14>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 undef, i32 3, i32 11, i32 5, i32 13, i32 undef, i32 undef>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}