; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64 | FileCheck %s

define <4 x i16> @mlai16_trunc(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
; CHECK-LABEL: mlai16_trunc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v0.4s, v1.4h, v0.4h
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v2.4h
; CHECK-NEXT:    xtn v0.4h, v0.4s
; CHECK-NEXT:    ret
entry:
  %v0 = sext <4 x i16> %vec0 to <4 x i32>
  %v1 = sext <4 x i16> %vec1 to <4 x i32>
  %v2 = sext <4 x i16> %vec2 to <4 x i32>
  %v3 = mul <4 x i32> %v1, %v0
  %v4 = add <4 x i32> %v3, %v2
  %v5 = trunc <4 x i32> %v4 to <4 x i16>
  ret <4 x i16> %v5
}

define <4 x i32> @mlai16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
; CHECK-LABEL: mlai16_and:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v0.4s, v1.4h, v0.4h
; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v2.4h
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %v0 = sext <4 x i16> %vec0 to <4 x i32>
  %v1 = sext <4 x i16> %vec1 to <4 x i32>
  %v2 = sext <4 x i16> %vec2 to <4 x i32>
  %v3 = mul <4 x i32> %v1, %v0
  %v4 = add <4 x i32> %v3, %v2
  %v5 = and <4 x i32> %v4, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %v5
}

define void @mlai16_loadstore(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: mlai16_loadstore:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr d0, [x0, #16]
; CHECK-NEXT:    ldr d1, [x1, #16]
; CHECK-NEXT:    smull v0.4s, v1.4h, v0.4h
; CHECK-NEXT:    ldr d1, [x2, #16]
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    xtn v0.4h, v0.4s
; CHECK-NEXT:    str d0, [x0, #16]
; CHECK-NEXT:    ret
entry:
  %scevgep0 = getelementptr i16, ptr %a, i32 8
  %vec0 = load <4 x i16>, ptr %scevgep0, align 8
  %v0 = sext <4 x i16> %vec0 to <4 x i32>
  %scevgep1 = getelementptr i16, ptr %b, i32 8
  %vec1 = load <4 x i16>, ptr %scevgep1, align 8
  %v1 = sext <4 x i16> %vec1 to <4 x i32>
  %scevgep2 = getelementptr i16, ptr %c, i32 8
  %vec2 = load <4 x i16>, ptr %scevgep2, align 8
  %v2 = sext <4 x i16> %vec2 to <4 x i32>
  %v3 = mul <4 x i32> %v1, %v0
  %v4 = add <4 x i32> %v3, %v2
  %v5 = trunc <4 x i32> %v4 to <4 x i16>
  %scevgep3 = getelementptr i16, ptr %a, i32 8
  store <4 x i16> %v5, ptr %scevgep3, align 8
  ret void
}

define <4 x i16> @addmuli16_trunc(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
; CHECK-LABEL: addmuli16_trunc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v1.4s, v1.4h, v2.4h
; CHECK-NEXT:    smlal v1.4s, v0.4h, v2.4h
; CHECK-NEXT:    xtn v0.4h, v1.4s
; CHECK-NEXT:    ret
entry:
  %v0 = sext <4 x i16> %vec0 to <4 x i32>
  %v1 = sext <4 x i16> %vec1 to <4 x i32>
  %v2 = sext <4 x i16> %vec2 to <4 x i32>
  %v3 = add <4 x i32> %v1, %v0
  %v4 = mul <4 x i32> %v3, %v2
  %v5 = trunc <4 x i32> %v4 to <4 x i16>
  ret <4 x i16> %v5
}

define <4 x i32> @addmuli16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
; CHECK-LABEL: addmuli16_and:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v1.4s, v1.4h, v2.4h
; CHECK-NEXT:    smlal v1.4s, v0.4h, v2.4h
; CHECK-NEXT:    movi v0.2d, #0x00ffff0000ffff
; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
; CHECK-NEXT:    ret
entry:
  %v0 = sext <4 x i16> %vec0 to <4 x i32>
  %v1 = sext <4 x i16> %vec1 to <4 x i32>
  %v2 = sext <4 x i16> %vec2 to <4 x i32>
  %v3 = add <4 x i32> %v1, %v0
  %v4 = mul <4 x i32> %v3, %v2
  %v5 = and <4 x i32> %v4, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %v5
}

define void @addmuli16_loadstore(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: addmuli16_loadstore:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr d0, [x1, #16]
; CHECK-NEXT:    ldr d1, [x2, #16]
; CHECK-NEXT:    ldr d2, [x0, #16]
; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    smlal v0.4s, v2.4h, v1.4h
; CHECK-NEXT:    xtn v0.4h, v0.4s
; CHECK-NEXT:    str d0, [x0, #16]
; CHECK-NEXT:    ret
entry:
  %scevgep0 = getelementptr i16, ptr %a, i32 8
  %vec0 = load <4 x i16>, ptr %scevgep0, align 8
  %v0 = sext <4 x i16> %vec0 to <4 x i32>
  %scevgep1 = getelementptr i16, ptr %b, i32 8
  %vec1 = load <4 x i16>, ptr %scevgep1, align 8
  %v1 = sext <4 x i16> %vec1 to <4 x i32>
  %scevgep2 = getelementptr i16, ptr %c, i32 8
  %vec2 = load <4 x i16>, ptr %scevgep2, align 8
  %v2 = sext <4 x i16> %vec2 to <4 x i32>
  %v3 = add <4 x i32> %v1, %v0
  %v4 = mul <4 x i32> %v3, %v2
  %v5 = trunc <4 x i32> %v4 to <4 x i16>
  %scevgep3 = getelementptr i16, ptr %a, i32 8
  store <4 x i16> %v5, ptr %scevgep3, align 8
  ret void
}

define <2 x i32> @mlai32_trunc(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
; CHECK-LABEL: mlai32_trunc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v0.2d, v1.2s, v0.2s
; CHECK-NEXT:    uaddw v0.2d, v0.2d, v2.2s
; CHECK-NEXT:    xtn v0.2s, v0.2d
; CHECK-NEXT:    ret
entry:
  %v0 = sext <2 x i32> %vec0 to <2 x i64>
  %v1 = sext <2 x i32> %vec1 to <2 x i64>
  %v2 = sext <2 x i32> %vec2 to <2 x i64>
  %v3 = mul <2 x i64> %v1, %v0
  %v4 = add <2 x i64> %v3, %v2
  %v5 = trunc <2 x i64> %v4 to <2 x i32>
  ret <2 x i32> %v5
}

define <2 x i64> @mlai32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
; CHECK-LABEL: mlai32_and:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v0.2d, v1.2s, v0.2s
; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEXT:    uaddw v0.2d, v0.2d, v2.2s
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %v0 = sext <2 x i32> %vec0 to <2 x i64>
  %v1 = sext <2 x i32> %vec1 to <2 x i64>
  %v2 = sext <2 x i32> %vec2 to <2 x i64>
  %v3 = mul <2 x i64> %v1, %v0
  %v4 = add <2 x i64> %v3, %v2
  %v5 = and <2 x i64> %v4, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %v5
}

define void @mlai32_loadstore(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: mlai32_loadstore:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr d0, [x0, #32]
; CHECK-NEXT:    ldr d1, [x1, #32]
; CHECK-NEXT:    smull v0.2d, v1.2s, v0.2s
; CHECK-NEXT:    ldr d1, [x2, #32]
; CHECK-NEXT:    uaddw v0.2d, v0.2d, v1.2s
; CHECK-NEXT:    xtn v0.2s, v0.2d
; CHECK-NEXT:    str d0, [x0, #32]
; CHECK-NEXT:    ret
entry:
  %scevgep0 = getelementptr i32, ptr %a, i32 8
  %vec0 = load <2 x i32>, ptr %scevgep0, align 8
  %v0 = sext <2 x i32> %vec0 to <2 x i64>
  %scevgep1 = getelementptr i32, ptr %b, i32 8
  %vec1 = load <2 x i32>, ptr %scevgep1, align 8
  %v1 = sext <2 x i32> %vec1 to <2 x i64>
  %scevgep2 = getelementptr i32, ptr %c, i32 8
  %vec2 = load <2 x i32>, ptr %scevgep2, align 8
  %v2 = sext <2 x i32> %vec2 to <2 x i64>
  %v3 = mul <2 x i64> %v1, %v0
  %v4 = add <2 x i64> %v3, %v2
  %v5 = trunc <2 x i64> %v4 to <2 x i32>
  %scevgep3 = getelementptr i32, ptr %a, i32 8
  store <2 x i32> %v5, ptr %scevgep3, align 8
  ret void
}

define <2 x i32> @addmuli32_trunc(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
; CHECK-LABEL: addmuli32_trunc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v1.2d, v1.2s, v2.2s
; CHECK-NEXT:    smlal v1.2d, v0.2s, v2.2s
; CHECK-NEXT:    xtn v0.2s, v1.2d
; CHECK-NEXT:    ret
entry:
  %v0 = sext <2 x i32> %vec0 to <2 x i64>
  %v1 = sext <2 x i32> %vec1 to <2 x i64>
  %v2 = sext <2 x i32> %vec2 to <2 x i64>
  %v3 = add <2 x i64> %v1, %v0
  %v4 = mul <2 x i64> %v3, %v2
  %v5 = trunc <2 x i64> %v4 to <2 x i32>
  ret <2 x i32> %v5
}

define <2 x i64> @addmuli32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
; CHECK-LABEL: addmuli32_and:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v1.2d, v1.2s, v2.2s
; CHECK-NEXT:    smlal v1.2d, v0.2s, v2.2s
; CHECK-NEXT:    movi v0.2d, #0x000000ffffffff
; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
; CHECK-NEXT:    ret
entry:
  %v0 = sext <2 x i32> %vec0 to <2 x i64>
  %v1 = sext <2 x i32> %vec1 to <2 x i64>
  %v2 = sext <2 x i32> %vec2 to <2 x i64>
  %v3 = add <2 x i64> %v1, %v0
  %v4 = mul <2 x i64> %v3, %v2
  %v5 = and <2 x i64> %v4, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %v5
}

define void @addmuli32_loadstore(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: addmuli32_loadstore:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr d0, [x1, #32]
; CHECK-NEXT:    ldr d1, [x2, #32]
; CHECK-NEXT:    ldr d2, [x0, #32]
; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    smlal v0.2d, v2.2s, v1.2s
; CHECK-NEXT:    xtn v0.2s, v0.2d
; CHECK-NEXT:    str d0, [x0, #32]
; CHECK-NEXT:    ret
entry:
  %scevgep0 = getelementptr i32, ptr %a, i32 8
  %vec0 = load <2 x i32>, ptr %scevgep0, align 8
  %v0 = sext <2 x i32> %vec0 to <2 x i64>
  %scevgep1 = getelementptr i32, ptr %b, i32 8
  %vec1 = load <2 x i32>, ptr %scevgep1, align 8
  %v1 = sext <2 x i32> %vec1 to <2 x i64>
  %scevgep2 = getelementptr i32, ptr %c, i32 8
  %vec2 = load <2 x i32>, ptr %scevgep2, align 8
  %v2 = sext <2 x i32> %vec2 to <2 x i64>
  %v3 = add <2 x i64> %v1, %v0
  %v4 = mul <2 x i64> %v3, %v2
  %v5 = trunc <2 x i64> %v4 to <2 x i32>
  %scevgep3 = getelementptr i32, ptr %a, i32 8
  store <2 x i32> %v5, ptr %scevgep3, align 8
  ret void
}

define void @func1(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: func1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr d0, [x2, #16]
; CHECK-NEXT:    ldr d1, [x1, #16]
; CHECK-NEXT:    ldr d2, [x0, #16]
; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    xtn v1.4h, v0.4s
; CHECK-NEXT:    str d1, [x0, #16]
; CHECK-NEXT:    ldr d1, [x2, #16]
; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-NEXT:    mul v0.4s, v1.4s, v0.4s
; CHECK-NEXT:    xtn v1.4h, v0.4s
; CHECK-NEXT:    str d1, [x1, #16]
; CHECK-NEXT:    ldr d1, [x2, #16]
; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    xtn v0.4h, v0.4s
; CHECK-NEXT:    str d0, [x0, #16]
; CHECK-NEXT:    ret
entry:
; This test case tries to vectorize the pseudo code below:
;   a[i] = b[i] + c[i];
;   b[i] = a[i] * c[i];
;   a[i] = b[i] + a[i] * c[i];
; Check that the vector load of a[i] for "a[i] = b[i] + a[i] * c[i]" is
; scheduled before the first vector store to "a[i] = b[i] + c[i]".
; Check that no vector load of a[i] is scheduled between the two vector
; stores to a[i]; otherwise the load of a[i] would be polluted by the
; first vector store to a[i].
; This checks that the chain information is updated during lowerMUL for
; the newly created load SDNode.
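;
; To make the dependence explicit, a hypothetical scalar C sketch of the
; same computation (a0 is an illustrative temporary; the point is that
; a[i] is read once, before the first store to a[i]):
;   short a0 = a[i];
;   a[i] = b[i] + c[i];
;   b[i] = a[i] * c[i];
;   a[i] = b[i] + a0 * c[i];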

  %scevgep0 = getelementptr i16, ptr %a, i32 8
  %vec0 = load <4 x i16>, ptr %scevgep0, align 8
  %scevgep1 = getelementptr i16, ptr %b, i32 8
  %vec1 = load <4 x i16>, ptr %scevgep1, align 8
  %0 = zext <4 x i16> %vec1 to <4 x i32>
  %scevgep2 = getelementptr i16, ptr %c, i32 8
  %vec2 = load <4 x i16>, ptr %scevgep2, align 8
  %1 = sext <4 x i16> %vec2 to <4 x i32>
  %vec3 = add <4 x i32> %1, %0
  %2 = trunc <4 x i32> %vec3 to <4 x i16>
  %scevgep3 = getelementptr i16, ptr %a, i32 8
  store <4 x i16> %2, ptr %scevgep3, align 8
  %vec4 = load <4 x i16>, ptr %scevgep2, align 8
  %3 = sext <4 x i16> %vec4 to <4 x i32>
  %vec5 = mul <4 x i32> %3, %vec3
  %4 = trunc <4 x i32> %vec5 to <4 x i16>
  store <4 x i16> %4, ptr %scevgep1, align 8
  %5 = sext <4 x i16> %vec0 to <4 x i32>
  %vec6 = load <4 x i16>, ptr %scevgep2, align 8
  %6 = sext <4 x i16> %vec6 to <4 x i32>
  %vec7 = mul <4 x i32> %6, %5
  %vec8 = add <4 x i32> %vec7, %vec5
  %7 = trunc <4 x i32> %vec8 to <4 x i16>
  store <4 x i16> %7, ptr %scevgep3, align 8
  ret void
}

define void @func2(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: func2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr d0, [x2, #16]
; CHECK-NEXT:    ldr d1, [x1, #16]
; CHECK-NEXT:    ldr d2, [x0, #16]
; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    xtn v1.4h, v0.4s
; CHECK-NEXT:    str d1, [x0, #16]
; CHECK-NEXT:    ldr d1, [x2, #16]
; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-NEXT:    mul v0.4s, v1.4s, v0.4s
; CHECK-NEXT:    xtn v1.4h, v0.4s
; CHECK-NEXT:    str d1, [x1, #16]
; CHECK-NEXT:    ldr d1, [x2, #16]
; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v2.4h
; CHECK-NEXT:    xtn v0.4h, v0.4s
; CHECK-NEXT:    str d0, [x0, #16]
; CHECK-NEXT:    ret
entry:
; This test case tries to vectorize the pseudo code below:
;   a[i] = b[i] + c[i];
;   b[i] = a[i] * c[i];
;   a[i] = b[i] + a[i] * c[i] + a[i];
; Check that the vector load of a[i] for "a[i] = b[i] + a[i] * c[i] + a[i]"
; is scheduled before the first vector store to "a[i] = b[i] + c[i]".
; Check that no vector load of a[i] is scheduled between the first vector
; store to a[i] and the vector add of a[i]; otherwise the load of a[i]
; would be polluted by the first vector store to a[i].
; This checks that both the chain and the value of the newly created load
; SDNode are updated during lowerMUL.
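;
; As above, a hypothetical scalar C sketch (a0 is an illustrative
; temporary; the pre-store value of a[i] feeds both the multiply and the
; final add):
;   short a0 = a[i];
;   a[i] = b[i] + c[i];
;   b[i] = a[i] * c[i];
;   a[i] = b[i] + a0 * c[i] + a0;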

  %scevgep0 = getelementptr i16, ptr %a, i32 8
  %vec0 = load <4 x i16>, ptr %scevgep0, align 8
  %scevgep1 = getelementptr i16, ptr %b, i32 8
  %vec1 = load <4 x i16>, ptr %scevgep1, align 8
  %0 = zext <4 x i16> %vec1 to <4 x i32>
  %scevgep2 = getelementptr i16, ptr %c, i32 8
  %vec2 = load <4 x i16>, ptr %scevgep2, align 8
  %1 = sext <4 x i16> %vec2 to <4 x i32>
  %vec3 = add <4 x i32> %1, %0
  %2 = trunc <4 x i32> %vec3 to <4 x i16>
  %scevgep3 = getelementptr i16, ptr %a, i32 8
  store <4 x i16> %2, ptr %scevgep3, align 8
  %vec4 = load <4 x i16>, ptr %scevgep2, align 8
  %3 = sext <4 x i16> %vec4 to <4 x i32>
  %vec5 = mul <4 x i32> %3, %vec3
  %4 = trunc <4 x i32> %vec5 to <4 x i16>
  store <4 x i16> %4, ptr %scevgep1, align 8
  %5 = sext <4 x i16> %vec0 to <4 x i32>
  %vec6 = load <4 x i16>, ptr %scevgep2, align 8
  %6 = sext <4 x i16> %vec6 to <4 x i32>
  %vec7 = mul <4 x i32> %6, %5
  %vec8 = add <4 x i32> %vec7, %vec5
  %vec9 = add <4 x i32> %vec8, %5
  %7 = trunc <4 x i32> %vec9 to <4 x i16>
  store <4 x i16> %7, ptr %scevgep3, align 8
  ret void
}