; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s

; Code generation tests for the MVE multi-vector load/store intrinsics
; (vld2q, vld4q, vst2q, vst4q). Each intrinsic is exercised twice: once
; on its own, and once in a "_post" variant that additionally returns the
; base pointer advanced past the accessed data, where the expected assembly
; uses the writeback addressing form `[r0]!` on the final instruction.

; Aggregate wrappers used as return/argument types by the tests below.
%struct.float16x8x2_t = type { [2 x <8 x half>] }
%struct.uint8x16x4_t = type { [4 x <16 x i8>] }
%struct.uint32x4x2_t = type { [2 x <4 x i32>] }
%struct.int8x16x4_t = type { [4 x <16 x i8>] }

; vld2q on <8 x half>: both results are repacked into float16x8x2_t and
; returned. Expected output is the vld20.16/vld21.16 pair with no writeback.
define arm_aapcs_vfpcc %struct.float16x8x2_t @test_vld2q_f16(ptr %addr) {
; CHECK-LABEL: test_vld2q_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %pair = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr %addr)
  %vec0 = extractvalue { <8 x half>, <8 x half> } %pair, 0
  %agg0 = insertvalue %struct.float16x8x2_t undef, <8 x half> %vec0, 0, 0
  %vec1 = extractvalue { <8 x half>, <8 x half> } %pair, 1
  %agg1 = insertvalue %struct.float16x8x2_t %agg0, <8 x half> %vec1, 0, 1
  ret %struct.float16x8x2_t %agg1
}

; vld2q on <8 x half>, then store result 0 to %dst and return %addr advanced
; by 16 half elements (32 bytes). The expected output has no separate add:
; the final vld21.16 carries the `[r0]!` writeback.
define arm_aapcs_vfpcc ptr @test_vld2q_f16_post(ptr %addr, ptr %dst) {
; CHECK-LABEL: test_vld2q_f16_post:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]!
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %pair = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr %addr)
  %vec0 = extractvalue { <8 x half>, <8 x half> } %pair, 0
  store <8 x half> %vec0, ptr %dst, align 4
  %next = getelementptr half, ptr %addr, i32 16
  ret ptr %next
}

declare { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr)

; vld4q on <16 x i8>: all four results are repacked into uint8x16x4_t and
; returned. Expected output is vld40.8 through vld43.8 with no writeback.
define arm_aapcs_vfpcc %struct.uint8x16x4_t @test_vld4q_u8(ptr %addr) {
; CHECK-LABEL: test_vld4q_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld40.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %quad = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.mve.vld4q.v16i8.p0(ptr %addr)
  %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %quad, 0
  %agg0 = insertvalue %struct.uint8x16x4_t undef, <16 x i8> %vec0, 0, 0
  %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %quad, 1
  %agg1 = insertvalue %struct.uint8x16x4_t %agg0, <16 x i8> %vec1, 0, 1
  %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %quad, 2
  %agg2 = insertvalue %struct.uint8x16x4_t %agg1, <16 x i8> %vec2, 0, 2
  %vec3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %quad, 3
  %agg3 = insertvalue %struct.uint8x16x4_t %agg2, <16 x i8> %vec3, 0, 3
  ret %struct.uint8x16x4_t %agg3
}

; vld4q on <16 x i8>, then store result 0 to %dst and return %addr advanced
; by 64 bytes. The final vld43.8 is expected to carry the `[r0]!` writeback.
define arm_aapcs_vfpcc ptr @test_vld4q_u8_post(ptr %addr, ptr %dst) {
; CHECK-LABEL: test_vld4q_u8_post:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld40.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.8 {q0, q1, q2, q3}, [r0]!
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %quad = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.mve.vld4q.v16i8.p0(ptr %addr)
  %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %quad, 0
  store <16 x i8> %vec0, ptr %dst, align 4
  %next = getelementptr i8, ptr %addr, i32 64
  ret ptr %next
}

declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.mve.vld4q.v16i8.p0(ptr)

; vst2q on <4 x i32>: the intrinsic is called once per value (0, 1) of its
; trailing i32 operand, lining up with vst20.32/vst21.32 in the expected
; output.
define arm_aapcs_vfpcc void @test_vst2q_u32(ptr %addr, %struct.uint32x4x2_t %value.coerce) {
; CHECK-LABEL: test_vst2q_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vst20.32 {q0, q1}, [r0]
; CHECK-NEXT:    vst21.32 {q0, q1}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %v0 = extractvalue %struct.uint32x4x2_t %value.coerce, 0, 0
  %v1 = extractvalue %struct.uint32x4x2_t %value.coerce, 0, 1
  tail call void @llvm.arm.mve.vst2q.p0.v4i32(ptr %addr, <4 x i32> %v0, <4 x i32> %v1, i32 0)
  tail call void @llvm.arm.mve.vst2q.p0.v4i32(ptr %addr, <4 x i32> %v0, <4 x i32> %v1, i32 1)
  ret void
}

; vst2q on <4 x i32>, then return %addr advanced by 8 i32 elements
; (32 bytes). The final vst21.32 is expected to carry the `[r0]!` writeback.
define arm_aapcs_vfpcc ptr @test_vst2q_u32_post(ptr %addr, %struct.uint32x4x2_t %value.coerce) {
; CHECK-LABEL: test_vst2q_u32_post:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vst20.32 {q0, q1}, [r0]
; CHECK-NEXT:    vst21.32 {q0, q1}, [r0]!
; CHECK-NEXT:    bx lr
entry:
  %v0 = extractvalue %struct.uint32x4x2_t %value.coerce, 0, 0
  %v1 = extractvalue %struct.uint32x4x2_t %value.coerce, 0, 1
  tail call void @llvm.arm.mve.vst2q.p0.v4i32(ptr %addr, <4 x i32> %v0, <4 x i32> %v1, i32 0)
  tail call void @llvm.arm.mve.vst2q.p0.v4i32(ptr %addr, <4 x i32> %v0, <4 x i32> %v1, i32 1)
  %next = getelementptr i32, ptr %addr, i32 8
  ret ptr %next
}

declare void @llvm.arm.mve.vst2q.p0.v4i32(ptr, <4 x i32>, <4 x i32>, i32)

; vst2q on <8 x half>. Unlike the other store tests, these calls are plain
; `call`s rather than `tail call`s.
define arm_aapcs_vfpcc void @test_vst2q_f16(ptr %addr, %struct.float16x8x2_t %value.coerce) {
; CHECK-LABEL: test_vst2q_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vst20.16 {q0, q1}, [r0]
; CHECK-NEXT:    vst21.16 {q0, q1}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %v0 = extractvalue %struct.float16x8x2_t %value.coerce, 0, 0
  %v1 = extractvalue %struct.float16x8x2_t %value.coerce, 0, 1
  call void @llvm.arm.mve.vst2q.p0.v8f16(ptr %addr, <8 x half> %v0, <8 x half> %v1, i32 0)
  call void @llvm.arm.mve.vst2q.p0.v8f16(ptr %addr, <8 x half> %v0, <8 x half> %v1, i32 1)
  ret void
}

; vst2q on <8 x half>, then return %addr advanced by 16 half elements
; (32 bytes). The final vst21.16 is expected to carry the `[r0]!` writeback.
define arm_aapcs_vfpcc ptr @test_vst2q_f16_post(ptr %addr, %struct.float16x8x2_t %value.coerce) {
; CHECK-LABEL: test_vst2q_f16_post:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vst20.16 {q0, q1}, [r0]
; CHECK-NEXT:    vst21.16 {q0, q1}, [r0]!
; CHECK-NEXT:    bx lr
entry:
  %v0 = extractvalue %struct.float16x8x2_t %value.coerce, 0, 0
  %v1 = extractvalue %struct.float16x8x2_t %value.coerce, 0, 1
  call void @llvm.arm.mve.vst2q.p0.v8f16(ptr %addr, <8 x half> %v0, <8 x half> %v1, i32 0)
  call void @llvm.arm.mve.vst2q.p0.v8f16(ptr %addr, <8 x half> %v0, <8 x half> %v1, i32 1)
  %next = getelementptr half, ptr %addr, i32 16
  ret ptr %next
}

declare void @llvm.arm.mve.vst2q.p0.v8f16(ptr, <8 x half>, <8 x half>, i32)

; vst4q on <16 x i8>: the intrinsic is called once per value (0..3) of its
; trailing i32 operand, lining up with vst40.8 through vst43.8 in the
; expected output.
define arm_aapcs_vfpcc void @test_vst4q_s8(ptr %addr, %struct.int8x16x4_t %value.coerce) {
; CHECK-LABEL: test_vst4q_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vst40.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vst41.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vst42.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vst43.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %v0 = extractvalue %struct.int8x16x4_t %value.coerce, 0, 0
  %v1 = extractvalue %struct.int8x16x4_t %value.coerce, 0, 1
  %v2 = extractvalue %struct.int8x16x4_t %value.coerce, 0, 2
  %v3 = extractvalue %struct.int8x16x4_t %value.coerce, 0, 3
  tail call void @llvm.arm.mve.vst4q.p0.v16i8(ptr %addr, <16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, i32 0)
  tail call void @llvm.arm.mve.vst4q.p0.v16i8(ptr %addr, <16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, i32 1)
  tail call void @llvm.arm.mve.vst4q.p0.v16i8(ptr %addr, <16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, i32 2)
  tail call void @llvm.arm.mve.vst4q.p0.v16i8(ptr %addr, <16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, i32 3)
  ret void
}

; vst4q on <16 x i8>, then return %addr advanced by 64 bytes. The final
; vst43.8 is expected to carry the `[r0]!` writeback.
define arm_aapcs_vfpcc ptr @test_vst4q_s8_post(ptr %addr, %struct.int8x16x4_t %value.coerce) {
; CHECK-LABEL: test_vst4q_s8_post:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vst40.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vst41.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vst42.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vst43.8 {q0, q1, q2, q3}, [r0]!
; CHECK-NEXT:    bx lr
entry:
  %v0 = extractvalue %struct.int8x16x4_t %value.coerce, 0, 0
  %v1 = extractvalue %struct.int8x16x4_t %value.coerce, 0, 1
  %v2 = extractvalue %struct.int8x16x4_t %value.coerce, 0, 2
  %v3 = extractvalue %struct.int8x16x4_t %value.coerce, 0, 3
  tail call void @llvm.arm.mve.vst4q.p0.v16i8(ptr %addr, <16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, i32 0)
  tail call void @llvm.arm.mve.vst4q.p0.v16i8(ptr %addr, <16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, i32 1)
  tail call void @llvm.arm.mve.vst4q.p0.v16i8(ptr %addr, <16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, i32 2)
  tail call void @llvm.arm.mve.vst4q.p0.v16i8(ptr %addr, <16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, i32 3)
  %next = getelementptr i8, ptr %addr, i32 64
  ret ptr %next
}

declare void @llvm.arm.mve.vst4q.p0.v16i8(ptr, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)