; xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll (revision 9e9907f1cfa424366fba58d9520f9305b537cec9)
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s

; <2 x i32> access through an addrspace(3) (LDS) pointer with only 4-byte
; alignment. The kernel loads two dwords, swaps them and stores them back.
; Every RUN configuration expects a paired 32-bit DS load and a paired 32-bit
; DS store.
; GCN-LABEL: test_local_misaligned_v2:
; GCN-DAG: ds_{{read2|load_2addr}}_b32
; GCN-DAG: ds_{{write2|store_2addr}}_b32
define amdgpu_kernel void @test_local_misaligned_v2(ptr addrspace(3) %arg) {
bb:
  ; Index the LDS pointer by the workitem id so the address is not a constant.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
  %load = load <2 x i32>, ptr addrspace(3) %gep, align 4
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  ; Seed with poison rather than the deprecated undef; both lanes are
  ; overwritten below, so the stored value is fully defined.
  %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, ptr addrspace(3) %gep, align 4
  ret void
}

; <4 x i32> access through an addrspace(3) (LDS) pointer with only 4-byte
; alignment. The kernel loads four dwords, reverses their order and stores
; them back. Runs using the ALIGNED prefix expect two paired 32-bit loads and
; two paired 32-bit stores; runs using the UNALIGNED prefix expect a single
; paired 64-bit load and a single paired 64-bit store.
; GCN-LABEL: test_local_misaligned_v4:
; ALIGNED-DAG: ds_{{read2|load_2addr}}_b32
; ALIGNED-DAG: ds_{{read2|load_2addr}}_b32
; ALIGNED-DAG: ds_{{write2|store_2addr}}_b32
; ALIGNED-DAG: ds_{{write2|store_2addr}}_b32
; UNALIGNED-DAG: ds_{{read2|load_2addr}}_b64
; UNALIGNED-DAG: ds_{{write2|store_2addr}}_b64
define amdgpu_kernel void @test_local_misaligned_v4(ptr addrspace(3) %arg) {
bb:
  ; Index the LDS pointer by the workitem id so the address is not a constant.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
  %load = load <4 x i32>, ptr addrspace(3) %gep, align 4
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  ; Seed with poison rather than the deprecated undef; all four lanes are
  ; overwritten below, so the stored value is fully defined.
  %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, ptr addrspace(3) %gep, align 4
  ret void
}

; <3 x i32> access through an addrspace(3) (LDS) pointer with only 4-byte
; alignment. The kernel loads three dwords, rotates them (old lane 2 -> 0,
; 0 -> 1, 1 -> 2) and stores them back. Runs using the ALIGNED prefix expect
; one paired 32-bit access plus one single 32-bit access in each direction;
; runs using the UNALIGNED prefix expect one 96-bit load and one 96-bit store.
; GCN-LABEL: test_local_misaligned_v3:
; ALIGNED-DAG: ds_{{read2|load_2addr}}_b32
; ALIGNED-DAG: ds_{{read|load}}_b32
; ALIGNED-DAG: ds_{{write2|store_2addr}}_b32
; ALIGNED-DAG: ds_{{write|store}}_b32
; UNALIGNED-DAG: ds_{{read|load}}_b96
; UNALIGNED-DAG: ds_{{write|store}}_b96
define amdgpu_kernel void @test_local_misaligned_v3(ptr addrspace(3) %arg) {
bb:
  ; Index the LDS pointer by the workitem id so the address is not a constant.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
  %load = load <3 x i32>, ptr addrspace(3) %gep, align 4
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  ; Seed with poison rather than the deprecated undef; all three lanes are
  ; overwritten below, so the stored value is fully defined.
  %v5 = insertelement <3 x i32> poison, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, ptr addrspace(3) %gep, align 4
  ret void
}

; <2 x i32> access through an addrspace(3) (LDS) pointer with 8-byte
; alignment. The kernel loads two dwords, swaps them and stores them back.
; Every RUN configuration expects a single 64-bit DS load and a single 64-bit
; DS store.
; GCN-LABEL: test_local_aligned_v2:
; GCN-DAG: ds_{{read|load}}_b64
; GCN-DAG: ds_{{write|store}}_b64
define amdgpu_kernel void @test_local_aligned_v2(ptr addrspace(3) %arg) {
bb:
  ; Index the LDS pointer by the workitem id so the address is not a constant.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
  %load = load <2 x i32>, ptr addrspace(3) %gep, align 8
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  ; Seed with poison rather than the deprecated undef; both lanes are
  ; overwritten below, so the stored value is fully defined.
  %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, ptr addrspace(3) %gep, align 8
  ret void
}

; <3 x i32> access through an addrspace(3) (LDS) pointer with 16-byte
; alignment. The kernel loads three dwords, rotates them (old lane 2 -> 0,
; 0 -> 1, 1 -> 2) and stores them back. Every RUN configuration expects a
; single 96-bit DS load and a single 96-bit DS store.
; GCN-LABEL: test_local_aligned_v3:
; GCN-DAG: ds_{{read|load}}_b96
; GCN-DAG: ds_{{write|store}}_b96
define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) {
bb:
  ; Index the LDS pointer by the workitem id so the address is not a constant.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
  %load = load <3 x i32>, ptr addrspace(3) %gep, align 16
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  ; Seed with poison rather than the deprecated undef; all three lanes are
  ; overwritten below, so the stored value is fully defined.
  %v5 = insertelement <3 x i32> poison, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, ptr addrspace(3) %gep, align 16
  ret void
}

; <4 x i32> access through an addrspace(3) (LDS) pointer with 8-byte
; alignment. The kernel loads four dwords, reverses their order and stores
; them back. Runs using the ALIGNED-WGP prefix expect two paired 32-bit loads
; and two paired 32-bit stores; runs using the ALIGNED-CU or UNALIGNED prefix
; expect a single paired 64-bit load and a single paired 64-bit store.
; GCN-LABEL: test_local_v4_aligned8:
; ALIGNED-WGP-DAG: ds_{{read2|load_2addr}}_b32
; ALIGNED-WGP-DAG: ds_{{read2|load_2addr}}_b32
; ALIGNED-WGP-DAG: ds_{{write2|store_2addr}}_b32
; ALIGNED-WGP-DAG: ds_{{write2|store_2addr}}_b32
; ALIGNED-CU-DAG: ds_{{read2|load_2addr}}_b64
; ALIGNED-CU-DAG: ds_{{write2|store_2addr}}_b64
; UNALIGNED-DAG: ds_{{read2|load_2addr}}_b64
; UNALIGNED-DAG: ds_{{write2|store_2addr}}_b64
define amdgpu_kernel void @test_local_v4_aligned8(ptr addrspace(3) %arg) {
bb:
  ; Index the LDS pointer by the workitem id so the address is not a constant.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
  %load = load <4 x i32>, ptr addrspace(3) %gep, align 8
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  ; Seed with poison rather than the deprecated undef; all four lanes are
  ; overwritten below, so the stored value is fully defined.
  %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, ptr addrspace(3) %gep, align 8
  ret void
}

; AMDGPU intrinsic called by each kernel in this file; its i32 result is used
; as the element index into the addrspace(3) pointer argument.
declare i32 @llvm.amdgcn.workitem.id.x()
