; Exercises small [4 x i32] allocas in addrspace(5) that are written with
; constant indices and read (or written) with a dynamic index.
;
; The llc invocations compile every kernel for amdgcn (verde, tonga) with the
; promote-alloca feature both disabled and enabled, and for r600 (redwood).
; The opt invocation runs amdgpu-promote-alloca, sroa and instcombine and
; checks that each alloca has been replaced by vector operations or folded to
; a constant.

; RUN: llc -mtriple=amdgcn-- -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn-- -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -mtriple=r600-- -mcpu=redwood < %s | FileCheck --check-prefixes=EG,FUNC %s
; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s

; "A5" makes addrspace(5) the address space used by alloca.
target datalayout = "A5"

; Elements 0..3 are initialised with the constants 0..3 and one element is
; loaded through a dynamic index; after opt this is expected to be a single
; extractelement from a constant vector.

; OPT-LABEL: @vector_read(
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, ptr addrspace(1) %out, align 4

; FUNC-LABEL: {{^}}vector_read:
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOVA_INT
define amdgpu_kernel void @vector_read(ptr addrspace(1) %out, i32 %index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
  store i32 0, ptr addrspace(5) %tmp
  store i32 1, ptr addrspace(5) %y
  store i32 2, ptr addrspace(5) %z
  store i32 3, ptr addrspace(5) %w
  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
  %tmp2 = load i32, ptr addrspace(5) %tmp1
  store i32 %tmp2, ptr addrspace(1) %out
  ret void
}

; The array is zero-initialised, the value 1 is stored through one dynamic
; index and a value is loaded through a second dynamic index; after opt this
; is expected to be an insertelement followed by an extractelement.

; OPT-LABEL: @vector_write(
; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
; OPT: %1 = extractelement <4 x i32> %0, i32 %r_index
; OPT: store i32 %1, ptr addrspace(1) %out, align 4

; FUNC-LABEL: {{^}}vector_write:
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOVA_INT
; EG: MOVA_INT
define amdgpu_kernel void @vector_write(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
  store i32 0, ptr addrspace(5) %tmp
  store i32 0, ptr addrspace(5) %y
  store i32 0, ptr addrspace(5) %z
  store i32 0, ptr addrspace(5) %w
  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %w_index
  store i32 1, ptr addrspace(5) %tmp1
  %tmp2 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %r_index
  %tmp3 = load i32, ptr addrspace(5) %tmp2
  store i32 %tmp3, ptr addrspace(1) %out
  ret void
}

; This test should be optimized to:
; store i32 0, ptr addrspace(1) %out
;
; Every element is set to 0 and element 1 is read back with a constant index,
; so the loaded value is the constant 0. The expected store is an ordinary
; instruction inside the function body, so it is matched with a plain check
; rather than a label check.

; OPT-LABEL: @bitcast_gep(
; OPT: store i32 0, ptr addrspace(1) %out, align 4

; FUNC-LABEL: {{^}}bitcast_gep:
; EG: STORE_RAW
define amdgpu_kernel void @bitcast_gep(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
  store i32 0, ptr addrspace(5) %tmp
  store i32 0, ptr addrspace(5) %y
  store i32 0, ptr addrspace(5) %z
  store i32 0, ptr addrspace(5) %w
  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
  %tmp4 = load i32, ptr addrspace(5) %tmp1
  store i32 %tmp4, ptr addrspace(1) %out
  ret void
}

; Element 0 of the i32 array is written as float 1.0; the expected constant
; vector therefore carries its bit pattern, 1065353216 (0x3F800000), in lane 0.

; OPT-LABEL: @vector_read_bitcast_gep(
; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, ptr addrspace(1) %out, align 4
define amdgpu_kernel void @vector_read_bitcast_gep(ptr addrspace(1) %out, i32 %index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  %y = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
  %z = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
  %w = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
  store float 1.0, ptr addrspace(5) %tmp
  store i32 1, ptr addrspace(5) %y
  store i32 2, ptr addrspace(5) %z
  store i32 3, ptr addrspace(5) %w
  %tmp1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
  %tmp2 = load i32, ptr addrspace(5) %tmp1
  store i32 %tmp2, ptr addrspace(1) %out
  ret void
}

; The alloca is declared as [4 x i32] but every access (GEPs, stores and the
; load) treats it as [4 x float]; the expected result is a <4 x float> vector.

; OPT-LABEL: @vector_read_bitcast_alloca(
; OPT: %0 = extractelement <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, i32 %index
; OPT: store float %0, ptr addrspace(1) %out, align 4
define amdgpu_kernel void @vector_read_bitcast_alloca(ptr addrspace(1) %out, i32 %index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  %y = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 1
  %z = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 2
  %w = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 3
  store float 0.0, ptr addrspace(5) %tmp
  store float 1.0, ptr addrspace(5) %y
  store float 2.0, ptr addrspace(5) %z
  store float 4.0, ptr addrspace(5) %w
  %tmp1 = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 %index
  %tmp2 = load float, ptr addrspace(5) %tmp1
  store float %tmp2, ptr addrspace(1) %out
  ret void
}

; The pointer arguments in local address space should not affect promotion to vector.
; Same body as @vector_read, with an extra, unused addrspace(3) pointer argument.

; OPT-LABEL: @vector_read_with_local_arg(
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, ptr addrspace(1) %out, align 4
define amdgpu_kernel void @vector_read_with_local_arg(ptr addrspace(3) %stopper, ptr addrspace(1) %out, i32 %index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
  store i32 0, ptr addrspace(5) %tmp
  store i32 1, ptr addrspace(5) %y
  store i32 2, ptr addrspace(5) %z
  store i32 3, ptr addrspace(5) %w
  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
  %tmp2 = load i32, ptr addrspace(5) %tmp1
  store i32 %tmp2, ptr addrspace(1) %out
  ret void
}