; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -o - %s | FileCheck -check-prefix=GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefix=GFX10 %s

; GlobalISel code generation test for the llvm.bswap intrinsic on AMDGPU.
;
; Each function is compiled for hawaii, fiji, gfx900, gfx1010 and gfx1100.
; The gfx1010 and gfx1100 runs share one check prefix, so both targets must
; produce identical output for every function in this file.
;
; Naming convention used below:
;   s_*  amdgpu_ps functions taking the source as an inreg argument; the
;        checks show the value arriving in (and returning through) s-registers.
;   v_*  default calling convention functions; the checks show the value
;        arriving in (and returning through) v-registers.
;
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py after any change to the IR.

; i32 byte swap, inreg source.
define amdgpu_ps i32 @s_bswap_i32(i32 inreg %src) {
; GFX7-LABEL: s_bswap_i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_alignbit_b32 v0, s0, s0, 8
; GFX7-NEXT:    v_alignbit_b32 v1, s0, s0, 24
; GFX7-NEXT:    s_mov_b32 s0, 0xff00ff
; GFX7-NEXT:    v_bfi_b32 v0, s0, v1, v0
; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_bswap_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    s_mov_b32 s0, 0x10203
; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s0
; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_bswap_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    s_mov_b32 s0, 0x10203
; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s0
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_bswap_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_perm_b32 v0, 0, s0, 0x10203
; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
; GFX10-NEXT:    ; return to shader part epilog
  %bswap = call i32 @llvm.bswap.i32(i32 %src)
  ret i32 %bswap
}

; i32 byte swap, v-register source.
define i32 @v_bswap_i32(i32 %src) {
; GFX7-LABEL: v_bswap_i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_alignbit_b32 v1, v0, v0, 8
; GFX7-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; GFX7-NEXT:    s_mov_b32 s4, 0xff00ff
; GFX7-NEXT:    v_bfi_b32 v0, s4, v0, v1
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bswap_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_mov_b32 s4, 0x10203
; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bswap_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, 0x10203
; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bswap_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0x10203
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call i32 @llvm.bswap.i32(i32 %src)
  ret i32 %bswap
}

; <2 x i32> byte swap, inreg source: each element is swapped independently.
define amdgpu_ps <2 x i32> @s_bswap_v2i32(<2 x i32> inreg %src) {
; GFX7-LABEL: s_bswap_v2i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_alignbit_b32 v0, s0, s0, 8
; GFX7-NEXT:    v_alignbit_b32 v1, s0, s0, 24
; GFX7-NEXT:    s_mov_b32 s0, 0xff00ff
; GFX7-NEXT:    v_bfi_b32 v0, s0, v1, v0
; GFX7-NEXT:    v_alignbit_b32 v1, s1, s1, 8
; GFX7-NEXT:    v_alignbit_b32 v2, s1, s1, 24
; GFX7-NEXT:    v_bfi_b32 v1, s0, v2, v1
; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_bswap_v2i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    s_mov_b32 s0, 0x10203
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s0
; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s0
; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_bswap_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    s_mov_b32 s0, 0x10203
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s0
; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s0
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_bswap_v2i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_perm_b32 v0, 0, s0, 0x10203
; GFX10-NEXT:    v_perm_b32 v1, 0, s1, 0x10203
; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
; GFX10-NEXT:    ; return to shader part epilog
  %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src)
  ret <2 x i32> %bswap
}

; <2 x i32> byte swap, v-register source.
define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) {
; GFX7-LABEL: v_bswap_v2i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_alignbit_b32 v2, v0, v0, 8
; GFX7-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; GFX7-NEXT:    s_mov_b32 s4, 0xff00ff
; GFX7-NEXT:    v_bfi_b32 v0, s4, v0, v2
; GFX7-NEXT:    v_alignbit_b32 v2, v1, v1, 8
; GFX7-NEXT:    v_alignbit_b32 v1, v1, v1, 24
; GFX7-NEXT:    v_bfi_b32 v1, s4, v1, v2
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bswap_v2i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_mov_b32 s4, 0x10203
; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s4
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bswap_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, 0x10203
; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bswap_v2i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0x10203
; GFX10-NEXT:    v_perm_b32 v1, 0, v1, 0x10203
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src)
  ret <2 x i32> %bswap
}

; i64 byte swap, inreg source: the two 32-bit halves are byte-swapped and
; exchanged (result s0 comes from input s1 and vice versa).
define amdgpu_ps i64 @s_bswap_i64(i64 inreg %src) {
; GFX7-LABEL: s_bswap_i64:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_alignbit_b32 v0, s1, s1, 8
; GFX7-NEXT:    v_alignbit_b32 v1, s1, s1, 24
; GFX7-NEXT:    s_mov_b32 s1, 0xff00ff
; GFX7-NEXT:    v_bfi_b32 v0, s1, v1, v0
; GFX7-NEXT:    v_alignbit_b32 v1, s0, s0, 8
; GFX7-NEXT:    v_alignbit_b32 v2, s0, s0, 24
; GFX7-NEXT:    v_bfi_b32 v1, s1, v2, v1
; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_bswap_i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, s1
; GFX8-NEXT:    s_mov_b32 s1, 0x10203
; GFX8-NEXT:    v_mov_b32_e32 v1, s0
; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s1
; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s1
; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_bswap_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    s_mov_b32 s1, 0x10203
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s1
; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s1
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_bswap_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_perm_b32 v0, 0, s1, 0x10203
; GFX10-NEXT:    v_perm_b32 v1, 0, s0, 0x10203
; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
; GFX10-NEXT:    ; return to shader part epilog
  %bswap = call i64 @llvm.bswap.i64(i64 %src)
  ret i64 %bswap
}

; i64 byte swap, v-register source.
define i64 @v_bswap_i64(i64 %src) {
; GFX7-LABEL: v_bswap_i64:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_alignbit_b32 v2, v1, v1, 8
; GFX7-NEXT:    v_alignbit_b32 v1, v1, v1, 24
; GFX7-NEXT:    s_mov_b32 s4, 0xff00ff
; GFX7-NEXT:    v_bfi_b32 v2, s4, v1, v2
; GFX7-NEXT:    v_alignbit_b32 v1, v0, v0, 8
; GFX7-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; GFX7-NEXT:    v_bfi_b32 v1, s4, v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v0, v2
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bswap_i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_mov_b32 s4, 0x10203
; GFX8-NEXT:    v_perm_b32 v2, 0, v1, s4
; GFX8-NEXT:    v_perm_b32 v1, 0, v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v0, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bswap_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, 0x10203
; GFX9-NEXT:    v_perm_b32 v2, 0, v1, s4
; GFX9-NEXT:    v_perm_b32 v1, 0, v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bswap_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v2, 0, v1, 0x10203
; GFX10-NEXT:    v_perm_b32 v1, 0, v0, 0x10203
; GFX10-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call i64 @llvm.bswap.i64(i64 %src)
  ret i64 %bswap
}

; <2 x i64> byte swap, inreg source.
define amdgpu_ps <2 x i64> @s_bswap_v2i64(<2 x i64> inreg %src) {
; GFX7-LABEL: s_bswap_v2i64:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_alignbit_b32 v0, s1, s1, 8
; GFX7-NEXT:    v_alignbit_b32 v1, s1, s1, 24
; GFX7-NEXT:    s_mov_b32 s1, 0xff00ff
; GFX7-NEXT:    v_bfi_b32 v0, s1, v1, v0
; GFX7-NEXT:    v_alignbit_b32 v1, s0, s0, 8
; GFX7-NEXT:    v_alignbit_b32 v2, s0, s0, 24
; GFX7-NEXT:    v_bfi_b32 v1, s1, v2, v1
; GFX7-NEXT:    v_alignbit_b32 v2, s3, s3, 8
; GFX7-NEXT:    v_alignbit_b32 v3, s3, s3, 24
; GFX7-NEXT:    v_bfi_b32 v2, s1, v3, v2
; GFX7-NEXT:    v_alignbit_b32 v3, s2, s2, 8
; GFX7-NEXT:    v_alignbit_b32 v4, s2, s2, 24
; GFX7-NEXT:    v_bfi_b32 v3, s1, v4, v3
; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
; GFX7-NEXT:    v_readfirstlane_b32 s2, v2
; GFX7-NEXT:    v_readfirstlane_b32 s3, v3
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_bswap_v2i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, s1
; GFX8-NEXT:    s_mov_b32 s1, 0x10203
; GFX8-NEXT:    v_mov_b32_e32 v1, s0
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    v_mov_b32_e32 v3, s2
; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s1
; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s1
; GFX8-NEXT:    v_perm_b32 v2, 0, v2, s1
; GFX8-NEXT:    v_perm_b32 v3, 0, v3, s1
; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_bswap_v2i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    s_mov_b32 s1, 0x10203
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s1
; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s1
; GFX9-NEXT:    v_perm_b32 v2, 0, v2, s1
; GFX9-NEXT:    v_perm_b32 v3, 0, v3, s1
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_bswap_v2i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_perm_b32 v0, 0, s1, 0x10203
; GFX10-NEXT:    v_perm_b32 v1, 0, s0, 0x10203
; GFX10-NEXT:    v_perm_b32 v2, 0, s3, 0x10203
; GFX10-NEXT:    v_perm_b32 v3, 0, s2, 0x10203
; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
; GFX10-NEXT:    ; return to shader part epilog
  %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src)
  ret <2 x i64> %bswap
}

; <2 x i64> byte swap, v-register source.
define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) {
; GFX7-LABEL: v_bswap_v2i64:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_alignbit_b32 v4, v1, v1, 8
; GFX7-NEXT:    v_alignbit_b32 v1, v1, v1, 24
; GFX7-NEXT:    s_mov_b32 s4, 0xff00ff
; GFX7-NEXT:    v_bfi_b32 v4, s4, v1, v4
; GFX7-NEXT:    v_alignbit_b32 v1, v0, v0, 8
; GFX7-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; GFX7-NEXT:    v_bfi_b32 v1, s4, v0, v1
; GFX7-NEXT:    v_alignbit_b32 v0, v3, v3, 8
; GFX7-NEXT:    v_alignbit_b32 v3, v3, v3, 24
; GFX7-NEXT:    v_bfi_b32 v5, s4, v3, v0
; GFX7-NEXT:    v_alignbit_b32 v0, v2, v2, 8
; GFX7-NEXT:    v_alignbit_b32 v2, v2, v2, 24
; GFX7-NEXT:    v_bfi_b32 v3, s4, v2, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, v4
; GFX7-NEXT:    v_mov_b32_e32 v2, v5
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bswap_v2i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_mov_b32 s4, 0x10203
; GFX8-NEXT:    v_perm_b32 v4, 0, v1, s4
; GFX8-NEXT:    v_perm_b32 v5, 0, v3, s4
; GFX8-NEXT:    v_perm_b32 v1, 0, v0, s4
; GFX8-NEXT:    v_perm_b32 v3, 0, v2, s4
; GFX8-NEXT:    v_mov_b32_e32 v0, v4
; GFX8-NEXT:    v_mov_b32_e32 v2, v5
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bswap_v2i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, 0x10203
; GFX9-NEXT:    v_perm_b32 v4, 0, v1, s4
; GFX9-NEXT:    v_perm_b32 v5, 0, v3, s4
; GFX9-NEXT:    v_perm_b32 v1, 0, v0, s4
; GFX9-NEXT:    v_perm_b32 v3, 0, v2, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    v_mov_b32_e32 v2, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bswap_v2i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v4, 0, v1, 0x10203
; GFX10-NEXT:    v_perm_b32 v5, 0, v3, 0x10203
; GFX10-NEXT:    v_perm_b32 v1, 0, v0, 0x10203
; GFX10-NEXT:    v_perm_b32 v3, 0, v2, 0x10203
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    v_mov_b32_e32 v2, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src)
  ret <2 x i64> %bswap
}

; i16 byte swap, inreg source. The hawaii run stays entirely on scalar
; instructions here; the other runs go through v_perm_b32.
define amdgpu_ps i16 @s_bswap_i16(i16 inreg %src) {
; GFX7-LABEL: s_bswap_i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_lshl_b32 s1, s0, 8
; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80008
; GFX7-NEXT:    s_or_b32 s0, s0, s1
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_bswap_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    s_mov_b32 s0, 0xc0c0001
; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s0
; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_bswap_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    s_mov_b32 s0, 0xc0c0001
; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s0
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_bswap_i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_perm_b32 v0, 0, s0, 0xc0c0001
; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
; GFX10-NEXT:    ; return to shader part epilog
  %bswap = call i16 @llvm.bswap.i16(i16 %src)
  ret i16 %bswap
}

; i16 byte swap, v-register source.
define i16 @v_bswap_i16(i16 %src) {
; GFX7-LABEL: v_bswap_i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bswap_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_mov_b32 s4, 0xc0c0001
; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bswap_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, 0xc0c0001
; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bswap_i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call i16 @llvm.bswap.i16(i16 %src)
  ret i16 %bswap
}

; <2 x i16> byte swap, inreg source. The result is bitcast to i32 so that it
; is returned as a single 32-bit value.
define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) {
; GFX7-LABEL: s_bswap_v2i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_lshl_b32 s2, s0, 8
; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80008
; GFX7-NEXT:    s_or_b32 s0, s0, s2
; GFX7-NEXT:    s_lshl_b32 s2, s1, 8
; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x80008
; GFX7-NEXT:    s_or_b32 s1, s1, s2
; GFX7-NEXT:    s_and_b32 s1, 0xffff, s1
; GFX7-NEXT:    s_and_b32 s0, 0xffff, s0
; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
; GFX7-NEXT:    s_or_b32 s0, s0, s1
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_bswap_v2i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    s_mov_b32 s0, 0x2030001
; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s0
; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_bswap_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    s_mov_b32 s0, 0x2030001
; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s0
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_bswap_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_perm_b32 v0, 0, s0, 0x2030001
; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
; GFX10-NEXT:    ; return to shader part epilog
  %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src)
  %cast = bitcast <2 x i16> %bswap to i32
  ret i32 %cast
}

; i16 byte swap followed by zero-extension to i32. Only the hawaii checks
; contain a separate masking instruction after the swap.
define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
; GFX7-LABEL: v_bswap_i16_zext_to_i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bswap_i16_zext_to_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_mov_b32 s4, 0xc0c0001
; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bswap_i16_zext_to_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, 0xc0c0001
; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bswap_i16_zext_to_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call i16 @llvm.bswap.i16(i16 %src)
  %zext = zext i16 %bswap to i32
  ret i32 %zext
}

; i16 byte swap followed by sign-extension to i32. Every run is expected to
; emit an explicit v_bfe_i32 after the swap.
define i32 @v_bswap_i16_sext_to_i32(i16 %src) {
; GFX7-LABEL: v_bswap_i16_sext_to_i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bswap_i16_sext_to_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_mov_b32 s4, 0xc0c0001
; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bswap_i16_sext_to_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, 0xc0c0001
; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bswap_i16_sext_to_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call i16 @llvm.bswap.i16(i16 %src)
  %sext = sext i16 %bswap to i32
  ret i32 %sext
}

; <2 x i16> byte swap, v-register source. In the hawaii checks the two
; elements are handled in separate registers (v0 and v1); the other runs
; operate on a single register.
define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
; GFX7-LABEL: v_bswap_v2i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v0
; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v1
; GFX7-NEXT:    v_bfe_u32 v1, v1, 8, 8
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bswap_v2i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_mov_b32 s4, 0x2030001
; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bswap_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, 0x2030001
; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bswap_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0x2030001
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src)
  ret <2 x i16> %bswap
}

; <3 x i16> byte swap (odd element count), v-register source.
define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
; GFX7-LABEL: v_bswap_v3i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v0
; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v1
; GFX7-NEXT:    v_bfe_u32 v1, v1, 8, 8
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v2
; GFX7-NEXT:    v_bfe_u32 v2, v2, 8, 8
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bswap_v3i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_mov_b32 s4, 0x2030001
; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s4
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bswap_v3i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, 0x2030001
; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bswap_v3i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0x2030001
; GFX10-NEXT:    v_perm_b32 v1, 0, v1, 0x2030001
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %src)
  ret <3 x i16> %bswap
}

; i48 byte swap (a width that is not a power of two). The source is truncated
; from i64 and the result zero-extended back; the checks show a full 64-bit
; swap followed by a 16-bit logical right shift.
define i64 @v_bswap_i48(i64 %src) {
; GFX7-LABEL: v_bswap_i48:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_alignbit_b32 v2, v1, v1, 8
; GFX7-NEXT:    v_alignbit_b32 v1, v1, v1, 24
; GFX7-NEXT:    s_mov_b32 s4, 0xff00ff
; GFX7-NEXT:    v_bfi_b32 v1, s4, v1, v2
; GFX7-NEXT:    v_alignbit_b32 v2, v0, v0, 8
; GFX7-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; GFX7-NEXT:    v_bfi_b32 v2, s4, v0, v2
; GFX7-NEXT:    v_lshr_b64 v[0:1], v[1:2], 16
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bswap_i48:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_mov_b32 s4, 0x10203
; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s4
; GFX8-NEXT:    v_perm_b32 v2, 0, v0, s4
; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[1:2]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bswap_i48:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, 0x10203
; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s4
; GFX9-NEXT:    v_perm_b32 v2, 0, v0, s4
; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[1:2]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bswap_i48:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v1, 0, v1, 0x10203
; GFX10-NEXT:    v_perm_b32 v2, 0, v0, 0x10203
; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 16, v[1:2]
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %trunc = trunc i64 %src to i48
  %bswap = call i48 @llvm.bswap.i48(i48 %trunc)
  %zext = zext i48 %bswap to i64
  ret i64 %zext
}

; Intrinsic declarations, one per type exercised above.
declare i16 @llvm.bswap.i16(i16) #0
declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) #0
declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>) #0
declare i32 @llvm.bswap.i32(i32) #0
declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) #0
declare i64 @llvm.bswap.i64(i64) #0
declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) #0
declare i48 @llvm.bswap.i48(i48) #0

attributes #0 = { nounwind readnone speculatable willreturn }