; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s

; Code generation tests for the llvm.usub.sat.* intrinsics on amdgcn.
; Each function is compiled for tahiti, fiji, gfx900, gfx1010 and gfx1100
; (the latter both with and without real-true16), and the complete expected
; instruction sequence is checked per target. The check lines are generated
; by the script named on the first line; regenerate them with that script
; rather than editing them by hand.

; Scalar i8 operands. The tahiti and gfx10+ expectations mask both operands
; with 0xff before subtracting; the fiji/gfx900 expectations select the low
; byte through SDWA operands on a single clamped subtract.
define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
; GFX6-LABEL: v_usubsat_i8:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX6-NEXT: v_max_u32_e32 v0, v0, v1
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_usubsat_i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_usubsat_i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h clamp
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_usubsat_i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
  %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
  ret i8 %result
}

; Scalar i16 operands. Only the tahiti expectation uses a max+sub sequence
; on masked operands; every other target expects one clamped 16-bit subtract.
define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
; GFX6-LABEL: v_usubsat_i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_max_u32_e32 v0, v0, v1
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_usubsat_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_usubsat_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h clamp
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_usubsat_i16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
  %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
  ret i16 %result
}

; No intrinsic call here: the input is the bit pattern
; (ashr x, 15) & (x xor 0x8000). On every target except tahiti the expected
; output is a single clamped 16-bit subtract of 0x8000, i.e. the pattern is
; expected to be matched as a saturating subtract. The tahiti expectation
; keeps the shift/xor/and sequence.
define i16 @usubsat_as_bithack_i16(i16 %x) {
; GFX6-LABEL: usubsat_as_bithack_i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v1
; GFX6-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GFX6-NEXT: v_and_b32_e32 v0, v1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: usubsat_as_bithack_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_movk_i32 s4, 0x8000
; GFX8-NEXT: v_sub_u16_e64 v0, v0, s4 clamp
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: usubsat_as_bithack_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: usubsat_as_bithack_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: usubsat_as_bithack_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 0x8000 clamp
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: usubsat_as_bithack_i16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
  %signsplat = ashr i16 %x, 15
  %flipsign = xor i16 %x, 32768
  %result = and i16 %signsplat, %flipsign
  ret i16 %result
}

; Variant of usubsat_as_bithack_i16 that flips the sign bit with
; "add x, 0x8000" instead of xor. The non-tahiti expectations are the same
; clamped subtract of 0x8000.
define i16 @usubsat_as_bithack2_i16(i16 %x) {
; GFX6-LABEL: usubsat_as_bithack2_i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xffff8000, v0
; GFX6-NEXT: v_and_b32_e32 v0, v1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: usubsat_as_bithack2_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_movk_i32 s4, 0x8000
; GFX8-NEXT: v_sub_u16_e64 v0, v0, s4 clamp
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: usubsat_as_bithack2_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: usubsat_as_bithack2_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: usubsat_as_bithack2_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 0x8000 clamp
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: usubsat_as_bithack2_i16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
  %signsplat = ashr i16 %x, 15
  %flipsign = add i16 %x, 32768
  %result = and i16 %signsplat, %flipsign
  ret i16 %result
}

; Same as usubsat_as_bithack2_i16 but with the operands of the final "and"
; swapped, to cover the commuted form of the pattern.
define i16 @usubsat_as_bithack_commute_i16(i16 %x) {
; GFX6-LABEL: usubsat_as_bithack_commute_i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xffff8000, v0
; GFX6-NEXT: v_and_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: usubsat_as_bithack_commute_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_movk_i32 s4, 0x8000
; GFX8-NEXT: v_sub_u16_e64 v0, v0, s4 clamp
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: usubsat_as_bithack_commute_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: usubsat_as_bithack_commute_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: usubsat_as_bithack_commute_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 0x8000 clamp
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: usubsat_as_bithack_commute_i16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
  %signsplat = ashr i16 %x, 15
  %flipsign = add i16 %x, 32768
  %result = and i16 %flipsign, %signsplat
  ret i16 %result
}

; Scalar i32 operands. tahiti expects max+sub; all other targets expect a
; single clamped 32-bit subtract. gfx1010 and gfx1100 share one set of
; checks for this function.
define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) {
; GFX6-LABEL: v_usubsat_i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_max_u32_e32 v0, v0, v1
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v1 clamp
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_u32_e64 v0, v0, v1 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v1 clamp
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
  ret i32 %result
}

; <2 x i16> operands. gfx900 and newer expect one packed clamped subtract;
; fiji expects two clamped 16-bit subtracts combined with an or; tahiti
; expects each element to be handled separately with max+sub.
define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: v_usubsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_max_u32_e32 v1, v1, v3
; GFX6-NEXT: v_max_u32_e32 v0, v0, v2
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u16_sdwa v2, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v2i16:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
  ret <2 x i16> %result
}

; <3 x i16> operands (odd element count). gfx900 and newer expect two packed
; clamped subtracts; fiji expects three clamped 16-bit subtracts.
define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-LABEL: v_usubsat_v3i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v4
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_max_u32_e32 v1, v1, v6
; GFX6-NEXT: v_max_u32_e32 v0, v0, v3
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_max_u32_e32 v1, v2, v5
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v1, v5
; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v3i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_u16_e64 v0, v0, v2 clamp
; GFX8-NEXT: v_sub_u16_e64 v1, v1, v3 clamp
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, v3 clamp
; GFX9-NEXT: v_pk_sub_u16 v0, v0, v2 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v3i16:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, v2 clamp
; GFX10PLUS-NEXT: v_pk_sub_u16 v1, v1, v3 clamp
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
  ret <3 x i16> %result
}

; <4 x i16> operands. Unlike the other tests, the result is bitcast to
; <2 x float> before being returned.
define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; GFX6-LABEL: v_usubsat_v4i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v9, 0xffff, v5
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_max_u32_e32 v1, v1, v9
; GFX6-NEXT: v_max_u32_e32 v0, v0, v4
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v7
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_max_u32_e32 v1, v2, v6
; GFX6-NEXT: v_max_u32_e32 v2, v3, v8
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_u16_e64 v0, v0, v2 clamp
; GFX8-NEXT: v_sub_u16_sdwa v2, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_u16_e64 v1, v1, v3 clamp
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v0, v0, v2 clamp
; GFX9-NEXT: v_pk_sub_u16 v1, v1, v3 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v4i16:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, v2 clamp
; GFX10PLUS-NEXT: v_pk_sub_u16 v1, v1, v3 clamp
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  %cast = bitcast <4 x i16> %result to <2 x float>
  ret <2 x float> %cast
}

; <2 x i32> operands: the expected output repeats the scalar i32 sequence
; once per element.
define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; GFX6-LABEL: v_usubsat_v2i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_max_u32_e32 v0, v0, v2
; GFX6-NEXT: v_max_u32_e32 v1, v1, v3
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v2i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v2 clamp
; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v3 clamp
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_u32_e64 v0, v0, v2 clamp
; GFX9-NEXT: v_sub_u32_e64 v1, v1, v3 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v2i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v2 clamp
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v3 clamp
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i32> %result
}

; <3 x i32> operands.
define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
; GFX6-LABEL: v_usubsat_v3i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_max_u32_e32 v0, v0, v3
; GFX6-NEXT: v_max_u32_e32 v1, v1, v4
; GFX6-NEXT: v_max_u32_e32 v2, v2, v5
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v3i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v3 clamp
; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v4 clamp
; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v5 clamp
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v3i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_u32_e64 v0, v0, v3 clamp
; GFX9-NEXT: v_sub_u32_e64 v1, v1, v4 clamp
; GFX9-NEXT: v_sub_u32_e64 v2, v2, v5 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v3i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v3 clamp
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v4 clamp
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, v2, v5 clamp
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
  ret <3 x i32> %result
}

; <4 x i32> operands.
define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
; GFX6-LABEL: v_usubsat_v4i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_max_u32_e32 v0, v0, v4
; GFX6-NEXT: v_max_u32_e32 v1, v1, v5
; GFX6-NEXT: v_max_u32_e32 v2, v2, v6
; GFX6-NEXT: v_max_u32_e32 v3, v3, v7
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v4i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v4 clamp
; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v5 clamp
; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v6 clamp
; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v7 clamp
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_u32_e64 v0, v0, v4 clamp
; GFX9-NEXT: v_sub_u32_e64 v1, v1, v5 clamp
; GFX9-NEXT: v_sub_u32_e64 v2, v2, v6 clamp
; GFX9-NEXT: v_sub_u32_e64 v3, v3, v7 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v4i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v4 clamp
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v5 clamp
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, v2, v6 clamp
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v3, v3, v7 clamp
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
  ret <4 x i32> %result
}

; <8 x i32> operands.
define <8 x i32> @v_usubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
; GFX6-LABEL: v_usubsat_v8i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_max_u32_e32 v0, v0, v8
; GFX6-NEXT: v_max_u32_e32 v1, v1, v9
; GFX6-NEXT: v_max_u32_e32 v2, v2, v10
; GFX6-NEXT: v_max_u32_e32 v3, v3, v11
; GFX6-NEXT: v_max_u32_e32 v4, v4, v12
; GFX6-NEXT: v_max_u32_e32 v5, v5, v13
; GFX6-NEXT: v_max_u32_e32 v6, v6, v14
; GFX6-NEXT: v_max_u32_e32 v7, v7, v15
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v11
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v12
; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v13
; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v14
; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v15
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v8i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v8 clamp
; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v9 clamp
; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v10 clamp
; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v11 clamp
; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v12 clamp
; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], v5, v13 clamp
; GFX8-NEXT: v_sub_u32_e64 v6, s[4:5], v6, v14 clamp
; GFX8-NEXT: v_sub_u32_e64 v7, s[4:5], v7, v15 clamp
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v8i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_u32_e64 v0, v0, v8 clamp
; GFX9-NEXT: v_sub_u32_e64 v1, v1, v9 clamp
; GFX9-NEXT: v_sub_u32_e64 v2, v2, v10 clamp
; GFX9-NEXT: v_sub_u32_e64 v3, v3, v11 clamp
; GFX9-NEXT: v_sub_u32_e64 v4, v4, v12 clamp
; GFX9-NEXT: v_sub_u32_e64 v5, v5, v13 clamp
; GFX9-NEXT: v_sub_u32_e64 v6, v6, v14 clamp
; GFX9-NEXT: v_sub_u32_e64 v7, v7, v15 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v8i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v8 clamp
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v9 clamp
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, v2, v10 clamp
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v3, v3, v11 clamp
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v4, v4, v12 clamp
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v5, v5, v13 clamp
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v6, v6, v14 clamp
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v7, v7, v15 clamp
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %result = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs)
  ret <8 x i32> %result
}

; <16 x i32> operands. On every target the checks expect one dword to be
; loaded through s32 and used as the rhs of the last element's subtract
; (presumably the one argument dword that does not fit in v0-v30 and is
; passed on the stack -- confirm against the calling convention). gfx1010
; and gfx1100 have separate checks here because the load instruction
; differs (buffer_load_dword vs. scratch_load_b32).
define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX6-LABEL: v_usubsat_v16i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_max_u32_e32 v0, v0, v16
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v16
; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX6-NEXT: v_max_u32_e32 v1, v1, v17
; GFX6-NEXT: v_max_u32_e32 v2, v2, v18
; GFX6-NEXT: v_max_u32_e32 v3, v3, v19
; GFX6-NEXT: v_max_u32_e32 v4, v4, v20
; GFX6-NEXT: v_max_u32_e32 v5, v5, v21
; GFX6-NEXT: v_max_u32_e32 v6, v6, v22
; GFX6-NEXT: v_max_u32_e32 v7, v7, v23
; GFX6-NEXT: v_max_u32_e32 v8, v8, v24
; GFX6-NEXT: v_max_u32_e32 v9, v9, v25
; GFX6-NEXT: v_max_u32_e32 v10, v10, v26
; GFX6-NEXT: v_max_u32_e32 v11, v11, v27
; GFX6-NEXT: v_max_u32_e32 v12, v12, v28
; GFX6-NEXT: v_max_u32_e32 v13, v13, v29
; GFX6-NEXT: v_max_u32_e32 v14, v14, v30
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v17
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v18
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v19
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v20
; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v21
; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v22
; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v23
; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v24
; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v25
; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v26
; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v27
; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v28
; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v29
; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v30
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_max_u32_e32 v15, v15, v16
; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v16i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v16 clamp
; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v17 clamp
; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v18 clamp
; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v19 clamp
; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v20 clamp
; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], v5, v21 clamp
; GFX8-NEXT: v_sub_u32_e64 v6, s[4:5], v6, v22 clamp
; GFX8-NEXT: v_sub_u32_e64 v7, s[4:5], v7, v23 clamp
; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v8, v24 clamp
; GFX8-NEXT: v_sub_u32_e64 v9, s[4:5], v9, v25 clamp
; GFX8-NEXT: v_sub_u32_e64 v10, s[4:5], v10, v26 clamp
; GFX8-NEXT: v_sub_u32_e64 v11, s[4:5], v11, v27 clamp
; GFX8-NEXT: v_sub_u32_e64 v12, s[4:5], v12, v28 clamp
; GFX8-NEXT: v_sub_u32_e64 v13, s[4:5], v13, v29 clamp
; GFX8-NEXT: v_sub_u32_e64 v14, s[4:5], v14, v30 clamp
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u32_e64 v15, s[4:5], v15, v16 clamp
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v16i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_u32_e64 v0, v0, v16 clamp
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX9-NEXT: v_sub_u32_e64 v1, v1, v17 clamp
; GFX9-NEXT: v_sub_u32_e64 v2, v2, v18 clamp
; GFX9-NEXT: v_sub_u32_e64 v3, v3, v19 clamp
; GFX9-NEXT: v_sub_u32_e64 v4, v4, v20 clamp
; GFX9-NEXT: v_sub_u32_e64 v5, v5, v21 clamp
; GFX9-NEXT: v_sub_u32_e64 v6, v6, v22 clamp
; GFX9-NEXT: v_sub_u32_e64 v7, v7, v23 clamp
; GFX9-NEXT: v_sub_u32_e64 v8, v8, v24 clamp
; GFX9-NEXT: v_sub_u32_e64 v9, v9, v25 clamp
; GFX9-NEXT: v_sub_u32_e64 v10, v10, v26 clamp
; GFX9-NEXT: v_sub_u32_e64 v11, v11, v27 clamp
; GFX9-NEXT: v_sub_u32_e64 v12, v12, v28 clamp
; GFX9-NEXT: v_sub_u32_e64 v13, v13, v29 clamp
; GFX9-NEXT: v_sub_u32_e64 v14, v14, v30 clamp
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e64 v15, v15, v16 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_usubsat_v16i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp
; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp
; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp
; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v19 clamp
; GFX10-NEXT: v_sub_nc_u32_e64 v4, v4, v20 clamp
; GFX10-NEXT: v_sub_nc_u32_e64 v5, v5, v21 clamp
; GFX10-NEXT: v_sub_nc_u32_e64 v6, v6, v22 clamp
; GFX10-NEXT: v_sub_nc_u32_e64 v7, v7, v23 clamp
; GFX10-NEXT: v_sub_nc_u32_e64 v8, v8, v24 clamp
; GFX10-NEXT: v_sub_nc_u32_e64 v9, v9, v25 clamp
; GFX10-NEXT: v_sub_nc_u32_e64 v10, v10, v26 clamp
; GFX10-NEXT: v_sub_nc_u32_e64 v11, v11, v27 clamp
; GFX10-NEXT: v_sub_nc_u32_e64 v12, v12, v28 clamp
; GFX10-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp
; GFX10-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_usubsat_v16i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp
; GFX11-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp
; GFX11-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp
; GFX11-NEXT: v_sub_nc_u32_e64 v3, v3, v19 clamp
; GFX11-NEXT: v_sub_nc_u32_e64 v4, v4, v20 clamp
; GFX11-NEXT: v_sub_nc_u32_e64 v5, v5, v21 clamp
; GFX11-NEXT: v_sub_nc_u32_e64 v6, v6, v22 clamp
; GFX11-NEXT: v_sub_nc_u32_e64 v7, v7, v23 clamp
; GFX11-NEXT: v_sub_nc_u32_e64 v8, v8, v24 clamp
; GFX11-NEXT: v_sub_nc_u32_e64 v9, v9, v25 clamp
; GFX11-NEXT: v_sub_nc_u32_e64 v10, v10, v26 clamp
; GFX11-NEXT: v_sub_nc_u32_e64 v11, v11, v27 clamp
; GFX11-NEXT: v_sub_nc_u32_e64 v12, v12, v28 clamp
; GFX11-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp
; GFX11-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
  ret <16 x i32> %result
}


; Scalar i64 operands. No target's expected output uses a clamped subtract
; here: all of them expect a 64-bit subtract with borrow, an unsigned
; compare of the difference against the original lhs, and a select of zero
; for each result half.
define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-LABEL: v_usubsat_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v0, v2
; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v0, v2
; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_i64:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
  ret i64 %result
}

; Declarations of every llvm.usub.sat overload called above; all share
; attribute group #0.
declare i8 @llvm.usub.sat.i8(i8, i8) #0
declare i16 @llvm.usub.sat.i16(i16, i16) #0
declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>) #0
declare <3 x i16> @llvm.usub.sat.v3i16(<3 x i16>, <3 x i16>) #0
declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) #0
declare i32 @llvm.usub.sat.i32(i32, i32) #0
declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>) #0
declare <3 x i32> @llvm.usub.sat.v3i32(<3 x i32>, <3 x i32>) #0
declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) #0
declare <8 x i32> @llvm.usub.sat.v8i32(<8 x i32>, <8 x i32>) #0
declare <16 x i32> @llvm.usub.sat.v16i32(<16 x i32>, <16 x i32>) #0
declare i64 @llvm.usub.sat.i64(i64, i64) #0

attributes #0 = { nounwind readnone speculatable willreturn }