[AMDGPU] Add alignment check for v3 to v4 load type promotion
The promotion should be enabled only when the load alignment is at least 8 bytes.

Fixes: SWDEV-256824

Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D90404

Change-Id: I5b09c8afa3259956e02578ea5eff1988da63e140
cdevadas authored and searlmc1 committed Jan 12, 2021
1 parent 836e9b0 commit 1100ebe
Showing 7 changed files with 188 additions and 94 deletions.
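
The substance of the change: in SITargetLowering::LowerLOAD, widening a <3 x i32>/<3 x float> load to four elements reads one dword past the original object, so the widening is now gated on that over-read being known safe. Below is a condensed sketch of the new gating logic, pieced together from the SIISelLowering.cpp hunks that follow; it is not a drop-in patch. Load, Op, DAG, Alignment and NumElements are the surrounding locals shown in the diff, and the same pattern is applied (with additional surrounding checks omitted here) to the constant, global and private address-space paths.

// Condensed sketch of the new v3 -> v4 widening gate in LowerLOAD.
MachineMemOperand *MMO = Load->getMemOperand();
bool Is16ByteKnownDereferenceable = MMO->getPointerInfo().isDereferenceable(
    16, *DAG.getContext(), DAG.getDataLayout());

if (NumElements == 3) {
  // A 12-byte v3 load widened to v4 touches 4 extra bytes. With at least
  // 8-byte alignment the extra dword shares its naturally aligned 8-byte
  // granule (and therefore its page) with valid data, and with 16 bytes
  // known dereferenceable the over-read is valid by definition; in either
  // case widening cannot fault. Otherwise split into dwordx2 + dword.
  if (Alignment >= 8 || Is16ByteKnownDereferenceable)
    return WidenVectorLoad(Op, DAG);
  return SplitVectorLoad(Op, DAG);
}
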
22 changes: 15 additions & 7 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7850,6 +7850,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
LoadSDNode *Load = cast<LoadSDNode>(Op);
ISD::LoadExtType ExtType = Load->getExtensionType();
EVT MemVT = Load->getMemoryVT();
MachineMemOperand *MMO = Load->getMemOperand();

if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
@@ -7860,7 +7861,6 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {

SDValue Chain = Load->getChain();
SDValue BasePtr = Load->getBasePtr();
MachineMemOperand *MMO = Load->getMemOperand();

EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

@@ -7923,13 +7923,15 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;

unsigned NumElements = MemVT.getVectorNumElements();
bool Is16ByteKnownDereferenceable = MMO->getPointerInfo().isDereferenceable(
16, *DAG.getContext(), DAG.getDataLayout());

if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) {
if (MemVT.isPow2VectorType())
return SDValue();
if (NumElements == 3)
if (NumElements == 3 && (Alignment >= 8 || Is16ByteKnownDereferenceable))
return WidenVectorLoad(Op, DAG);
return SplitVectorLoad(Op, DAG);
}
@@ -7947,7 +7949,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
Alignment >= 4 && NumElements < 32) {
if (MemVT.isPow2VectorType())
return SDValue();
if (NumElements == 3)
if (NumElements == 3 && (Alignment >= 8 || Is16ByteKnownDereferenceable))
return WidenVectorLoad(Op, DAG);
return SplitVectorLoad(Op, DAG);
}
@@ -7963,8 +7965,11 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
// v3 loads not supported on SI.
if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
return WidenVectorLoad(Op, DAG);
if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) {
if (Alignment >= 8 || Is16ByteKnownDereferenceable)
return WidenVectorLoad(Op, DAG);
return SplitVectorLoad(Op, DAG);
}
// v3 and v4 loads are supported for private and global memory.
return SDValue();
}
@@ -7987,8 +7992,11 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
// v3 loads not supported on SI.
if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
return WidenVectorLoad(Op, DAG);
if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) {
if (Alignment >= 8 || Is16ByteKnownDereferenceable)
return WidenVectorLoad(Op, DAG);
return SplitVectorLoad(Op, DAG);
}
return SDValue();
default:
llvm_unreachable("unsupported private_element_size");
63 changes: 33 additions & 30 deletions llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -12,57 +12,60 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
; SI-LABEL: fshl_i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT: s_load_dword s0, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_i32 s3, 32, s2
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_and_b32 s1, s2, 31
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: s_cmp_eq_u32 s1, 0
; SI-NEXT: v_alignbit_b32 v0, s0, v0, v1
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: s_sub_i32 s1, 32, s0
; SI-NEXT: s_and_b32 s0, s0, 31
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: s_cmp_eq_u32 s0, 0
; SI-NEXT: v_alignbit_b32 v0, s2, v0, v1
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshl_i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sub_i32 s3, 32, s2
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: s_and_b32 s1, s2, 31
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_cmp_eq_u32 s1, 0
; VI-NEXT: v_alignbit_b32 v0, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: s_sub_i32 s1, 32, s0
; VI-NEXT: s_and_b32 s0, s0, 31
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_cmp_eq_u32 s0, 0
; VI-NEXT: v_alignbit_b32 v0, s4, v0, v1
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sub_i32 s3, 32, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: s_and_b32 s1, s2, 31
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_cmp_eq_u32 s1, 0
; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NEXT: s_sub_i32 s1, 32, s0
; GFX9-NEXT: s_and_b32 s0, s0, 31
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
39 changes: 21 additions & 18 deletions llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -21,39 +21,42 @@ define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
; SI-LABEL: fshr_i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT: s_load_dword s0, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_alignbit_b32 v0, s0, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: v_alignbit_b32 v0, s2, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshr_i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_alignbit_b32 v2, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_alignbit_b32 v2, s4, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshr_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_alignbit_b32 v2, s4, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
9 changes: 6 additions & 3 deletions llvm/test/CodeGen/AMDGPU/merge-stores.ll
@@ -275,7 +275,8 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace
}

; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
; SI-DAG: buffer_load_dwordx4
; SI-DAG: buffer_load_dwordx2
; SI-DAG: buffer_load_dword
; CI-DAG: buffer_load_dwordx3
; GCN: s_waitcnt
; SI-DAG: buffer_store_dwordx2
@@ -613,7 +614,8 @@ define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)*

; GCN-LABEL: {{^}}copy_v3i32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
@@ -647,7 +649,8 @@ define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %ou

; GCN-LABEL: {{^}}copy_v3f32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
71 changes: 71 additions & 0 deletions llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
@@ -0,0 +1,71 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s

; The type promotion for the vector loads v3i32/v3f32 into v4i32/v4f32 is enabled
; only when the alignment is 8-byte or higher.
; Otherwise, split the load into two separate loads (dwordx2 + dword).
; This type promotion on smaller aligned loads can cause a page fault error
; while accessing one extra dword beyond the buffer.

define protected amdgpu_kernel void @load_v3i32_align4(<3 x i32> addrspace(1)* %arg) #0 {
; GCN-LABEL: load_v3i32_align4:
; GCN: ; %bb.0:
; GCN: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
; GCN-NEXT: s_load_dword s{{[0-9]+}}, s[0:1], 0x8
%vec = load <3 x i32>, <3 x i32> addrspace(1)* %arg, align 4
store <3 x i32> %vec, <3 x i32> addrspace(1)* undef, align 4
ret void
}

define protected amdgpu_kernel void @load_v3i32_align8(<3 x i32> addrspace(1)* %arg) #0 {
; GCN-LABEL: load_v3i32_align8:
; GCN: ; %bb.0:
; GCN: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
%vec = load <3 x i32>, <3 x i32> addrspace(1)* %arg, align 8
store <3 x i32> %vec, <3 x i32> addrspace(1)* undef, align 8
ret void
}

define protected amdgpu_kernel void @load_v3i32_align16(<3 x i32> addrspace(1)* %arg) #0 {
; GCN-LABEL: load_v3i32_align16:
; GCN: ; %bb.0:
; GCN: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
%vec = load <3 x i32>, <3 x i32> addrspace(1)* %arg, align 16
store <3 x i32> %vec, <3 x i32> addrspace(1)* undef, align 16
ret void
}

define protected amdgpu_kernel void @load_v3f32_align4(<3 x float> addrspace(1)* %arg) #0 {
; GCN-LABEL: load_v3f32_align4:
; GCN: ; %bb.0:
; GCN: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
; GCN-NEXT: s_load_dword s{{[0-9]+}}, s[0:1], 0x8
%vec = load <3 x float>, <3 x float> addrspace(1)* %arg, align 4
store <3 x float> %vec, <3 x float> addrspace(1)* undef, align 4
ret void
}

define protected amdgpu_kernel void @load_v3f32_align8(<3 x float> addrspace(1)* %arg) #0 {
; GCN-LABEL: load_v3f32_align8:
; GCN: ; %bb.0:
; GCN: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
%vec = load <3 x float>, <3 x float> addrspace(1)* %arg, align 8
store <3 x float> %vec, <3 x float> addrspace(1)* undef, align 8
ret void
}

define protected amdgpu_kernel void @load_v3f32_align16(<3 x float> addrspace(1)* %arg) #0 {
; GCN-LABEL: load_v3f32_align16:
; GCN: ; %bb.0:
; GCN: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
%vec = load <3 x float>, <3 x float> addrspace(1)* %arg, align 16
store <3 x float> %vec, <3 x float> addrspace(1)* undef, align 16
ret void
}

attributes #0 = { nounwind noinline }