[AMDGPU] Refactor insertRelease into insertWriteback + insertWait (NFC)#199486
Open
ssahasra wants to merge 1 commit into
Open
[AMDGPU] Refactor insertRelease into insertWriteback + insertWait (NFC)#199486ssahasra wants to merge 1 commit into
ssahasra wants to merge 1 commit into
Conversation
A release consists of two actions: write-back the current cache, and wait for "relevant" outstanding operations to complete. With the new memory model, it is possible to disable the cache write-back using "av none" semantics. This patch cleanly separates the existing implementation so that the write-backs can be selectively applied when such metadata is present. Assisted-By: Claude Opus 4.6
|
@llvm/pr-subscribers-backend-amdgpu Author: Sameer Sahasrabuddhe (ssahasra) ChangesA release consists of two actions: write-back the current cache, and wait for "relevant" outstanding operations to complete. With the new memory model, it is possible to disable the cache write-back using "non-av". This patch cleanly separates the existing implementation so that the write-backs can be selectively applied after checking for non-av semantics. Assisted-By: Claude Opus 4.6 Full diff: https://github.com/llvm/llvm-project/pull/199486.diff 1 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 25922c5af7da1..b721dcaf49d0f 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -442,17 +442,27 @@ class SICacheControl {
SIAtomicAddrSpace AddrSpace,
Position Pos) const = 0;
- /// Inserts any necessary instructions at position \p Pos relative to
- /// instruction \p MI to ensure previous memory instructions by this thread
- /// with address spaces \p AddrSpace have completed and can be observed by
- /// subsequent memory instructions by any thread executing in memory scope \p
- /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
- /// between address spaces. Returns true iff any instructions inserted.
- virtual bool insertRelease(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- bool IsCrossAddrSpaceOrdering,
- Position Pos) const = 0;
+ /// Inserts any necessary writeback instructions at position \p Pos relative
+ /// to instruction \p MI to make previous memory operations by this thread
+ /// with address spaces \p AddrSpace available to other threads in memory
+ /// scope \p Scope. Does not insert waits; callers must call insertWait
+ /// separately. Returns true iff any instructions inserted.
+ virtual bool insertWriteback(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace,
+ Position Pos) const = 0;
+
+ /// Inserts writeback followed by an unconditional wait to implement a
+ /// release operation.
+ bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
+ bool Changed = false;
+ Changed |= insertWriteback(MI, Scope, AddrSpace, Pos);
+ Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+ IsCrossAddrSpaceOrdering, Pos,
+ AtomicOrdering::Release, /*AtomicsOnly=*/false);
+ return Changed;
+ }
/// Handle operations that are considered non-volatile.
/// See \ref isNonVolatileMemoryAccess
@@ -496,11 +506,9 @@ class SIGfx6CacheControl final : public SICacheControl {
SIAtomicAddrSpace AddrSpace,
Position Pos) const override;
- bool insertRelease(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- bool IsCrossAddrSpaceOrdering,
- Position Pos) const override;
+ bool insertWriteback(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
};
/// Generates code sequences for the memory model of GFX10/11.
@@ -537,12 +545,10 @@ class SIGfx10CacheControl final : public SICacheControl {
bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, Position Pos) const override;
- bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
- Position Pos) const override {
- return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
- /*AtomicsOnly=*/false);
+ bool insertWriteback(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override {
+ return false;
}
};
@@ -594,9 +600,9 @@ class SIGfx12CacheControl final : public SICacheControl {
bool handleCooperativeAtomic(MachineInstr &MI) const override;
- bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
- Position Pos) const override;
+ bool insertWriteback(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -1464,67 +1470,56 @@ bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
return Changed;
}
-bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- bool IsCrossAddrSpaceOrdering,
- Position Pos) const {
- bool Changed = false;
+bool SIGfx6CacheControl::insertWriteback(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ if (!ST.hasGFX90AInsts())
+ return false;
- if (ST.hasGFX90AInsts()) {
- MachineBasicBlock &MBB = *MI->getParent();
- const DebugLoc &DL = MI->getDebugLoc();
+ bool Changed = false;
+ MachineBasicBlock &MBB = *MI->getParent();
+ const DebugLoc &DL = MI->getDebugLoc();
- if (Pos == Position::AFTER)
- ++MI;
+ if (Pos == Position::AFTER)
+ ++MI;
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
- // hardware does not reorder memory operations by the same wave with
- // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
- // to initiate writeback of any dirty cache lines of earlier writes by
- // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
- // writeback has completed.
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
+ // to initiate writeback of any dirty cache lines of earlier writes by
+ // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
+ // writeback has completed.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+ Changed = true;
+ break;
+ case SIAtomicScope::AGENT:
+ if (ST.hasGFX940Insts()) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
- // Set SC bits to indicate system scope.
- .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+ // Set SC bits to indicate agent scope.
+ .addImm(AMDGPU::CPol::SC1);
Changed = true;
- break;
- case SIAtomicScope::AGENT:
- if (ST.hasGFX940Insts()) {
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
- // Set SC bits to indicate agent scope.
- .addImm(AMDGPU::CPol::SC1);
-
- // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
- // SIAtomicScope::AGENT, the following insertWait will generate the
- // required "S_WAITCNT vmcnt(0)".
- Changed = true;
- }
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it
- // would writeback, and would require an otherwise unnecessary
- // "S_WAITCNT vmcnt(0)".
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
}
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it
+ // would writeback, and would require an otherwise unnecessary
+ // "S_WAITCNT vmcnt(0)".
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
}
-
- if (Pos == Position::AFTER)
- --MI;
}
- // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
- // S_WAITCNT needed.
- Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
- /*AtomicsOnly=*/false);
+ if (Pos == Position::AFTER)
+ --MI;
return Changed;
}
@@ -2068,75 +2063,66 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
return true;
}
-bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- bool IsCrossAddrSpaceOrdering,
- Position Pos) const {
- bool Changed = false;
-
- MachineBasicBlock &MBB = *MI->getParent();
- const DebugLoc &DL = MI->getDebugLoc();
-
+bool SIGfx12CacheControl::insertWriteback(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
// The scratch address space does not need the global memory cache
// writeback as all memory operations by the same thread are
// sequentially consistent, and no other thread can access scratch
// memory.
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- if (Pos == Position::AFTER)
- ++MI;
+ if (!canAffectGlobalAddrSpace(AddrSpace))
+ return false;
- // global_wb is only necessary at system scope for GFX12.0,
- // they're also necessary at device scope for GFX12.5 as stores
- // cannot report completion earlier than L2.
- //
- // Emitting it for lower scopes is a slow no-op, so we omit it
- // for performance.
- std::optional<AMDGPU::CPol::CPol> NeedsWB;
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- NeedsWB = AMDGPU::CPol::SCOPE_SYS;
- break;
- case SIAtomicScope::AGENT:
- // GFX12.5 may have >1 L2 per device so we must emit a device scope WB.
- if (ST.hasGFX1250Insts())
- NeedsWB = AMDGPU::CPol::SCOPE_DEV;
- break;
- case SIAtomicScope::CLUSTER:
- case SIAtomicScope::WORKGROUP:
- // No WB necessary, but we still have to wait.
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No WB or wait necessary here, but insertWait takes care of that.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
+ bool Changed = false;
+ MachineBasicBlock &MBB = *MI->getParent();
+ const DebugLoc &DL = MI->getDebugLoc();
- if (NeedsWB) {
- // Target requires a waitcnt to ensure that the proceeding store
- // proceeding store/rmw operations have completed in L2 so their data will
- // be written back by the WB instruction.
- if (ST.hasINVWBL2WaitCntRequirement())
- insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- /*IsCrossAddrSpaceOrdering=*/false, Pos,
- AtomicOrdering::Release,
- /*AtomicsOnly=*/false);
-
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(*NeedsWB);
- Changed = true;
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ // global_wb is only necessary at system scope for GFX12.0,
+ // they're also necessary at device scope for GFX12.5 as stores
+ // cannot report completion earlier than L2.
+ //
+ // Emitting it for lower scopes is a slow no-op, so we omit it
+ // for performance.
+ std::optional<AMDGPU::CPol::CPol> NeedsWB;
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ NeedsWB = AMDGPU::CPol::SCOPE_SYS;
+ break;
+ case SIAtomicScope::AGENT:
+ // GFX12.5 may have >1 L2 per device so we must emit a device scope WB.
+ if (ST.hasGFX1250Insts())
+ NeedsWB = AMDGPU::CPol::SCOPE_DEV;
+ break;
+ case SIAtomicScope::CLUSTER:
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+
+ if (NeedsWB) {
+ // Target requires a waitcnt to ensure that the proceeding store
+ // proceeding store/rmw operations have completed in L2 so their data will
+ // be written back by the WB instruction.
+ if (ST.hasINVWBL2WaitCntRequirement()) {
+ insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+ /*IsCrossAddrSpaceOrdering=*/false, Pos,
+ AtomicOrdering::Release,
+ /*AtomicsOnly=*/false);
}
- if (Pos == Position::AFTER)
- --MI;
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(*NeedsWB);
+ Changed = true;
}
- // We always have to wait for previous memory operations (load/store) to
- // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
- // we of course need to wait for that as well.
- Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
- /*AtomicsOnly=*/false);
+ if (Pos == Position::AFTER)
+ --MI;
return Changed;
}
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.
A release consists of two actions: write-back the current cache, and wait for "relevant" outstanding operations to complete. With the new memory model, it is possible to disable the cache write-back using "non-av". This patch cleanly separates the existing implementation so that the write-backs can be selectively applied after checking for non-av semantics.
Part of a stack:
Assisted-By: Claude Opus 4.6