GPUOpen-Drivers · qiaojbao · Mar 13, 2025
@@ -29,3 +29,5 @@ dne = "dne"
 offen = "offen"
 varing = "varing"
 Derivate = "Derivate"
+TESE = "TESE"
+SER = "SER"
@@ -1,7 +1,7 @@
 ##
  #######################################################################################################################
  #
- #  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ #  Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All Rights Reserved.
  #
  #  Permission is hereby granted, free of charge, to any person obtaining a copy
  #  of this software and associated documentation files (the "Software"), to
@@ -38,6 +38,12 @@ if(LLPC_BUILD_GFX11)
 endif()
 #endif
 
+#if LLPC_BUILD_GFX12
+if(LLPC_BUILD_GFX12)
+    target_compile_definitions(vkgc_headers INTERFACE LLPC_BUILD_GFX12)
+endif()
+#endif
+
 #if LLPC_RAY_TRACING
 if(LLPC_RAY_TRACING)
     if(NOT LLPC_IS_STANDALONE)

@@ -1,7 +1,7 @@
 ##
  #######################################################################################################################
  #
- #  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ #  Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All Rights Reserved.
  #
  #  Permission is hereby granted, free of charge, to any person obtaining a copy
  #  of this software and associated documentation files (the "Software"), to

@@ -1,7 +1,7 @@
 ##
  #######################################################################################################################
  #
- #  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ #  Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All Rights Reserved.
  #
  #  Permission is hereby granted, free of charge, to any person obtaining a copy
  #  of this software and associated documentation files (the "Software"), to

@@ -1,7 +1,7 @@
 ##
  #######################################################################################################################
  #
- #  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ #  Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All Rights Reserved.
  #
  #  Permission is hereby granted, free of charge, to any person obtaining a copy
  #  of this software and associated documentation files (the "Software"), to

@@ -1,7 +1,7 @@
 /*
  ***********************************************************************************************************************
  *
- *  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ *  Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All Rights Reserved.
  *
  *  Permission is hereby granted, free of charge, to any person obtaining a copy
  *  of this software and associated documentation files (the "Software"), to

@@ -453,6 +453,20 @@ struct CompileTimeConst {
   } values;          ///< The compile-time values for this slot.
 };
 
+#if LLPC_BUILD_GFX12
+/// Memory operation kinds for temporal hints. Values step by 4; presumably 4-bit field offsets (TODO confirm).
+enum TemporalHintOpType {
+  TemporalHintAtmWrite = 0,         ///< "Atm" write (presumably attributes-through-memory; TODO confirm)
+  TemporalHintImageRead = 4,        ///< Image read
+  TemporalHintImageWrite = 8,       ///< Image write
+  TemporalHintTessFactorWrite = 12, ///< Tessellation factor write
+  TemporalHintTessRead = 16,        ///< Tessellation read
+  TemporalHintTessWrite = 20,       ///< Tessellation write
+  TemporalHintBufferRead = 24,      ///< Buffer read
+  TemporalHintBufferWrite = 28,     ///< Buffer write
+};
+#endif
+
 /// Represents info of compile-time constants within a shader of a specified stage.
 struct CompileConstInfo {
   unsigned numCompileTimeConstants;        ///< Number of compile time constants.
@@ -496,7 +510,11 @@ struct PipelineOptions {
   bool reverseThreadGroup;                        ///< If set, enable thread group reversing
   bool internalRtShaders;                         ///< Whether this pipeline has internal raytracing shaders
   unsigned forceNonUniformResourceIndexStageMask; ///< Mask of the stage to force using non-uniform resource index.
+#if LLPC_BUILD_GFX12
+  bool expertSchedulingMode;
+#else
   bool reserved16;
+#endif
 
   struct GLState {
     bool replaceSetWithResourceType; ///< For OGL only, replace 'set' with resource type during spirv translate
@@ -519,14 +537,27 @@ struct PipelineOptions {
   } glState;
   const auto &getGlState() const { return glState; }
 
+#if LLPC_BUILD_GFX12
+  unsigned cacheScopePolicyControl; ///< Control cache scope policy. attributes-through-memory read/write is
+                                    ///  available.
+#else
   unsigned reserved20;
+#endif
   bool enablePrimGeneratedQuery; ///< If set, primitive generated query is enabled
   bool disablePerCompFetch;      ///< Disable per component fetch in uber fetch shader.
   bool reserved21;
   bool optimizePointSizeWrite;        ///< If set, the write of PointSize in the last vertex processing stage will be
                                       ///< eliminated if the write value is 1.0.
   CompileConstInfo *compileConstInfo; ///< Compile time constant data.
+#if LLPC_BUILD_GFX12
+  unsigned temporalHintControl; ///< Override value for temporal hint. A load/store occupies 4 bits. The highest bit
+                                ///  of 4 bits marks whether to override temporal hint.
+                                ///  Arrange from the low bit to high bit in the following order:
+                                ///  TemporalHintAtmWrite, TemporalHintImageRead, TemporalHintImageWrite,
+                                ///  TemporalHintTessFactorWrite, TemporalHintTessRead, TemporalHintTessWrite
+#else
   unsigned reserved22;
+#endif
   bool padBufferSizeToNextDword; ///< Vulkan only, set if the driver rounds the buffer size up the next dword
 };
 
@@ -776,6 +807,28 @@ inline unsigned compact32(ShaderHash hash) {
 /// Represent a pipeline option which can be automatic as well as explicitly set.
 enum InvariantLoads : unsigned { Auto = 0, EnableOptimization = 1, DisableOptimization = 2, ClearInvariants = 3 };
 
+#if LLPC_BUILD_GFX12
+/// Controls the LLC (last level cache) policy per resource, i.e. whether noAlloc is set for it.
+struct CachePolicyLlc {
+  union NoAllocResource { ///< One 32-bit entry, viewable as separate fields, as a combined ID, or as a raw value
+    struct {
+      unsigned set : 5;      ///< Resource set
+      unsigned binding : 16; ///< Resource binding
+      unsigned noAlloc : 1;  ///< llc_noAlloc policy
+      unsigned : 10;         ///< Unnamed padding; 5 + 16 + 1 + 10 = 32 bits
+    };
+    struct {
+      unsigned resourceId : 21; ///< Resource set and binding viewed as a single 21-bit ID (5 + 16 bits)
+      unsigned : 11;            ///< Unnamed; covers noAlloc and the padding bits of the other view
+    };
+    unsigned u32All; ///< All 32 bits as one value
+  };
+
+  const unsigned *noAllocs; ///< Set for each resource; presumably NoAllocResource::u32All values (TODO confirm)
+  unsigned resourceCount;   ///< The count of resources
+};
+#endif
+
 /// Represents per shader stage options.
 struct PipelineShaderOptions {
   ShaderHash clientHash;      ///< Client-supplied unique shader hash. A value of zero indicates that LLPC should
@@ -918,6 +971,10 @@ struct PipelineShaderOptions {
   /// Application workaround: forward propagate NoContraction decoration to any related FAdd operation.
   bool forwardPropagateNoContract;
 
+#if LLPC_BUILD_GFX12
+  /// Enable round-robin mode for waves in workgroup.
+  bool workgroupRoundRobin;
+#endif
   /// Binding ID offset of default uniform block
   unsigned constantBufferBindingOffset;
 
@@ -931,6 +988,15 @@ struct PipelineShaderOptions {
   /// will be assigned values as if they were decorated as DeviceIndex.
   bool viewIndexFromDeviceIndex;
 
+#if LLPC_BUILD_GFX12
+  /// Control LLC cache policy
+  CachePolicyLlc cachePolicyLlc;
+
+  /// Override value for temporal hints. Each load/store kind occupies 4 bits; the highest bit of each 4-bit group
+  /// marks whether to override the temporal hint.
+  unsigned temporalHintShaderControl;
+#endif
+
   /// Indicate whether the vertex shader is used by transform pipeline
   bool enableTransformShader;
 
@@ -1471,13 +1537,18 @@ struct RayTracingPipelineBuildInfo {
   unsigned pipelineLibStageMask; ///< Pipeline library stage mask
   //@}
 
-  unsigned payloadSizeMaxInLib;     ///< Pipeline library maxPayloadSize
-  unsigned attributeSizeMaxInLib;   ///< Pipeline library maxAttributeSize
-  bool isReplay;                    ///< Pipeline is created for replaying
-  const void *pClientMetadata;      ///< Pointer to (optional) client-defined data to be
-                                    ///  stored inside the ELF
-  size_t clientMetadataSize;        ///< Size (in bytes) of the client-defined data
-  unsigned cpsFlags;                ///< Cps feature flags
+  unsigned payloadSizeMaxInLib;   ///< Pipeline library maxPayloadSize
+  unsigned attributeSizeMaxInLib; ///< Pipeline library maxAttributeSize
+  bool isReplay;                  ///< Pipeline is created for replaying
+  const void *pClientMetadata;    ///< Pointer to (optional) client-defined data to be
+                                  ///  stored inside the ELF
+  size_t clientMetadataSize;      ///< Size (in bytes) of the client-defined data
+  unsigned cpsFlags;              ///< Cps feature flags
+#if LLPC_BUILD_GFX12
+  bool disableDynamicVgpr;       ///< Whether to disable dynamic VGPR mode for continuations. If not set, dVGPR mode is
+                                 /// enabled by default.
+  unsigned dynamicVgprBlockSize; ///< The size of the VGPR allocation granule used in dVGPR mode.
+#endif
   GpurtOption *pGpurtOptions;       ///< Array of GPURT options
   unsigned gpurtOptionCount;        ///< Number of GPURT options
   bool rtIgnoreDeclaredPayloadSize; ///< Ignore the declared payload size in the shader to address issues with Proton.

@@ -197,6 +197,12 @@ target_sources(LLVMlgc PRIVATE
     lowering/LowerRayQueryWrapper.cpp
 )
 
+#if LLPC_BUILD_GFX12
+if(LLPC_BUILD_GFX12)
+    target_sources(LLVMlgc PRIVATE lowering/AddBufferOperationMetadata.cpp)
+endif()
+#endif
+
 # include/lgc/lowering
 target_sources(LLVMlgc PRIVATE
     include/lgc/lowering/AddLoopMetadata.h
@@ -235,6 +241,12 @@ target_sources(LLVMlgc PRIVATE
     include/lgc/lowering/WorkaroundDsSubdwordWrite.h
 )
 
+#if LLPC_BUILD_GFX12
+if(LLPC_BUILD_GFX12)
+    target_sources(LLVMlgc PRIVATE include/lgc/lowering/AddBufferOperationMetadata.h)
+endif()
+#endif
+
 # lgc/state
 target_sources(LLVMlgc PRIVATE
     state/Compiler.cpp

@@ -74,6 +74,51 @@ Type *BuilderBase::getConditionallyVectorizedTy(Type *elementTy, Type *maybeVecT
 // @param vector2 : The float vector 2
 // @param instName : Name to give instruction(s)
 Value *BuilderImpl::CreateDotProduct(Value *const vector1, Value *const vector2, const Twine &instName) {
+#if LLPC_BUILD_GFX12
+  if (getPipelineState()->getTargetInfo().getGfxIpVersion().major >= 12) {
+    // Use a chain of v_dot2_f16_f16/v_dot2_bf16_bf16 on gfx12+.
+    //
+    // Note: GFX11 has this instruction, but its precision doesn't satisfy Vulkan requirements.
+    //
+    // Note: GFX10 chips may have v_dot2_f32_f16, which we could consider generating in cases where bitexact results
+    //       are not required.
+    //
+    // Note: v_dot2_f16_f16/v_dot2_bf16_bf16 only respects RTE mode according to HW spec. We must check the
+    //       specified rounding mode before using it. Also, v_dot2_f16_f16/v_dot2_bf16_bf16 is not IEEE compliant
+    //       so we must check NSZ as well.
+    const auto fp16RoundMode =
+        getPipelineState()->getShaderModes()->getCommonShaderMode(m_shaderStage.value()).fp16RoundMode;
+    const auto vectorTy = dyn_cast<FixedVectorType>(vector1->getType());
+    if (vectorTy && (vectorTy->getScalarSizeInBits() == 16) &&
+        (fp16RoundMode == FpRoundMode::DontCare || fp16RoundMode == FpRoundMode::Even) &&
+        getFastMathFlags().noSignedZeros()) {
+      int compCount = vectorTy->getNumElements();
+      Value *result = nullptr;
+      Type *basicType = getHalfTy();
+      Intrinsic::AMDGCNIntrinsics inst = Intrinsic::amdgcn_fdot2_f16_f16;
+      if (vectorTy->getScalarType()->isBFloatTy()) {
+        basicType = getBFloatTy();
+        inst = Intrinsic::amdgcn_fdot2_bf16_bf16;
+      }
+
+      if (compCount % 2 == 0) {
+        result = ConstantFP::get(basicType, 0.0);
+      } else {
+        // If the component count is odd, prefer feeding the last product (odd one out) as initial value.
+        Value *lhs = CreateExtractElement(vector1, compCount - 1);
+        Value *rhs = CreateExtractElement(vector2, compCount - 1);
+        result = CreateFMul(lhs, rhs);
+      }
+
+      for (int i = 0; i + 1 < compCount; i += 2) {
+        Value *lhs = CreateShuffleVector(vector1, {i, i + 1});
+        Value *rhs = CreateShuffleVector(vector2, {i, i + 1});
+        result = CreateIntrinsic(basicType, inst, {lhs, rhs, result});
+      }
+      return result;
+    }
+  }
+#endif
 
   Value *product = CreateFMul(vector1, vector2);
   if (!isa<VectorType>(product->getType()))
@@ -254,6 +299,9 @@ Value *BuilderImpl::CreateIntegerDotProduct(Value *vector1, Value *vector2, Valu
 bool BuilderImpl::supportWaveWideBPermute(ShaderStageEnum shaderStage) const {
   auto gfxIp = getPipelineState()->getTargetInfo().getGfxIpVersion().major;
   auto supportBPermute = gfxIp == 8 || gfxIp == 9;
+#if LLPC_BUILD_GFX12
+  supportBPermute = supportBPermute || (gfxIp == 12);
+#endif
   auto waveSize = getPipelineState()->getShaderWaveSize(shaderStage);
   supportBPermute = supportBPermute || waveSize == 32;
   return supportBPermute;
@@ -265,6 +313,14 @@ bool BuilderImpl::supportPermLane64Dpp() const {
   return getPipelineState()->getTargetInfo().getGfxIpVersion().major >= 11;
 }
 
+#if LLPC_BUILD_GFX12
+// =====================================================================================================================
+// Returns true if the target supports permute lane var operations, i.e. its GFX IP major version is 12 or above.
+bool BuilderImpl::supportPermLaneVar() const {
+  return getPipelineState()->getTargetInfo().getGfxIpVersion().major >= 12;
+}
+#endif
+
 // =====================================================================================================================
 // Create an "if..endif" or "if..else..endif" structure. The current basic block becomes the "endif" block, and all
 // instructions in that block before the insert point are moved to the "if" block. The insert point is moved to

@@ -422,7 +422,16 @@ Value *BuilderImpl::buildBufferCompactDesc(Value *desc, Value *stride) {
       sqBufRsrcWord3.gfx11.format = BUF_FORMAT_32_UINT;
       sqBufRsrcWord3.gfx11.oobSelect = stride ? 3 : 2;
       assert(sqBufRsrcWord3.u32All == 0x20014FAC || sqBufRsrcWord3.u32All == 0x30014FAC);
-    } else {
+    }
+#if LLPC_BUILD_GFX12
+    else if (gfxIp.major == 12) {
+      sqBufRsrcWord3.gfx12.format = BUF_FORMAT_32_UINT;
+      sqBufRsrcWord3.gfx12.compressionEn = 1;
+      sqBufRsrcWord3.gfx12.oobSelect = stride ? 3 : 2;
+      assert(sqBufRsrcWord3.u32All == 0x22014FAC || sqBufRsrcWord3.u32All == 0x32014FAC);
+    }
+#endif
+    else {
       llvm_unreachable("Not implemented!");
     }
     bufDesc = CreateInsertElement(bufDesc, getInt32(sqBufRsrcWord3.u32All), 3);