Skip to content
Open
4 changes: 4 additions & 0 deletions src/coreclr/jit/codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,10 @@ class CodeGen final : public CodeGenInterface

void genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroed);

#if defined(TARGET_ARM64)
void genUnknownSizeFrame();
#endif

#elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
bool genInstrWithConstant(instruction ins,
emitAttr attr,
Expand Down
45 changes: 45 additions & 0 deletions src/coreclr/jit/codegenarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4865,9 +4865,54 @@ void CodeGen::genPushCalleeSavedRegisters()
m_compiler->compFrameInfo.calleeSaveSpOffset = calleeSaveSpOffset;
m_compiler->compFrameInfo.calleeSaveSpDelta = calleeSaveSpDelta;
m_compiler->compFrameInfo.offsetSpToSavedFp = offsetSpToSavedFp;

if (m_compiler->compUsesUnknownSizeFrame)
{
genUnknownSizeFrame();
}
#endif // TARGET_ARM64
}

#if defined(TARGET_ARM64)
/*****************************************************************************
*
* Generates code for creating the UnknownSizeFrame stack space.
*
* See Compiler::UnknownSizeFrame for implementation details. The space contains
* stack allocations for Vector<T>.
*/

void CodeGen::genUnknownSizeFrame()
Comment thread
snickolls-arm marked this conversation as resolved.
{
assert(m_compiler->compLocallocUsed && m_compiler->compUsesUnknownSizeFrame);
assert(m_compiler->unkSizeFrame.isFinalized);
unsigned totalVectorCount = m_compiler->unkSizeFrame.FrameSizeInVectors();

// We reserve REG_UNKBASE for addressing SVE locals. This will always point at the top of
// of the UnknownSizeFrame and we index into it.
// TODO-SVE: We may want this to point into the middle of the frame to reduce address
// computations (we have a signed 9-bit indexing immediate).
inst_Mov(TYP_I_IMPL, REG_UNKBASE, REG_SP, false);

if (0 < totalVectorCount && totalVectorCount <= 32)
{
GetEmitter()->emitIns_R_R_I(INS_sve_addvl, EA_8BYTE, REG_SP, REG_SP, -(ssize_t)totalVectorCount);
}
else
{
// Generate `sp = sp - totalVectorCount * VL`
assert(totalVectorCount != 0);
regNumber rsvd = rsGetRsvdReg();
// mov rsvd, #totalVectorCount
// rdvl scratch, #1
// msub sp, rsvd, scratch, sp
instGen_Set_Reg_To_Imm(EA_8BYTE, rsvd, totalVectorCount);
GetEmitter()->emitIns_R_I(INS_sve_rdvl, EA_8BYTE, REG_SCRATCH, 1);
GetEmitter()->emitIns_R_R_R_R(INS_msub, EA_8BYTE, REG_SP, rsvd, REG_SCRATCH, REG_SP);
}
}
#endif

/*****************************************************************************
*
* Generates code for a function epilog.
Expand Down
21 changes: 21 additions & 0 deletions src/coreclr/jit/codegencommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3708,6 +3708,11 @@ void CodeGen::genCheckUseBlockInit()
continue;
}

if (m_compiler->lvaIsUnknownSizeLocal(varNum))
{
continue;
}

if (m_compiler->fgVarIsNeverZeroInitializedInProlog(varNum))
{
varDsc->lvMustInit = 0;
Expand Down Expand Up @@ -4065,6 +4070,12 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg,

noway_assert(varDsc->lvOnFrame);

if (m_compiler->lvaIsUnknownSizeLocal(varNum))
{
// This local will belong on the UnknownSizeFrame, which will handle zeroing instead.
continue;
}

// lvMustInit can only be set for GC types or TYP_STRUCT types
// or when compInitMem is true
// or when in debug code
Expand Down Expand Up @@ -4807,6 +4818,11 @@ void CodeGen::genFinalizeFrame()
regSet.rsSetRegsModified(maskPairRegs);
}
}

if (m_compiler->compUsesUnknownSizeFrame)
{
regSet.rsSetRegsModified(RBM_UNKBASE);
}
#endif

#ifdef DEBUG
Expand Down Expand Up @@ -5131,6 +5147,11 @@ void CodeGen::genFnProlog()
continue;
}

if (m_compiler->lvaIsUnknownSizeLocal(varNum))
{
continue;
}

signed int loOffs = varDsc->GetStackOffset();
signed int hiOffs = varDsc->GetStackOffset() + m_compiler->lvaLclStackHomeSize(varNum);

Expand Down
6 changes: 6 additions & 0 deletions src/coreclr/jit/compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5647,6 +5647,12 @@ void Compiler::generatePatchpointInfo()
//
unsigned varNum = lclNum;

// Variable-sized locals reside in a different part of the stack frame.
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This brings up OSR support for this kind of stack frame, which I hadn't yet run into. I suppose it's not possible to just skip over these kinds of variables. I would have to either disable OSR for the method, or add some support to allow for copying over the extra frame space as well?

if (lvaIsUnknownSizeLocal(varNum))
{
continue;
}

if (gsShadowVarInfo != nullptr)
{
unsigned const shadowNum = gsShadowVarInfo[lclNum].shadowCopy;
Expand Down
211 changes: 211 additions & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -981,17 +981,49 @@ class LclVarDsc
public:
// GetStackOffset: Get the stack offset recorded for this local.
//
// Asserts that the local's value size is exact. Locals without an exact size
// reuse the same lvStkOffs field for their UnknownSizeFrame index instead
// (see GetUnknownSizeFrameIndex).
//
// Returns:
//    The value stored in lvStkOffs.
//
int GetStackOffset() const
{
assert(lvValueSize().IsExact());
return lvStkOffs;
}

// SetStackOffset: Record the stack offset for this local.
//
// Asserts that the local's value size is exact. Locals without an exact size
// reuse the same lvStkOffs field for their UnknownSizeFrame index instead
// (see SetUnknownSizeFrameIndex).
//
// Arguments:
//    offset -- The value to store in lvStkOffs.
//
void SetStackOffset(int offset)
{
assert(lvValueSize().IsExact());
lvStkOffs = offset;
}

unsigned lvExactSize() const;
ValueSize lvValueSize() const;

// SetUnknownSizeFrameIndex: Set the index that has been assigned to this
// local on the UnknownSizeFrame.
//
// This is only used for locals that have an unknown size, such as TYP_SIMD/TYP_MASK.
// These locals do not have an absolute stack offset, so the index is stored in
// lvStkOffs, the field that SetStackOffset writes for exact-size locals.
// Asserts that the local's value size is not exact.
//
// Arguments:
//    index -- The index on the UnknownSizeFrame to assign to this local.
//
void SetUnknownSizeFrameIndex(int index)
{
assert(!lvValueSize().IsExact());
lvStkOffs = index;
}

// GetUnknownSizeFrameIndex: Get the index that has been assigned to this
// local on the UnknownSizeFrame.
//
// This is only used for locals that have an unknown size, such as TYP_SIMD/TYP_MASK.
// These locals do not have an absolute stack offset, so the index is read from
// lvStkOffs, the field that GetStackOffset reads for exact-size locals.
// Asserts that the local's value size is not exact.
//
// Returns:
//    The index of this local on the UnknownSizeFrame.
//
int GetUnknownSizeFrameIndex() const
{
assert(!lvValueSize().IsExact());
return lvStkOffs;
}

unsigned lvSlotNum; // original slot # (if remapped)

// class handle for the local or null if not known or not a class
Expand Down Expand Up @@ -4304,6 +4336,185 @@ class Compiler

int lvaOSRLocalTier0FrameOffset(unsigned varNum);

//------------------------- UnknownSizeFrame ---------------------------------

void lvaInitUnknownSizeFrame();
void lvaAllocUnknownSizeLocal(unsigned varNum);

bool compUsesUnknownSizeFrame;

#if defined(FEATURE_SIMD) && defined(TARGET_ARM64)
// For ARM64, the UnknownSizeFrame lives at the end of the statically
// allocated stack space. This means it belongs to the 'alloca' space on the
// frame, and it is essentially the first dynamically allocated stack
// variable.
//
// Currently, the only locals with unknown size are SIMD types supporting
// Vector<T>, TYP_SIMD and TYP_MASK. We do not know the size of these types
// at compile time, so we need to execute the rdvl/addvl instruction to
// learn this size and allocate the UnknownSizeFrame.
//
// We reserve the x19 register to point to the top of the UnknownSizeFrame
// and use this as the base address for local variables with unknown size.
// Reserving a register is simpler than using fp/sp, as fp may point
// to different locations depending on various properties of the frame, and
// the value of sp may change at runtime.
//
// Typically, a vector is loaded using a base address and some index which
// the instruction will scale by VL, for example: `ldr z0, [x19, #3 MUL VL]`.
// A mask is loaded with `ldr p0, [x19, #3 MUL VL]`, but in this case the
// `MUL VL` indicates we are scaling with the length of the predicate
// register rather than the vector. A predicate register is defined to have
// 1/8th the length of a vector register.
//
// We know that sizeof(TYP_SIMD) and sizeof(TYP_MASK) are invariant despite
// being unknown at compile time, so we allocate them in single homogeneous
// blocks per type. An individual local can be referenced from the start of
// its block by an index into the block.
//
// The difference in addressing-mode index scaling means we have to be
// careful where we place the mask locals block with respect to the vector
// locals block. If we place the mask locals after the vector locals, we'll
// need to offset the load index by (8 * nVector) to account for the vector
// locals.
//
// Instead, we choose to pad the mask locals block to VL and place it at the
// beginning of the frame (closest to fp). This way we'll need to offset
// vector load indices by `roundUp(nMask, 8) / 8`. This is less likely to
// put pressure on the immediate encoding range and result in requiring an
// address computation.
//
// The maximum wasted space from the padding is 7/8ths VL (224 bytes with
// the architectural maximum 256 byte vectors), which occurs when 1 mask
// local is spilled to the frame. Alternatively this is 28 bytes for 32 byte
// vectors, for an example closer to today's implementations.
//
// The padding also makes it simple to allocate the UnknownSizeFrame since
// the UnknownSizeFrame will be aligned to VL. The total number of vectors
// to allocate is `(roundUp(nMask, 8) / 8) + nVector`. The frame is allocated
// by lowering the stack pointer, which takes a single instruction
// `addvl sp, sp, #-totalVectors` when totalVectors fits the addvl immediate.
//
// See the diagram below for a visual representation of this scheme.
//
// ...
// | static space |
// | (totalFrameSize) |
// +----------------------------------+ x19, begin UnknownSizeFrame
// | mask locals block | ^
// | (nMask * VL/8) | |
// +----------------------------------+ |
// | padding to VL alignment | |
// +----------------------------------+ (roundUp(nMask, 8)/8 + nVector)*VL
// | | |
// | vector locals block | |
// | (nVector * VL) | |
// | | v
// +----------------------------------+ end UnknownSizeFrame
// | |
// | rest of alloca space |
// ... sp
struct UnknownSizeFrame
{
    // Running counts of the vector and mask slots handed out so far. Each
    // block is bump-allocated, so a count doubles as the next free index in
    // its block.
    unsigned nVector = 0;
    unsigned nMask   = 0;

#ifdef DEBUG
    // Set by Finalize(). Allocation asserts this is clear; offset queries
    // assert it is set.
    bool isFinalized = false;
#endif

    // Size of the mask block expressed in vector lengths: the mask count is
    // rounded up to a multiple of 8 and divided by 8.
    unsigned MaskBlockSizeInVectors()
    {
        assert(roundUp(0U, 8U) == 0);
        return roundUp(nMask, 8) / 8;
    }

    // Size of the vector block expressed in vector lengths.
    unsigned VectorBlockSize()
    {
        return nVector;
    }

    // Size of the whole UnknownSizeFrame expressed in vector lengths: the
    // mask block plus the vector block.
    unsigned FrameSizeInVectors()
    {
        return VectorBlockSize() + MaskBlockSizeInVectors();
    }

    // Hands out the next mask slot and returns its index within the mask
    // block. Must not be called after Finalize().
    unsigned AllocMask()
    {
        assert(!isFinalized);
        return nMask++;
    }

    // Hands out the next vector slot and returns its index within the
    // vector block. Must not be called after Finalize().
    unsigned AllocVector()
    {
        assert(!isFinalized);
        return nVector++;
    }

    // Translates a slot index into a negative offset from the base of the
    // UnknownSizeFrame.
    //
    // For a mask (`isMask == true`) the index must come from AllocMask() and
    // the result is measured in units of VL/8. For a vector the index must
    // come from AllocVector(), the result is measured in units of VL, and it
    // is moved past the mask block.
    int GetOffset(unsigned index, bool isMask = false)
    {
        // Addresses are only meaningful once every slot has been allocated.
        assert(isFinalized);

        unsigned slot = UINT32_MAX;
        if (!isMask)
        {
            assert(index < nVector);
            slot = MaskBlockSizeInVectors() + index;
        }
        else
        {
            assert(index < nMask);
            slot = index;
        }
        assert(slot != UINT32_MAX);

        // Slots are laid out downwards from the frame base, so slot 0 sits at
        // offset -1, slot 1 at offset -2, and so on.
        return -static_cast<int>(slot + 1);
    }

    // Computes the addressing offset of a local that lives on the
    // UnknownSizeFrame, relative to the frame's base address. The result is
    // suitable for addvl/addpl for TYP_SIMD/TYP_MASK respectively and still
    // needs scaling by VL/PL to become an absolute address.
    int GetAddressingOffset(LclVarDsc* varDsc)
    {
        const bool isMaskLocal = varDsc->TypeIs(TYP_MASK);
        return GetOffset(varDsc->GetUnknownSizeFrameIndex(), isMaskLocal);
    }

    // Marks allocation as complete. In DEBUG builds this enables the checks
    // that stop an address being generated before all slots are allocated.
    void Finalize()
    {
#ifdef DEBUG
        isFinalized = true;
#endif
    }

} unkSizeFrame;
#endif

//------------------------ For splitting types ----------------------------

void lvaInitTypeRef();
Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/jit/compiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2721,6 +2721,7 @@ inline
bool fConservative = false;
if (varNum >= 0)
{
assert(!lvaIsUnknownSizeLocal(varNum));
LclVarDsc* varDsc = lvaGetDesc(varNum);
bool isPrespilledArg = false;
#if defined(TARGET_ARM) && defined(PROFILING_SUPPORTED)
Expand Down Expand Up @@ -2779,6 +2780,7 @@ inline
tmpDsc = codeGen->regSet.tmpFindNum(varNum, RegSet::TEMP_USAGE_USED);
}
assert(tmpDsc != nullptr);
assert(!varTypeHasUnknownSize(tmpDsc->tdTempType()));
varOffset = tmpDsc->tdTempOffs();
}
else
Expand Down
5 changes: 5 additions & 0 deletions src/coreclr/jit/emit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6999,6 +6999,11 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
continue;
}

if (m_compiler->lvaIsUnknownSizeLocal(num))
{
continue;
}

#if FEATURE_FIXED_OUT_ARGS
if (num == m_compiler->lvaOutgoingArgSpaceVar)
{
Expand Down
Loading
Loading