mirror of https://github.com/VSadov/Satori.git synced 2025-06-08 03:27:04 +09:00

Arm64 tweaks (#36)

* tweaks

* Gen2Target

* trimmer pause

* style and comment fixes
Vladimir Sadov 2025-01-07 23:40:40 -08:00 committed by vsadov
parent 9755128719
commit 6dd178f834
12 changed files with 172 additions and 99 deletions

View file

@ -204,7 +204,7 @@ typedef DWORD (WINAPI *PTHREAD_START_ROUTINE)(void* lpThreadParameter);
#endif // defined(__i386__) || defined(__x86_64__)
#if defined(__arm__) || defined(__aarch64__)
#define YieldProcessor() asm volatile ("yield")
#define YieldProcessor() asm volatile ("dmb ishst\n" "yield")
#define MemoryBarrier __sync_synchronize
#endif // __arm__ || __aarch64__
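
Editor's note: the arm64 YieldProcessor now emits a store barrier (dmb ishst, ordering this core's prior stores) before the yield hint. A minimal sketch of the kind of spin-wait the macro is used in; the flag and loop are illustrative, not from this commit.

#include <atomic>

// Same definition as in the diff (arm/arm64 path).
#define YieldProcessor() asm volatile ("dmb ishst\n" "yield")

std::atomic<bool> g_done{false};   // hypothetical shared flag

void SpinUntilDone()
{
    while (!g_done.load(std::memory_order_acquire))
    {
        // dmb ishst orders this core's earlier stores ahead of later ones;
        // yield hints the CPU (or SMT sibling) that we are just spinning.
        YieldProcessor();
    }
}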

View file

@ -154,7 +154,9 @@ public:
BOOL_CONFIG (UseTHP, "gcTHP", NULL, true, "Specifies whether Transparent Huge Pages can be used. (Linux only)") \
BOOL_CONFIG (TrimmigGC, "gcTrim", NULL, true, "Specifies whether background trimming is enabled") \
INT_CONFIG (GCRate, "gcRate", NULL, -1, "Specifies soft min limit for time between GCs in milliseconds. -1 - default") \
INT_CONFIG (GCSpin, "gcSpin", NULL, -1, "Spin") \
INT_CONFIG (GCSpin, "gcSpin", NULL, -1, "Spin") \
INT_CONFIG (Gen2Target, "gcGen2Target", NULL, 200, "Specifies target for Gen2 GC (in terms of % of the last known size)") \
INT_CONFIG (Gen1Target, "gcGen1Target", NULL, 400, "Specifies target for Gen1 GC (in terms of % of the last known size)") \
// This class is responsible for retrieving configuration information
// for how the GC should operate.
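
Editor's note: both new knobs are percentages of the last known size. A short worked example with the defaults from the table above (heap sizes are illustrative; the consumers are in SatoriRecycler below).

size_t lastKnownHeap = 512u << 20;                 // 512 MB after the last gen2 GC
size_t gen2TriggerAt = lastKnownHeap * 200 / 100;  // 1 GB: gen2 when the heap doubles
size_t lastEphemeral = 64u << 20;                  // 64 MB ephemeral occupancy
size_t gen1TriggerAt = lastEphemeral * 400 / 100;  // 256 MB: gen1 when it quadruples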

View file

@ -41,6 +41,7 @@ bool SatoriLock::EnterSlow(bool noBlock)
// We will count when we failed to change the state of the lock and increase pauses
// so that bursts of activity are better tolerated. This should not happen often.
// after waking up we restart collision and iteration counters.
int collisions = 0;
// We will track the changes of ownership while we are trying to acquire the lock.
@ -145,7 +146,7 @@ bool SatoriLock::EnterSlow(bool noBlock)
// Increment the waiter count.
// Note that we do not do any overflow checking on this increment. In order to overflow,
// we'd need to have about 1 billion waiting threads, which is inconceivable anytime in the
// forseeable future.
// foreseeable future.
uint32_t newState = oldState + WaiterCountIncrement;
if (hasWaited)
newState = (newState - WaiterCountIncrement) & ~WaiterWoken;
@ -153,6 +154,11 @@ bool SatoriLock::EnterSlow(bool noBlock)
if (Interlocked::CompareExchange(&_state, newState, oldState) == oldState)
break;
}
else
{
// We are over the iteration limit, but the lock was open, we tried and failed.
// It was a collision.
}
CollisionBackoff(++collisions);
}

View file

@ -73,7 +73,7 @@ private:
return _InterlockedCompareExchange_acq((long*)destination, exchange, comparand) == (long)comparand;
#endif
#else
return __atomic_compare_exchange_n(destination, &comparand, exchange, true, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
return __atomic_compare_exchange_n(destination, &comparand, exchange, false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
#endif
}
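
Editor's note: the fourth argument of __atomic_compare_exchange_n flips from weak to strong. A weak CAS may fail spuriously even when the comparand matches (it maps to an LL/SC pair on arm64 without LSE), so it is only correct inside a retry loop; the strong form retries internally, and a failure really means the value differed. A side-by-side sketch, with illustrative helper names:

#include <stdint.h>

// Strong CAS: returns false only if *dest really held a different value.
static inline bool CasStrongAcq(int64_t* dest, int64_t exchange, int64_t comparand)
{
    return __atomic_compare_exchange_n(dest, &comparand, exchange,
                                       /*weak*/ false,
                                       __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}

// Weak CAS: may fail spuriously on LL/SC targets, so it is only appropriate
// when the caller loops anyway, not for a one-shot "try once" use.
static inline bool CasWeakAcq(int64_t* dest, int64_t exchange, int64_t comparand)
{
    return __atomic_compare_exchange_n(dest, &comparand, exchange,
                                       /*weak*/ true,
                                       __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}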
@ -91,36 +91,6 @@ private:
#endif
}
FORCEINLINE
static int64_t GetCheapTimeStamp()
{
#if defined(TARGET_AMD64)
#ifdef _MSC_VER
return __rdtsc();
#else
ptrdiff_t cycles;
ptrdiff_t cyclesHi;
__asm__ __volatile__
("rdtsc":"=a" (cycles), "=d" (cyclesHi));
return (cyclesHi << 32) | cycles;
#endif
#elif defined(TARGET_ARM64)
// On arm64 just read timer register instead
#ifdef _MSC_VER
#define ARM64_CNTVCT_EL0 ARM64_SYSREG(3,3,14,0,2)
return _ReadStatusReg(ARM64_CNTVCT_EL0);
#elif defined(TARGET_LINUX) || defined(TARGET_OSX)
int64_t timerTicks;
asm volatile("mrs %0, cntvct_el0" : "=r"(timerTicks));
return timerTicks;
#else
Unsupported platform?
#endif
#else
Unsupported architecture?
#endif
}
static const uint16_t SpinCountNotInitialized = INT16_MIN;
// While spinning is parameterized in terms of iterations,
@ -137,8 +107,8 @@ private:
// the exponential backoff will generally be no more than 2X worse than the perfect guess and
// will make far fewer attempts than a simple retry. On a multiprocessor machine fruitless attempts
// will cause unnecessary sharing of the contended state which may make modifying the state more expensive.
// To protect against degenerate cases we will cap the per-iteration wait to 1024 spinwaits.
static const uint32_t MaxExponentialBackoffBits = 10;
// To protect against degenerate cases we will cap the per-iteration wait to a few thousand spinwaits.
static const uint32_t MaxExponentialBackoffBits = 12;
// This lock is unfair and permits acquiring a contended lock by a nonwaiter in the presence of waiters.
// It is possible for one thread to keep holding the lock long enough that waiters go to sleep and
@ -220,9 +190,16 @@ public:
{
_ASSERTE(collisions > 0);
// no need for much randomness here, we will just hash the stack location and a timestamp.
uint32_t rand = ((uint32_t)(size_t)&collisions + (uint32_t)GetCheapTimeStamp()) * 2654435769u;
uint32_t spins = rand >> (uint8_t)((uint32_t)32 - min(collisions, MaxExponentialBackoffBits));
collisions = min(collisions, MaxExponentialBackoffBits);
// we will back off for some random number of iterations that roughly grows as 2^collisions
// no need for much randomness here, randomness is "good to have", we could do without it,
// so we will just hash in the stack location.
uint32_t rand = (uint32_t)(size_t)&collisions * 2654435769u;
// set the topmost bit to ensure the minimum number of spins is exponentially increasing
// it basically guarantees that we spin at least 1, 2, 4, 8, 16 times, and so on
rand |= (1u << 31);
uint32_t spins = rand >> (uint8_t)(32 - collisions);
for (int i = 0; i < (int)spins; i++)
{
YieldProcessor();
@ -236,12 +213,12 @@ private:
return (uint16_t)GCToOSInterface::GetLowPrecisionTimeStamp();
}
// same idea as in CollisionBackoff, but with guaranteed minimum wait
// same idea as in CollisionBackoff, but with expected small range
static void IterationBackoff(int iteration)
{
_ASSERTE(iteration > 0 && iteration < MaxExponentialBackoffBits);
uint32_t rand = ((uint32_t)(size_t)&iteration + (uint32_t)GetCheapTimeStamp()) * 2654435769u;
uint32_t rand = (uint32_t)(size_t)&iteration * 2654435769u;
// set the topmost bit to ensure the minimum number of spins is exponentially increasing
// it basically guarantees that we spin at least 1, 2, 4, 8, 16 times, and so on
rand |= (1u << 31);
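
Editor's note: the reworked backoff no longer reads a timestamp; it hashes a stack address, forces the top bit, and shifts by 32 - collisions, which yields a randomized spin count in roughly [2^(n-1), 2^n). A standalone sketch of that math (the cap mirrors MaxExponentialBackoffBits = 12 from the diff; the helper itself is illustrative). CollisionBackoff then executes that many YieldProcessor() calls.

#include <stdint.h>
#include <algorithm>

// Returns a randomized spin count in roughly [2^(n-1), 2^n) for n collisions,
// capped at 12 bits (i.e. at most a few thousand spin-waits).
static uint32_t BackoffSpins(int collisions)
{
    const int maxBits = 12;
    int n = std::min(collisions, maxBits);
    // Cheap randomness: hash a stack address with a Fibonacci-hash multiplier.
    uint32_t rand = (uint32_t)(uintptr_t)&collisions * 2654435769u;
    // Force the top bit so the shift below never yields fewer than 2^(n-1) spins:
    // n=1 -> 1, n=2 -> 2..3, n=3 -> 4..7, and so on.
    rand |= (1u << 31);
    return rand >> (uint8_t)(32 - n);
}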

View file

@ -788,7 +788,8 @@ tryAgain:
!m_concurrentCardsDone ||
m_ccStackMarkState != CC_MARK_STATE_DONE ||
m_concurrentCleaningState == CC_CLEAN_STATE_CLEANING ||
m_condemnedGeneration == 0;
m_condemnedGeneration == 0 ||
!m_workList->IsEmpty();
int64_t start = GCToOSInterface::QueryPerformanceCounter();
if (moreWork)
@ -796,15 +797,20 @@ tryAgain:
HelpOnceCore(/*minQuantum*/ false);
m_noWorkSince = GCToOSInterface::QueryPerformanceCounter();
// if we did not use all the quantum,
// consume it here in Low Latency mode for pacing reasons.
// if we did not use all the quantum in Low Latency mode,
// consume what roughly remains for pacing reasons.
if (IsLowLatencyMode())
{
int64_t deadline = start + HelpQuantum();
int64_t deadline = start + HelpQuantum() / 2;
int iters = 1;
while (GCToOSInterface::QueryPerformanceCounter() < deadline &&
m_ccStackMarkState != CC_MARK_STATE_SUSPENDING_EE)
{
YieldProcessor();
iters *= 2;
for (int i = 0; i < iters; i++)
{
YieldProcessor();
}
}
}
}
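
Editor's note: the pacing loop now doubles the number of YieldProcessor() calls between clock checks, so QueryPerformanceCounter is consulted exponentially less often while still respecting the (halved) deadline. A minimal sketch of that polling shape; now() and cpuPause() stand in for the real calls.

#include <stdint.h>

static void PaceUntil(int64_t deadline, int64_t (*now)(), void (*cpuPause)())
{
    int iters = 1;
    while (now() < deadline)
    {
        for (int i = 0; i < iters; i++)
            cpuPause();          // cheap pause: no syscall, no clock read
        iters *= 2;              // check the clock half as often on each pass
    }
}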
@ -940,18 +946,16 @@ bool SatoriRecycler::IsBlockingPhase()
}
//TUNING: We use a very simplistic approach for GC triggering here.
// By default:
// SatoriUtil::Gen2Target() triggers Gen2 GC when heap size doubles.
// SatoriUtil::Gen1Target() triggers Gen1 GC when ephemeral size quadruples.
//
// There could be a lot of room to improve in this area:
// - could consider current CPU/memory load and adjust accordingly
// - could collect and use past history of the program behavior
// - could consider user input as to favor latency or throughput
// - ??
// we target 1/EPH_SURV_TARGET ephemeral survival rate
#define EPH_SURV_TARGET 4
// do gen2 when total doubles
#define GEN2_SURV_TARGET 2
void SatoriRecycler::MaybeTriggerGC(gc_reason reason)
{
int generation = 0;
@ -969,7 +973,7 @@ void SatoriRecycler::MaybeTriggerGC(gc_reason reason)
// gen2 allocations are rare and do not count towards gen1 budget since gen1 will not help with that
// they can be big though, so check if by any chance gen2 allocs alone pushed us over the limit
if (m_gen2AddedSinceLastCollection * GEN2_SURV_TARGET > m_totalLimit)
if (m_gen2AddedSinceLastCollection * SatoriUtil::Gen2Target() / 100 > m_totalLimit)
{
generation = 2;
}
@ -1014,11 +1018,11 @@ void SatoriRecycler::AdjustHeuristics()
if (m_prevCondemnedGeneration == 2)
{
m_totalLimit = occupancy * GEN2_SURV_TARGET;
m_totalLimit = occupancy * SatoriUtil::Gen2Target() / 100;
}
// we look for 1 / EPH_SURV_TARGET ephemeral survivorship, thus budget is the diff
size_t newGen1Budget = max(MIN_GEN1_BUDGET, ephemeralOccupancy * (EPH_SURV_TARGET - 1));
// we trigger GC when ephemeral size grows to SatoriUtil::Gen1Target(), thus budget is the diff
size_t newGen1Budget = max(MIN_GEN1_BUDGET, ephemeralOccupancy * (SatoriUtil::Gen2Target() - 100) / 100);
// alternatively we allow gen1 allocs up to 1/8 of total limit.
size_t altNewGen1Budget = max(MIN_GEN1_BUDGET, m_totalLimit / 8);
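
Editor's note: with the defaults this preserves the old behavior: a 200% gen2 target reproduces GEN2_SURV_TARGET = 2 (collect gen2 when total occupancy doubles), and a 400% gen1 target matches the old 1/EPH_SURV_TARGET = 1/4 survival goal (budget is three times the ephemeral occupancy). A sketch of the limit math following the comments above; names and the budget floor value are illustrative.

#include <stddef.h>
#include <algorithm>

static const size_t MIN_GEN1_BUDGET_SKETCH = 8u << 20;   // placeholder floor

// Next gen2 GC triggers when total occupancy reaches targetPct percent of the
// occupancy measured after the last gen2 GC (200 -> when the heap doubles).
static size_t TotalLimit(size_t occupancy, int targetPct)
{
    return occupancy * (size_t)targetPct / 100;
}

// Gen1 budget: let the ephemeral generations grow by (targetPct - 100) percent
// before the next gen1 GC (400 -> budget is 3x the last ephemeral occupancy).
static size_t Gen1Budget(size_t ephemeralOccupancy, int targetPct)
{
    return std::max(MIN_GEN1_BUDGET_SKETCH,
                    ephemeralOccupancy * (size_t)(targetPct - 100) / 100);
}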
@ -4359,7 +4363,7 @@ bool SatoriRecycler::DrainDeferredSweepQueueConcurrent(int64_t deadline)
}
YieldProcessor();
if ((++cycles % 127) == 0)
if ((++cycles & 127) == 0)
{
GCToOSInterface::YieldThread(0);
}
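
Editor's note: the modulus test fired every 127 iterations and needs a division; the mask form fires on every 128th pass (when the low seven bits are all zero) and compiles to a single AND. A sketch of the throttling idiom, with the three callbacks standing in for the calls shown above.

#include <stdint.h>

static void DrainWithThrottledYields(bool (*workPending)(), void (*cpuPause)(), void (*osYield)())
{
    uint32_t cycles = 0;
    while (workPending())
    {
        cpuPause();
        // True on every 128th pass; unlike "% 127" it needs no division and
        // the period is an exact power of two.
        if ((++cycles & 127) == 0)
            osYield();
    }
}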

View file

@ -211,7 +211,13 @@ SatoriRegion* SatoriRegionQueue::TryDequeueIfHasFreeSpaceInTopBucket()
SatoriRegionQueue* SatoriRegionQueue::AllocAligned(QueueKind kind)
{
const size_t align = 64;
const size_t align =
#if defined(TARGET_AMD64)
64;
#else
128;
#endif
#ifdef _MSC_VER
void* buffer = _aligned_malloc(sizeof(SatoriRegionQueue), align);
#else
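
Editor's note: some arm64 parts use 128-byte cache lines (or prefetch adjacent 64-byte lines in pairs), so a 64-byte-aligned queue header can still share a line pair with unrelated data; non-AMD64 targets therefore align to 128 bytes. A sketch of the same allocation pattern outside MSVC; the type name is illustrative.

#include <stdlib.h>
#include <new>

struct QueueHeaderSketch { /* hot, CAS-contended fields live here */ };

static QueueHeaderSketch* AllocAlignedSketch()
{
#if defined(__x86_64__) || defined(_M_X64)
    const size_t align = 64;     // x64: one cache line
#else
    const size_t align = 128;    // arm64: 128B lines / adjacent-line prefetch
#endif
    void* buffer = nullptr;
    if (posix_memalign(&buffer, align, sizeof(QueueHeaderSketch)) != 0)
        return nullptr;
    return new (buffer) QueueHeaderSketch();
}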

View file

@ -46,12 +46,26 @@ SatoriTrimmer::SatoriTrimmer(SatoriHeap* heap)
m_event = new (nothrow) GCEvent;
m_event->CreateAutoEventNoThrow(false);
m_sleepGate = new (nothrow) SatoriGate();
if (SatoriUtil::IsTrimmingEnabled())
{
GCToEEInterface::CreateThread(LoopFn, this, false, "Satori GC Trimmer Thread");
}
}
void SatoriTrimmer::Pause(int msec)
{
m_paused = true;
m_sleepGate->TimedWait(msec);
m_paused = false;
}
void SatoriTrimmer::Unpause()
{
m_sleepGate->WakeOne();
}
void SatoriTrimmer::LoopFn(void* inst)
{
SatoriTrimmer* ths = (SatoriTrimmer*)inst;
@ -76,7 +90,7 @@ void SatoriTrimmer::Loop()
Interlocked::CompareExchange(&m_state, TRIMMER_STATE_STOPPED, TRIMMER_STATE_RUNNING);
// we are not running here, so we can sleep a bit before continuing.
GCToOSInterface::Sleep(5000);
Pause(5000);
StopAndWait();
}
@ -84,7 +98,7 @@ void SatoriTrimmer::Loop()
[&](SatoriPage* page)
{
// limit the rate of scanning to 1 page/msec.
GCToOSInterface::Sleep(1);
Pause(1);
if (m_state != TRIMMER_STATE_RUNNING)
{
StopAndWait();
@ -116,22 +130,23 @@ void SatoriTrimmer::Loop()
if (didSomeWork)
{
// limit the decommit/coalesce rate to 1 region/10 msec.
GCToOSInterface::Sleep(10);
Pause(10);
if (m_state != TRIMMER_STATE_RUNNING)
{
StopAndWait();
}
}
}
}
}
// this is a low priority task, if something needs to run, yield
GCToOSInterface::YieldThread(0);
// also we will pause for 1 sec if there was a GC - to further reduce the churn
// if the app is allocation-active.
int64_t newGen1 = m_heap->Recycler()->GetCollectionCount(1);
if (newGen1 != lastGen1)
{
lastGen1 = newGen1;
GCToOSInterface::Sleep(1000);
Pause(1000);
}
if (m_state != TRIMMER_STATE_RUNNING)
@ -151,8 +166,6 @@ void SatoriTrimmer::StopAndWait()
{
tryAgain:
// this is a low priority task, if something needs to run, yield
GCToOSInterface::YieldThread(0);
int state = m_state;
switch (state)
{
@ -165,7 +178,7 @@ void SatoriTrimmer::StopAndWait()
case TRIMMER_STATE_STOPPED:
for (int i = 0; i < 10; i++)
{
GCToOSInterface::Sleep(100);
Pause(100);
if (m_state != state)
{
goto tryAgain;
@ -215,17 +228,20 @@ void SatoriTrimmer::SetStopSuggested()
case TRIMMER_STATE_OK_TO_RUN:
if (Interlocked::CompareExchange(&m_state, TRIMMER_STATE_STOPPED, state) == state)
{
Unpause();
return;
}
break;
case TRIMMER_STATE_RUNNING:
if (Interlocked::CompareExchange(&m_state, TRIMMER_STATE_STOP_SUGGESTED, state) == state)
{
Unpause();
return;
}
break;
default:
_ASSERTE(m_state <= TRIMMER_STATE_STOP_SUGGESTED);
Unpause();
return;
}
}
@ -238,8 +254,13 @@ void SatoriTrimmer::WaitForStop()
int cycles = 0;
while (m_state == TRIMMER_STATE_STOP_SUGGESTED)
{
if (m_paused)
{
Unpause();
}
YieldProcessor();
if ((++cycles % 127) == 0)
if ((++cycles & 127) == 0)
{
GCToOSInterface::YieldThread(0);
}
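
Editor's note: the trimmer used to sleep in fixed GCToOSInterface::Sleep chunks, so SetStopSuggested could have to wait out a multi-second nap; Pause(msec) now blocks on a SatoriGate and Unpause() cuts the wait short, with m_paused telling WaitForStop whether a wake is needed. A minimal sketch of that interruptible-sleep shape, using std::condition_variable as a stand-in for SatoriGate.

#include <chrono>
#include <condition_variable>
#include <mutex>

class PauseGate
{
    std::mutex m_mutex;
    std::condition_variable m_cv;
    bool m_signaled = false;

public:
    // Sleep for up to msec milliseconds, or until WakeOne() is called.
    void TimedWait(int msec)
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        m_cv.wait_for(lock, std::chrono::milliseconds(msec),
                      [this] { return m_signaled; });
        m_signaled = false;
    }

    // Cut a pending TimedWait short.
    void WakeOne()
    {
        { std::lock_guard<std::mutex> lock(m_mutex); m_signaled = true; }
        m_cv.notify_one();
    }
};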

View file

@ -52,12 +52,16 @@ private:
SatoriHeap* m_heap;
GCEvent* m_event;
SatoriGate* m_sleepGate;
size_t m_lastGen2Count;
volatile int m_state;
volatile bool m_paused;
static void LoopFn(void* inst);
void Loop();
void StopAndWait();
void Pause(int msec);
void Unpause();
};
#endif

View file

@ -285,6 +285,34 @@ public:
return gcSpin;
}
// DOTNET_gcGen2Target
static int Gen2Target()
{
int target = (int)GCConfig::GetGen2Target();
if (target < 100)
{
// target must be > 100%
// if we see less, just default to triggering GC when the heap doubles
target = 200;
}
return target;
}
// DOTNET_gcGen1Target
static int Gen1Target()
{
int target = (int)GCConfig::GetGen1Target();
if (target < 100)
{
// target must be > 100%
// if we see less, just default to triggering GC when the ephemeral heap quadruples
target = 400;
}
return target;
}
static size_t CommitGranularity()
{
// we can support sizes that are > OS page and binary fractions of REGION_SIZE_GRANULARITY.
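
Editor's note: both accessors clamp bad input back to the defaults rather than honoring a value that would trigger a GC on any growth at all. A small usage example of the resulting behavior; the helper and values are illustrative.

#include <assert.h>

// Illustrative clamp matching the accessors above.
static int ClampTargetPct(int configured, int fallback)
{
    return (configured < 100) ? fallback : configured;   // targets must exceed 100%
}

int main()
{
    assert(ClampTargetPct(150, 200) == 150);  // DOTNET_gcGen2Target=150: gen2 when the heap grows 1.5x
    assert(ClampTargetPct(50,  200) == 200);  // below 100%: fall back to "heap doubles"
    assert(ClampTargetPct(90,  400) == 400);  // invalid gen1 target: "ephemeral quadruples"
    return 0;
}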

View file

@ -34,11 +34,16 @@ NOINLINE
void SatoriWorkList::PushSlow(SatoriWorkChunk* item)
{
uint32_t collisions = 1;
SatoriWorkChunk* head;
while (true)
{
SatoriWorkList orig = *this;
item->m_next = orig.m_head;
if (Cas128((int64_t*)this, orig.m_aba + 1, (int64_t)item, (int64_t*)&orig))
head = this->m_head;
size_t aba = this->m_aba;
item->m_next = head;
SatoriWorkList orig(head, aba);
if (Cas128((int64_t*)this, aba + 1, (int64_t)item, (int64_t*)&orig))
break;
SatoriLock::CollisionBackoff(collisions++);
@ -53,16 +58,19 @@ NOINLINE
SatoriWorkChunk* SatoriWorkList::TryPopSlow()
{
uint32_t collisions = 1;
SatoriWorkList orig;
SatoriWorkChunk* head;
while (true)
{
orig = *this;
if (orig.m_head == nullptr)
head = this->m_head;
size_t aba = this->m_aba;
if (head == nullptr)
{
return nullptr;
}
if (Cas128((int64_t*)this, orig.m_aba + 1, (int64_t)orig.m_head->m_next, (int64_t*)&orig))
SatoriWorkList orig(head, aba);
if (Cas128((int64_t*)this, aba + 1, (int64_t)head->m_next, (int64_t*)&orig))
break;
SatoriLock::CollisionBackoff(collisions++);
@ -72,7 +80,6 @@ SatoriWorkChunk* SatoriWorkList::TryPopSlow()
Interlocked::Decrement(&m_count);
#endif
SatoriWorkChunk* result = orig.m_head;
result->m_next = nullptr;
return result;
head->m_next = nullptr;
return head;
}
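
Editor's note: the slow paths now read m_head and m_aba as individual fields and build the comparand explicitly instead of copying the whole 16-byte object; a stale or even torn pair is harmless because the 128-bit CAS simply fails and the loop retries. A standalone sketch of the push pattern on an ABA-counted stack using a 128-bit CAS; the types mirror the diff but the snippet is illustrative.

#include <stdint.h>
#include <stddef.h>

struct ChunkSketch { ChunkSketch* m_next = nullptr; };

// Strong 16-byte CAS, same shape as the non-Windows Cas128 in the diff
// (clang/gcc builtin; needs -mcx16 on x86-64).
static inline uint8_t Cas128Sketch(int64_t volatile* pDst, int64_t hi, int64_t lo, int64_t* pCmp)
{
    __int128_t desired = ((__int128_t)hi << 64) + (uint64_t)lo;
    return __sync_bool_compare_and_swap_16((__int128_t*)pDst, *(__int128_t*)pCmp, desired);
}

// Head of an ABA-counted lock-free stack: pointer in the low word, counter in the high word.
struct alignas(16) WorkListSketch
{
    ChunkSketch* m_head = nullptr;   // low 64 bits
    size_t       m_aba  = 0;         // high 64 bits, bumped on every successful CAS

    void Push(ChunkSketch* item)
    {
        while (true)
        {
            // Read the fields individually; a stale pair only makes the CAS fail.
            ChunkSketch* head = m_head;
            size_t       aba  = m_aba;
            item->m_next = head;
            int64_t cmp[2] = { (int64_t)head, (int64_t)aba };   // expected (low, high)
            if (Cas128Sketch((int64_t*)this, (int64_t)(aba + 1), (int64_t)item, cmp))
                break;                       // item published as the new head
            // else another thread won the race; retry (the real code backs off here)
        }
    }
};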

View file

@ -33,34 +33,46 @@
#if defined(TARGET_WINDOWS)
FORCEINLINE uint8_t Cas128(int64_t volatile *pDst, int64_t iValueHigh, int64_t iValueLow, int64_t *pComparandAndResult)
FORCEINLINE uint8_t Cas128(int64_t volatile *pDst, int64_t iValueHigh, int64_t iValueLow, int64_t *pComparand)
{
return _InterlockedCompareExchange128(pDst, iValueHigh, iValueLow, pComparandAndResult);
return _InterlockedCompareExchange128(pDst, iValueHigh, iValueLow, pComparand);
}
#else
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Watomic-alignment"
FORCEINLINE uint8_t Cas128(int64_t volatile *pDst, int64_t iValueHigh, int64_t iValueLow, int64_t *pComparandAndResult)
FORCEINLINE uint8_t Cas128(int64_t volatile *pDst, int64_t iValueHigh, int64_t iValueLow, int64_t *pComparand)
{
__int128_t iValue = ((__int128_t)iValueHigh << 64) + (uint64_t)iValueLow;
return __atomic_compare_exchange_n ((__int128_t*)pDst, (__int128_t*)pComparandAndResult, iValue, /*weak*/ true, /* success_memorder */ __ATOMIC_SEQ_CST, /* failure_memorder */ __ATOMIC_RELAXED);
return __sync_bool_compare_and_swap_16((__int128_t*)pDst, *(__int128_t*)pComparand, iValue);
}
#pragma clang diagnostic pop
#endif // HOST_AMD64
class SatoriWorkList
class DECLSPEC_ALIGN(128) SatoriWorkList
{
public:
SatoriWorkList() :
m_head(), m_aba()
#ifdef _DEBUG
, m_count()
#endif
SatoriWorkList(nullptr, 0)
{}
SatoriWorkList(SatoriWorkChunk* head, size_t aba)
{
m_head = head;
m_aba = aba;
#ifdef _DEBUG
m_count = 0;
#endif
}
static SatoriWorkList* AllocAligned()
{
const size_t align = 64;
const size_t align =
#if defined(TARGET_AMD64)
64;
#else
128;
#endif
#ifdef _MSC_VER
void* buffer = _aligned_malloc(sizeof(SatoriWorkList), align);
#else
@ -80,9 +92,13 @@ public:
{
_ASSERTE(item->m_next == nullptr);
SatoriWorkList orig = *this;
item->m_next = orig.m_head;
if (Cas128((int64_t*)this, orig.m_aba + 1, (int64_t)item, (int64_t*)&orig))
SatoriWorkChunk* head = this->m_head;
size_t aba = this->m_aba;
item->m_next = head;
SatoriWorkList orig(head, aba);
if (Cas128((int64_t*)this, aba + 1, (int64_t)item, (int64_t*)&orig))
{
#ifdef _DEBUG
Interlocked::Increment(&m_count);
@ -96,20 +112,22 @@ public:
FORCEINLINE
SatoriWorkChunk* TryPop()
{
SatoriWorkList orig = *this;
if (orig.m_head == nullptr)
SatoriWorkChunk* head = this->m_head;
size_t aba = this->m_aba;
if (head == nullptr)
{
return nullptr;
}
if (Cas128((int64_t*)this, orig.m_aba + 1, (int64_t)orig.m_head->m_next, (int64_t*)&orig))
SatoriWorkList orig(head, aba);
if (Cas128((int64_t*)this, aba + 1, (int64_t)head->m_next, (int64_t*)&orig))
{
#ifdef _DEBUG
Interlocked::Decrement(&m_count);
#endif
SatoriWorkChunk* result = orig.m_head;
result->m_next = nullptr;
return result;
head->m_next = nullptr;
return head;
}
return TryPopSlow();

View file

@ -3700,7 +3700,7 @@ YieldProcessor()
"rep\n"
"nop");
#elif defined(HOST_ARM) || defined(HOST_ARM64)
__asm__ __volatile__( "yield");
__asm__ __volatile__( "dmb ishst\n yield");
#elif defined(HOST_LOONGARCH64)
__asm__ volatile( "dbar 0; \n");
#elif defined(HOST_RISCV64)