Mirror of https://github.com/VSadov/Satori.git, synced 2025-06-08 03:27:04 +09:00
Arm64 tweaks (#36)

* tweaks
* Gen2Target
* trimmer pause
* style and comment fixes

parent 9755128719, commit 6dd178f834

12 changed files with 172 additions and 99 deletions

src/coreclr/gc/env/gcenv.base.h (vendored), 2 changes:
@@ -204,7 +204,7 @@ typedef DWORD (WINAPI *PTHREAD_START_ROUTINE)(void* lpThreadParameter);
 #endif // defined(__i386__) || defined(__x86_64__)

 #if defined(__arm__) || defined(__aarch64__)
-#define YieldProcessor() asm volatile ("yield")
+#define YieldProcessor() asm volatile ("dmb ishst\n" "yield")
 #define MemoryBarrier __sync_synchronize
 #endif // __arm__ || __aarch64__

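For orientation, a minimal sketch (not part of this commit) of the kind of bounded spin-wait such a macro serves; only the arm64 "dmb ishst; yield" sequence comes from the hunk above, while the x86 fallback and the SpinUntilSet helper are illustrative assumptions:

#include <atomic>

// Illustrative macro mirroring the new arm64 definition above; the other
// branches are assumptions for the sake of a self-contained example.
#if defined(__arm__) || defined(__aarch64__)
#define YieldProcessor() asm volatile ("dmb ishst\n" "yield")
#elif defined(__i386__) || defined(__x86_64__)
#define YieldProcessor() asm volatile ("pause")
#else
#define YieldProcessor()
#endif

// Hypothetical helper: poll a flag, yielding the core between probes.
static bool SpinUntilSet(std::atomic<int>& flag, int maxSpins)
{
    for (int i = 0; i < maxSpins; i++)
    {
        if (flag.load(std::memory_order_acquire) != 0)
            return true;

        YieldProcessor();
    }
    return false;
}
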
@@ -154,7 +154,9 @@ public:
 BOOL_CONFIG (UseTHP, "gcTHP", NULL, true, "Specifies whether Transparent Huge Pages can be used. (Linux only)") \
 BOOL_CONFIG (TrimmigGC, "gcTrim", NULL, true, "Specifies whether background trimming is enabled") \
 INT_CONFIG (GCRate, "gcRate", NULL, -1, "Specifies soft min limit for time between GCs in milliseconds. -1 - default") \
 INT_CONFIG (GCSpin, "gcSpin", NULL, -1, "Spin") \
+INT_CONFIG (Gen2Target, "gcGen2Target", NULL, 200, "Specifies target for Gen2 GC (in terms of % of the last known size)") \
+INT_CONFIG (Gen1Target, "gcGen1Target", NULL, 400, "Specifies target for Gen1 GC (in terms of % of the last known size)") \

 // This class is responsible for retreiving configuration information
 // for how the GC should operate.

@@ -41,6 +41,7 @@ bool SatoriLock::EnterSlow(bool noBlock)

 // We will count when we failed to change the state of the lock and increase pauses
 // so that bursts of activity are better tolerated. This should not happen often.
+// after waking up we restart collision and iteration counters.
 int collisions = 0;

 // We will track the changes of ownership while we are trying to acquire the lock.

@@ -145,7 +146,7 @@ bool SatoriLock::EnterSlow(bool noBlock)
 // Increment the waiter count.
 // Note that we do not do any overflow checking on this increment. In order to overflow,
 // we'd need to have about 1 billion waiting threads, which is inconceivable anytime in the
-// forseeable future.
+// foreseeable future.
 uint32_t newState = oldState + WaiterCountIncrement;
 if (hasWaited)
 newState = (newState - WaiterCountIncrement) & ~WaiterWoken;

@@ -153,6 +154,11 @@ bool SatoriLock::EnterSlow(bool noBlock)
 if (Interlocked::CompareExchange(&_state, newState, oldState) == oldState)
 break;
 }
+else
+{
+// We are over the iteration limit, but the lock was open, we tried and failed.
+// It was a collision.
+}

 CollisionBackoff(++collisions);
 }

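The comments above outline the slow path: keep attempting the interlocked state change, treat every failed attempt as a collision, and grow the backoff with the collision count. A condensed, self-contained sketch of that control flow (SketchLock, TryAcquire and the trivial backoff body are stand-ins, not the real SatoriLock members):

#include <atomic>
#include <cstdint>

struct SketchLock
{
    std::atomic<uint32_t> _state{0};

    bool TryAcquire()
    {
        uint32_t expected = 0;
        return _state.compare_exchange_strong(expected, 1,
                                              std::memory_order_acquire,
                                              std::memory_order_relaxed);
    }

    static void CollisionBackoff(int collisions)
    {
        // placeholder body; the real backoff is randomized and exponential
        for (int i = 0; i < (1 << (collisions < 10 ? collisions : 10)); i++)
        {
            // spin
        }
    }

    // Count failed attempts and increase pauses so bursts of contention
    // are better tolerated, as the comments above describe.
    void EnterSlow()
    {
        int collisions = 0;
        while (!TryAcquire())
        {
            CollisionBackoff(++collisions);
        }
    }
};
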
@@ -73,7 +73,7 @@ private:
 return _InterlockedCompareExchange_acq((long*)destination, exchange, comparand) == (long)comparand;
 #endif
 #else
-return __atomic_compare_exchange_n(destination, &comparand, exchange, true, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
+return __atomic_compare_exchange_n(destination, &comparand, exchange, false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
 #endif
 }

@@ -91,36 +91,6 @@ private:
 #endif
 }

-FORCEINLINE
-static int64_t GetCheapTimeStamp()
-{
-#if defined(TARGET_AMD64)
-#ifdef _MSC_VER
-return __rdtsc();
-#else
-ptrdiff_t cycles;
-ptrdiff_t cyclesHi;
-__asm__ __volatile__
-("rdtsc":"=a" (cycles), "=d" (cyclesHi));
-return (cyclesHi << 32) | cycles;
-#endif
-#elif defined(TARGET_ARM64)
-// On arm64 just read timer register instead
-#ifdef _MSC_VER
-#define ARM64_CNTVCT_EL0 ARM64_SYSREG(3,3,14,0,2)
-return _ReadStatusReg(ARM64_CNTVCT_EL0);
-#elif defined(TARGET_LINUX) || defined(TARGET_OSX)
-int64_t timerTicks;
-asm volatile("mrs %0, cntvct_el0" : "=r"(timerTicks));
-return timerTicks;
-#else
-Unsupported platform?
-#endif
-#else
-Unsupported architecture?
-#endif
-}
-
 static const uint16_t SpinCountNotInitialized = INT16_MIN;

 // While spinning is parameterized in terms of iterations,

@@ -137,8 +107,8 @@ private:
 // the exponential backoff will generally be not more than 2X worse than the perfect guess and
 // will do a lot less attempts than an simple retry. On multiprocessor machine fruitless attempts
 // will cause unnecessary sharing of the contended state which may make modifying the state more expensive.
-// To protect against degenerate cases we will cap the per-iteration wait to 1024 spinwaits.
-static const uint32_t MaxExponentialBackoffBits = 10;
+// To protect against degenerate cases we will cap the per-iteration wait to a few thousand spinwaits.
+static const uint32_t MaxExponentialBackoffBits = 12;

 // This lock is unfair and permits acquiring a contended lock by a nonwaiter in the presence of waiters.
 // It is possible for one thread to keep holding the lock long enough that waiters go to sleep and

@@ -220,9 +190,16 @@ public:
 {
 _ASSERTE(collisions > 0);

-// no need for much randomness here, we will just hash the stack location and a timestamp.
-uint32_t rand = ((uint32_t)(size_t)&collisions + (uint32_t)GetCheapTimeStamp()) * 2654435769u;
-uint32_t spins = rand >> (uint8_t)((uint32_t)32 - min(collisions, MaxExponentialBackoffBits));
+collisions = min(collisions, MaxExponentialBackoffBits);
+// we will backoff for some random number of iterations that roughly grows as collisions^2
+// no need for much randomness here, randomness is "good to have", we could do without it,
+// so we will just hash in the stack location.
+uint32_t rand = (uint32_t)(size_t)&collisions * 2654435769u;
+// set the highmost bit to ensure minimum number of spins is exponentialy increasing
+// it basically guarantees that we spin at least 1, 2, 4, 8, 16, times, and so on
+rand |= (1u << 31);
+uint32_t spins = rand >> (uint8_t)(32 - collisions);

 for (int i = 0; i < (int)spins; i++)
 {
 YieldProcessor();

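The rewritten backoff drops the GetCheapTimeStamp() term, caps the exponent at MaxExponentialBackoffBits, hashes the stack address with the Fibonacci-hashing constant 2654435769, and forces the top bit so the spin count grows at least as 1, 2, 4, 8, ... with the collision count. A standalone sketch of the same arithmetic (CpuRelax stands in for YieldProcessor and is an assumption):

#include <algorithm>
#include <cstddef>
#include <cstdint>

static void CpuRelax() { /* stand-in for YieldProcessor() */ }

// Same arithmetic as the new CollisionBackoff above: cap the exponent,
// hash the stack location, force the top bit, then shift so the spin
// count lands in [2^(collisions-1), 2^collisions).
static void CollisionBackoffSketch(int collisions)
{
    const int MaxExponentialBackoffBits = 12;
    collisions = std::min(collisions, MaxExponentialBackoffBits);

    uint32_t rand = (uint32_t)(size_t)&collisions * 2654435769u;  // Fibonacci hashing
    rand |= (1u << 31);                                           // guarantees the minimum
    uint32_t spins = rand >> (uint8_t)(32 - collisions);

    for (uint32_t i = 0; i < spins; i++)
        CpuRelax();
}
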
@@ -236,12 +213,12 @@ private:
 return (uint16_t)GCToOSInterface::GetLowPrecisionTimeStamp();
 }

-// same idea as in CollisionBackoff, but with guaranteed minimum wait
+// same idea as in CollisionBackoff, but with expected small range
 static void IterationBackoff(int iteration)
 {
 _ASSERTE(iteration > 0 && iteration < MaxExponentialBackoffBits);

-uint32_t rand = ((uint32_t)(size_t)&iteration + (uint32_t)GetCheapTimeStamp()) * 2654435769u;
+uint32_t rand = (uint32_t)(size_t)&iteration * 2654435769u;
 // set the highmost bit to ensure minimum number of spins is exponentialy increasing
 // it basically guarantees that we spin at least 1, 2, 4, 8, 16, times, and so on
 rand |= (1u << 31);

@@ -788,7 +788,8 @@ tryAgain:
 !m_concurrentCardsDone ||
 m_ccStackMarkState != CC_MARK_STATE_DONE ||
 m_concurrentCleaningState == CC_CLEAN_STATE_CLEANING ||
-m_condemnedGeneration == 0;
+m_condemnedGeneration == 0 ||
+!m_workList->IsEmpty();

 int64_t start = GCToOSInterface::QueryPerformanceCounter();
 if (moreWork)

@@ -796,15 +797,20 @@ tryAgain:
 HelpOnceCore(/*minQuantum*/ false);
 m_noWorkSince = GCToOSInterface::QueryPerformanceCounter();

-// if we did not use all the quantum,
-// consume it here in Low Latency mode for pacing reasons.
+// if we did not use all the quantum in Low Latency mode,
+// consume what roughly remains for pacing reasons.
 if (IsLowLatencyMode())
 {
-int64_t deadline = start + HelpQuantum();
+int64_t deadline = start + HelpQuantum() / 2;
+int iters = 1;
 while (GCToOSInterface::QueryPerformanceCounter() < deadline &&
 m_ccStackMarkState != CC_MARK_STATE_SUSPENDING_EE)
 {
-YieldProcessor();
+iters *= 2;
+for (int i = 0; i < iters; i++)
+{
+YieldProcessor();
+}
 }
 }
 }

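The reworked pacing halves the deadline and doubles the number of YieldProcessor() calls between clock reads (iters *= 2), so the wait stays responsive early while reading the performance counter geometrically less often later; the real loop also bails out when the stack-mark state switches to CC_MARK_STATE_SUSPENDING_EE. A self-contained sketch of that pattern, using std::chrono in place of QueryPerformanceCounter:

#include <chrono>

static void CpuRelax() { /* stand-in for YieldProcessor() */ }

// Sketch of the pacing pattern above: burn the remaining quantum while
// checking the clock less and less often.
static void ConsumeRemainingQuantum(std::chrono::microseconds quantum)
{
    auto deadline = std::chrono::steady_clock::now() + quantum / 2;

    int iters = 1;
    while (std::chrono::steady_clock::now() < deadline)
    {
        iters *= 2;                       // geometric growth between clock reads
        for (int i = 0; i < iters; i++)
        {
            CpuRelax();
        }
    }
}
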
@@ -940,18 +946,16 @@ bool SatoriRecycler::IsBlockingPhase()
 }

 //TUNING: We use a very simplistic approach for GC triggering here.
+// By default:
+// SatoriUtil::Gen2Target() triggers Gen2 GC when heap size doubles.
+// SatoriUtil::Gen1Target() triggers Gen1 GC when ephemeral size quadruples.
+//
 // There could be a lot of room to improve in this area:
 // - could consider current CPU/memory load and adjust accordingly
 // - could collect and use past history of the program behavior
 // - could consider user input as to favor latency or throughput
 // - ??

-// we target 1/EPH_SURV_TARGET ephemeral survival rate
-#define EPH_SURV_TARGET 4
-
-// do gen2 when total doubles
-#define GEN2_SURV_TARGET 2
-
 void SatoriRecycler::MaybeTriggerGC(gc_reason reason)
 {
 int generation = 0;

@@ -969,7 +973,7 @@ void SatoriRecycler::MaybeTriggerGC(gc_reason reason)

 // gen2 allocations are rare and do not count towards gen1 budget since gen1 will not help with that
 // they can be big though, so check if by any chance gen2 allocs alone pushed us over the limit
-if (m_gen2AddedSinceLastCollection * GEN2_SURV_TARGET > m_totalLimit)
+if (m_gen2AddedSinceLastCollection * SatoriUtil::Gen2Target() / 100 > m_totalLimit)
 {
 generation = 2;
 }

@@ -1014,11 +1018,11 @@ void SatoriRecycler::AdjustHeuristics()

 if (m_prevCondemnedGeneration == 2)
 {
-m_totalLimit = occupancy * GEN2_SURV_TARGET;
+m_totalLimit = occupancy * SatoriUtil::Gen2Target() / 100;
 }

-// we look for 1 / EPH_SURV_TARGET ephemeral survivorship, thus budget is the diff
-size_t newGen1Budget = max(MIN_GEN1_BUDGET, ephemeralOccupancy * (EPH_SURV_TARGET - 1));
+// we trigger GC when ephemeral size grows to SatoriUtil::Gen1Target(), thus budget is the diff
+size_t newGen1Budget = max(MIN_GEN1_BUDGET, ephemeralOccupancy * (SatoriUtil::Gen2Target() - 100) / 100);

 // alternatively we allow gen1 allocs up to 1/8 of total limit.
 size_t altNewGen1Budget = max(MIN_GEN1_BUDGET, m_totalLimit / 8);

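To make the new knobs concrete, here is the arithmetic the comments describe, as a hedged sketch with hypothetical helper names and sample values: with the default gcGen2Target of 200, the total limit is twice the last known occupancy, and with gcGen1Target of 400 the gen1 budget is roughly three times the last ephemeral occupancy, so a collection is triggered once the ephemeral size has about quadrupled.

#include <algorithm>
#include <cstddef>

// Hypothetical helpers mirroring the targets described above.
// Targets are percentages of the last known size: 200 = 2x, 400 = 4x.
static size_t Gen2Limit(size_t occupancy, int gen2TargetPct)
{
    // e.g. occupancy = 512 MB, gen2TargetPct = 200  ->  limit = 1024 MB
    return occupancy * gen2TargetPct / 100;
}

static size_t Gen1Budget(size_t ephemeralOccupancy, size_t minBudget, int gen1TargetPct)
{
    // e.g. ephemeralOccupancy = 64 MB, gen1TargetPct = 400  ->  budget = 192 MB,
    // i.e. collect once the ephemeral size has roughly quadrupled
    return std::max(minBudget, ephemeralOccupancy * (gen1TargetPct - 100) / 100);
}
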
@@ -4359,7 +4363,7 @@ bool SatoriRecycler::DrainDeferredSweepQueueConcurrent(int64_t deadline)
 }

 YieldProcessor();
-if ((++cycles % 127) == 0)
+if ((++cycles & 127) == 0)
 {
 GCToOSInterface::YieldThread(0);
 }

@@ -211,7 +211,13 @@ SatoriRegion* SatoriRegionQueue::TryDequeueIfHasFreeSpaceInTopBucket()

 SatoriRegionQueue* SatoriRegionQueue::AllocAligned(QueueKind kind)
 {
-const size_t align = 64;
+const size_t align =
+#if defined(TARGET_AMD64)
+64;
+#else
+128;
+#endif

 #ifdef _MSC_VER
 void* buffer = _aligned_malloc(sizeof(SatoriRegionQueue), align);
 #else

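The alignment bump (64 bytes on AMD64, 128 elsewhere) is consistent with the larger, 128-byte cache lines found on some arm64 designs. A hedged sketch of the full cross-platform allocation this function presumably performs; the diff only shows the _MSC_VER branch, so the posix_memalign fallback is an assumption:

#include <cstdlib>

// Sketch: allocate storage for T on its own cache line(s).
// 128-byte alignment on non-AMD64 targets mirrors the change above.
template <typename T>
static void* AllocCacheAligned()
{
    const size_t align =
#if defined(TARGET_AMD64)
        64;
#else
        128;
#endif

#ifdef _MSC_VER
    return _aligned_malloc(sizeof(T), align);
#else
    void* buffer = nullptr;
    if (posix_memalign(&buffer, align, sizeof(T)) != 0)
        return nullptr;
    return buffer;
#endif
}
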
@@ -46,12 +46,26 @@ SatoriTrimmer::SatoriTrimmer(SatoriHeap* heap)
 m_event = new (nothrow) GCEvent;
 m_event->CreateAutoEventNoThrow(false);

+m_sleepGate = new (nothrow) SatoriGate();
+
 if (SatoriUtil::IsTrimmingEnabled())
 {
 GCToEEInterface::CreateThread(LoopFn, this, false, "Satori GC Trimmer Thread");
 }
 }

+void SatoriTrimmer::Pause(int msec)
+{
+m_paused = true;
+m_sleepGate->TimedWait(msec);
+m_paused = false;
+}
+
+void SatoriTrimmer::Unpause()
+{
+m_sleepGate->WakeOne();
+}
+
 void SatoriTrimmer::LoopFn(void* inst)
 {
 SatoriTrimmer* ths = (SatoriTrimmer*)inst;

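Pause/Unpause turn the trimmer's fixed sleeps into waits on a gate that other threads can cut short; SetStopSuggested and WaitForStop below wake it so the trimmer reacts promptly instead of finishing a multi-second sleep. A minimal sketch of a gate with the TimedWait/WakeOne shape assumed here, built on a condition variable (the real SatoriGate is not shown in this diff):

#include <chrono>
#include <condition_variable>
#include <mutex>

// Sketch only: auto-resetting gate with TimedWait/WakeOne semantics.
class SketchGate
{
    std::mutex m_mutex;
    std::condition_variable m_cv;
    bool m_signaled = false;

public:
    // Wait up to msec milliseconds, or until WakeOne is called.
    void TimedWait(int msec)
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        m_cv.wait_for(lock, std::chrono::milliseconds(msec), [this] { return m_signaled; });
        m_signaled = false;   // auto-reset, so the next wait blocks again
    }

    void WakeOne()
    {
        {
            std::lock_guard<std::mutex> lock(m_mutex);
            m_signaled = true;
        }
        m_cv.notify_one();
    }
};
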
@@ -76,7 +90,7 @@ void SatoriTrimmer::Loop()

 Interlocked::CompareExchange(&m_state, TRIMMER_STATE_STOPPED, TRIMMER_STATE_RUNNING);
 // we are not running here, so we can sleep a bit before continuing.
-GCToOSInterface::Sleep(5000);
+Pause(5000);
 StopAndWait();
 }

@@ -84,7 +98,7 @@ void SatoriTrimmer::Loop()
 [&](SatoriPage* page)
 {
 // limit the rate of scanning to 1 page/msec.
-GCToOSInterface::Sleep(1);
+Pause(1);
 if (m_state != TRIMMER_STATE_RUNNING)
 {
 StopAndWait();

@@ -116,22 +130,23 @@ void SatoriTrimmer::Loop()
 if (didSomeWork)
 {
 // limit the decommit/coalesce rate to 1 region/10 msec.
-GCToOSInterface::Sleep(10);
+Pause(10);
 if (m_state != TRIMMER_STATE_RUNNING)
 {
 StopAndWait();
 }
 }
 }
 }
 }
 }

 // this is a low priority task, if something needs to run, yield
 GCToOSInterface::YieldThread(0);

 // also we will pause for 1 sec if there was a GC - to further reduce the churn
 // if the app is allocation-active.
 int64_t newGen1 = m_heap->Recycler()->GetCollectionCount(1);
 if (newGen1 != lastGen1)
 {
 lastGen1 = newGen1;
-GCToOSInterface::Sleep(1000);
+Pause(1000);
 }

 if (m_state != TRIMMER_STATE_RUNNING)

@@ -151,8 +166,6 @@ void SatoriTrimmer::StopAndWait()
 {
 tryAgain:

-// this is a low priority task, if something needs to run, yield
-GCToOSInterface::YieldThread(0);
 int state = m_state;
 switch (state)
 {

@@ -165,7 +178,7 @@ void SatoriTrimmer::StopAndWait()
 case TRIMMER_STATE_STOPPED:
 for (int i = 0; i < 10; i++)
 {
-GCToOSInterface::Sleep(100);
+Pause(100);
 if (m_state != state)
 {
 goto tryAgain;

@@ -215,17 +228,20 @@ void SatoriTrimmer::SetStopSuggested()
 case TRIMMER_STATE_OK_TO_RUN:
 if (Interlocked::CompareExchange(&m_state, TRIMMER_STATE_STOPPED, state) == state)
 {
+Unpause();
 return;
 }
 break;
 case TRIMMER_STATE_RUNNING:
 if (Interlocked::CompareExchange(&m_state, TRIMMER_STATE_STOP_SUGGESTED, state) == state)
 {
+Unpause();
 return;
 }
 break;
 default:
 _ASSERTE(m_state <= TRIMMER_STATE_STOP_SUGGESTED);
+Unpause();
 return;
 }
 }

@@ -238,8 +254,13 @@ void SatoriTrimmer::WaitForStop()
 int cycles = 0;
 while (m_state == TRIMMER_STATE_STOP_SUGGESTED)
 {
+if (m_paused)
+{
+Unpause();
+}
+
 YieldProcessor();
-if ((++cycles % 127) == 0)
+if ((++cycles & 127) == 0)
 {
 GCToOSInterface::YieldThread(0);
 }

@@ -52,12 +52,16 @@ private:

 SatoriHeap* m_heap;
 GCEvent* m_event;
+SatoriGate* m_sleepGate;
 size_t m_lastGen2Count;
 volatile int m_state;
+volatile bool m_paused;

 static void LoopFn(void* inst);
 void Loop();
 void StopAndWait();
+void Pause(int msec);
+void Unpause();
 };

 #endif

@@ -285,6 +285,34 @@ public:
 return gcSpin;
 }

+// DOTNET_gcGen2Target
+static int Gen2Target()
+{
+int target = (int)GCConfig::GetGen2Target();
+if (target < 100)
+{
+// target must be > 100%
+// if wee see less, just default to triggering GC when heap doubles
+target = 200;
+}
+
+return target;
+}
+
+// DOTNET_gcGen1Target
+static int Gen1Target()
+{
+int target = (int)GCConfig::GetGen1Target();
+if (target < 100)
+{
+// target must be > 100%
+// if wee see less, just default to triggering GC when ephemeral heap quadruples
+target = 400;
+}
+
+return target;
+}
+
 static size_t CommitGranularity()
 {
 // we can support sizes that are > OS page and binary fractions of REGION_SIZE_GRANULARITY.

@@ -34,11 +34,16 @@ NOINLINE
 void SatoriWorkList::PushSlow(SatoriWorkChunk* item)
 {
 uint32_t collisions = 1;
+SatoriWorkChunk* head;
 while (true)
 {
-SatoriWorkList orig = *this;
-item->m_next = orig.m_head;
-if (Cas128((int64_t*)this, orig.m_aba + 1, (int64_t)item, (int64_t*)&orig))
+head = this->m_head;
+size_t aba = this->m_aba;
+
+item->m_next = head;
+
+SatoriWorkList orig(head, aba);
+if (Cas128((int64_t*)this, aba + 1, (int64_t)item, (int64_t*)&orig))
 break;

 SatoriLock::CollisionBackoff(collisions++);

@@ -53,16 +58,19 @@ NOINLINE
 SatoriWorkChunk* SatoriWorkList::TryPopSlow()
 {
 uint32_t collisions = 1;
-SatoriWorkList orig;
+SatoriWorkChunk* head;
 while (true)
 {
-orig = *this;
-if (orig.m_head == nullptr)
+head = this->m_head;
+size_t aba = this->m_aba;
+
+if (head == nullptr)
 {
 return nullptr;
 }

-if (Cas128((int64_t*)this, orig.m_aba + 1, (int64_t)orig.m_head->m_next, (int64_t*)&orig))
+SatoriWorkList orig(head, aba);
+if (Cas128((int64_t*)this, aba + 1, (int64_t)head->m_next, (int64_t*)&orig))
 break;

 SatoriLock::CollisionBackoff(collisions++);

@@ -72,7 +80,6 @@ SatoriWorkChunk* SatoriWorkList::TryPopSlow()
 Interlocked::Decrement(&m_count);
 #endif

-SatoriWorkChunk* result = orig.m_head;
-result->m_next = nullptr;
-return result;
+head->m_next = nullptr;
+return head;
 }

@@ -33,34 +33,46 @@


 #if defined(TARGET_WINDOWS)
-FORCEINLINE uint8_t Cas128(int64_t volatile *pDst, int64_t iValueHigh, int64_t iValueLow, int64_t *pComparandAndResult)
+FORCEINLINE uint8_t Cas128(int64_t volatile *pDst, int64_t iValueHigh, int64_t iValueLow, int64_t *pComparand)
 {
-return _InterlockedCompareExchange128(pDst, iValueHigh, iValueLow, pComparandAndResult);
+return _InterlockedCompareExchange128(pDst, iValueHigh, iValueLow, pComparand);
 }
 #else
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Watomic-alignment"
-FORCEINLINE uint8_t Cas128(int64_t volatile *pDst, int64_t iValueHigh, int64_t iValueLow, int64_t *pComparandAndResult)
+FORCEINLINE uint8_t Cas128(int64_t volatile *pDst, int64_t iValueHigh, int64_t iValueLow, int64_t *pComparand)
 {
 __int128_t iValue = ((__int128_t)iValueHigh << 64) + (uint64_t)iValueLow;
-return __atomic_compare_exchange_n ((__int128_t*)pDst, (__int128_t*)pComparandAndResult, iValue, /*weak*/ true, /* success_memorder */ __ATOMIC_SEQ_CST, /* failure_memorder */ __ATOMIC_RELAXED);
+return __sync_bool_compare_and_swap_16((__int128_t*)pDst, *(__int128_t*)pComparand, iValue);
 }
 #pragma clang diagnostic pop
 #endif // HOST_AMD64

-class SatoriWorkList
+class DECLSPEC_ALIGN(128) SatoriWorkList
 {
 public:
 SatoriWorkList() :
-m_head(), m_aba()
-#ifdef _DEBUG
-, m_count()
-#endif
+SatoriWorkList(nullptr, 0)
 {}

+SatoriWorkList(SatoriWorkChunk* head, size_t aba)
+{
+m_head = head;
+m_aba = aba;
+#ifdef _DEBUG
+m_count = 0;
+#endif
+}
+
 static SatoriWorkList* AllocAligned()
 {
-const size_t align = 64;
+const size_t align =
+#if defined(TARGET_AMD64)
+64;
+#else
+128;
+#endif

 #ifdef _MSC_VER
 void* buffer = _aligned_malloc(sizeof(SatoriWorkList), align);
 #else

@@ -80,9 +92,13 @@ public:
 {
 _ASSERTE(item->m_next == nullptr);

-SatoriWorkList orig = *this;
-item->m_next = orig.m_head;
-if (Cas128((int64_t*)this, orig.m_aba + 1, (int64_t)item, (int64_t*)&orig))
+SatoriWorkChunk* head = this->m_head;
+size_t aba = this->m_aba;
+
+item->m_next = head;
+
+SatoriWorkList orig(head, aba);
+if (Cas128((int64_t*)this, aba + 1, (int64_t)item, (int64_t*)&orig))
 {
 #ifdef _DEBUG
 Interlocked::Increment(&m_count);

|
|||
FORCEINLINE
|
||||
SatoriWorkChunk* TryPop()
|
||||
{
|
||||
SatoriWorkList orig = *this;
|
||||
if (orig.m_head == nullptr)
|
||||
SatoriWorkChunk* head = this->m_head;
|
||||
size_t aba = this->m_aba;
|
||||
|
||||
if (head == nullptr)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (Cas128((int64_t*)this, orig.m_aba + 1, (int64_t)orig.m_head->m_next, (int64_t*)&orig))
|
||||
SatoriWorkList orig(head, aba);
|
||||
if (Cas128((int64_t*)this, aba + 1, (int64_t)head->m_next, (int64_t*)&orig))
|
||||
{
|
||||
#ifdef _DEBUG
|
||||
Interlocked::Decrement(&m_count);
|
||||
#endif
|
||||
SatoriWorkChunk* result = orig.m_head;
|
||||
result->m_next = nullptr;
|
||||
return result;
|
||||
head->m_next = nullptr;
|
||||
return head;
|
||||
}
|
||||
|
||||
return TryPopSlow();
|
||||
|
|
|
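Across the SatoriWorkList changes, the head pointer and the aba counter are now read individually, repackaged into a SatoriWorkList value, and swapped with a single 16-byte CAS; the counter makes an interleaved pop/push at the same address fail the CAS instead of corrupting the list, and DECLSPEC_ALIGN(128) keeps the pair aligned for the 16-byte compare-exchange while giving the list its own cache line. A self-contained sketch of the same idea using std::atomic over a 16-byte {head, counter} pair (an illustration, not the runtime's code; the atomic may fall back to a lock unless the target supports 16-byte CAS, e.g. -mcx16 on x86-64):

#include <atomic>
#include <cstdint>

struct Node { Node* next = nullptr; };

// {head, counter} swapped as one 16-byte unit; the counter defeats ABA.
struct alignas(16) Top
{
    Node*    head;
    uint64_t aba;
};

class SketchStack
{
    std::atomic<Top> m_top{Top{nullptr, 0}};

public:
    void Push(Node* item)
    {
        Top oldTop = m_top.load(std::memory_order_relaxed);
        Top newTop;
        do
        {
            item->next = oldTop.head;              // link before publishing
            newTop = Top{item, oldTop.aba + 1};    // bump the version counter
        } while (!m_top.compare_exchange_weak(oldTop, newTop,
                                              std::memory_order_release,
                                              std::memory_order_relaxed));
    }

    // Nodes are assumed to stay allocated (as GC work chunks do), so reading
    // head->next after the load is safe in this sketch.
    Node* TryPop()
    {
        Top oldTop = m_top.load(std::memory_order_acquire);
        while (oldTop.head != nullptr)
        {
            Top newTop{oldTop.head->next, oldTop.aba + 1};
            if (m_top.compare_exchange_weak(oldTop, newTop,
                                            std::memory_order_acquire,
                                            std::memory_order_acquire))
            {
                oldTop.head->next = nullptr;
                return oldTop.head;
            }
        }
        return nullptr;
    }
};
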
@@ -3700,7 +3700,7 @@ YieldProcessor()
 "rep\n"
 "nop");
 #elif defined(HOST_ARM) || defined(HOST_ARM64)
-__asm__ __volatile__( "yield");
+__asm__ __volatile__( "dmb ishst\n yield");
 #elif defined(HOST_LOONGARCH64)
 __asm__ volatile( "dbar 0; \n");
 #elif defined(HOST_RISCV64)
