mirror of https://github.com/VSadov/Satori.git synced 2025-06-08 03:27:04 +09:00

Arm64 tweaks (#36)

* tweaks

* Gen2Target

* trimmer pause

* style and comment fixes
Vladimir Sadov 2025-01-07 23:40:40 -08:00 committed by vsadov
parent 9755128719
commit 6dd178f834
12 changed files with 172 additions and 99 deletions

View file

@ -204,7 +204,7 @@ typedef DWORD (WINAPI *PTHREAD_START_ROUTINE)(void* lpThreadParameter);
#endif // defined(__i386__) || defined(__x86_64__)
#if defined(__arm__) || defined(__aarch64__)
#define YieldProcessor() asm volatile ("yield")
#define YieldProcessor() asm volatile ("dmb ishst\n" "yield")
#define MemoryBarrier __sync_synchronize
#endif // __arm__ || __aarch64__
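
Editor's note: the arm64 YieldProcessor now emits a store barrier (dmb ishst, ordering this core's prior stores) before the yield hint. A minimal sketch of the kind of spin-wait the macro is used in; the flag and loop are illustrative, not from this commit.

#include <atomic>

// Same definition as in the diff (arm/arm64 path).
#define YieldProcessor() asm volatile ("dmb ishst\n" "yield")

std::atomic<bool> g_done{false};   // hypothetical shared flag

void SpinUntilDone()
{
    while (!g_done.load(std::memory_order_acquire))
    {
        // dmb ishst orders this core's earlier stores ahead of later ones;
        // yield hints the CPU (or SMT sibling) that we are just spinning.
        YieldProcessor();
    }
}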

View file

@ -154,7 +154,9 @@ public:
BOOL_CONFIG (UseTHP, "gcTHP", NULL, true, "Specifies whether Transparent Huge Pages can be used. (Linux only)") \
BOOL_CONFIG (TrimmigGC, "gcTrim", NULL, true, "Specifies whether background trimming is enabled") \
INT_CONFIG (GCRate, "gcRate", NULL, -1, "Specifies soft min limit for time between GCs in milliseconds. -1 - default") \
INT_CONFIG (GCSpin, "gcSpin", NULL, -1, "Spin") \
INT_CONFIG (GCSpin, "gcSpin", NULL, -1, "Spin") \
INT_CONFIG (Gen2Target, "gcGen2Target", NULL, 200, "Specifies target for Gen2 GC (in terms of % of the last known size)") \
INT_CONFIG (Gen1Target, "gcGen1Target", NULL, 400, "Specifies target for Gen1 GC (in terms of % of the last known size)") \
// This class is responsible for retrieving configuration information
// for how the GC should operate.
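
Editor's note: both new knobs are percentages of the last known size. A short worked example with the defaults from the table above (heap sizes are illustrative; the consumers are in SatoriRecycler below).

size_t lastKnownHeap = 512u << 20;                 // 512 MB after the last gen2 GC
size_t gen2TriggerAt = lastKnownHeap * 200 / 100;  // 1 GB: gen2 when the heap doubles
size_t lastEphemeral = 64u << 20;                  // 64 MB ephemeral occupancy
size_t gen1TriggerAt = lastEphemeral * 400 / 100;  // 256 MB: gen1 when it quadruples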

View file

@ -41,6 +41,7 @@ bool SatoriLock::EnterSlow(bool noBlock)
// We will count when we failed to change the state of the lock and increase pauses
// so that bursts of activity are better tolerated. This should not happen often.
// after waking up we restart collision and iteration counters.
int collisions = 0;
// We will track the changes of ownership while we are trying to acquire the lock.
@ -145,7 +146,7 @@ bool SatoriLock::EnterSlow(bool noBlock)
// Increment the waiter count.
// Note that we do not do any overflow checking on this increment. In order to overflow,
// we'd need to have about 1 billion waiting threads, which is inconceivable anytime in the
// forseeable future.
// foreseeable future.
uint32_t newState = oldState + WaiterCountIncrement;
if (hasWaited)
newState = (newState - WaiterCountIncrement) & ~WaiterWoken;
@ -153,6 +154,11 @@ bool SatoriLock::EnterSlow(bool noBlock)
if (Interlocked::CompareExchange(&_state, newState, oldState) == oldState)
break;
}
else
{
// We are over the iteration limit, but the lock was open, we tried and failed.
// It was a collision.
}
CollisionBackoff(++collisions);
}

View file

@ -73,7 +73,7 @@ private:
return _InterlockedCompareExchange_acq((long*)destination, exchange, comparand) == (long)comparand;
#endif
#else
return __atomic_compare_exchange_n(destination, &comparand, exchange, true, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
return __atomic_compare_exchange_n(destination, &comparand, exchange, false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
#endif
}
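
Editor's note: the fourth argument of __atomic_compare_exchange_n flips from weak to strong. A weak CAS may fail spuriously even when the comparand matches (it maps to an LL/SC pair on arm64 without LSE), so it is only correct inside a retry loop; the strong form retries internally, and a failure really means the value differed. A side-by-side sketch, with illustrative helper names:

#include <stdint.h>

// Strong CAS: returns false only if *dest really held a different value.
static inline bool CasStrongAcq(int64_t* dest, int64_t exchange, int64_t comparand)
{
    return __atomic_compare_exchange_n(dest, &comparand, exchange,
                                       /*weak*/ false,
                                       __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}

// Weak CAS: may fail spuriously on LL/SC targets, so it is only appropriate
// when the caller loops anyway, not for a one-shot "try once" use.
static inline bool CasWeakAcq(int64_t* dest, int64_t exchange, int64_t comparand)
{
    return __atomic_compare_exchange_n(dest, &comparand, exchange,
                                       /*weak*/ true,
                                       __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}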
@ -91,36 +91,6 @@ private:
#endif
}
FORCEINLINE
static int64_t GetCheapTimeStamp()
{
#if defined(TARGET_AMD64)
#ifdef _MSC_VER
return __rdtsc();
#else
ptrdiff_t cycles;
ptrdiff_t cyclesHi;
__asm__ __volatile__
("rdtsc":"=a" (cycles), "=d" (cyclesHi));
return (cyclesHi << 32) | cycles;
#endif
#elif defined(TARGET_ARM64)
// On arm64 just read timer register instead
#ifdef _MSC_VER
#define ARM64_CNTVCT_EL0 ARM64_SYSREG(3,3,14,0,2)
return _ReadStatusReg(ARM64_CNTVCT_EL0);
#elif defined(TARGET_LINUX) || defined(TARGET_OSX)
int64_t timerTicks;
asm volatile("mrs %0, cntvct_el0" : "=r"(timerTicks));
return timerTicks;
#else
Unsupported platform?
#endif
#else
Unsupported architecture?
#endif
}
static const uint16_t SpinCountNotInitialized = INT16_MIN;
// While spinning is parameterized in terms of iterations,
@ -137,8 +107,8 @@ private:
// the exponential backoff will generally be no more than 2X worse than the perfect guess and
// will make far fewer attempts than a simple retry. On a multiprocessor machine fruitless attempts
// will cause unnecessary sharing of the contended state which may make modifying the state more expensive.
// To protect against degenerate cases we will cap the per-iteration wait to 1024 spinwaits.
static const uint32_t MaxExponentialBackoffBits = 10;
// To protect against degenerate cases we will cap the per-iteration wait to a few thousand spinwaits.
static const uint32_t MaxExponentialBackoffBits = 12;
// This lock is unfair and permits acquiring a contended lock by a nonwaiter in the presence of waiters.
// It is possible for one thread to keep holding the lock long enough that waiters go to sleep and
@ -220,9 +190,16 @@ public:
{
_ASSERTE(collisions > 0);
// no need for much randomness here, we will just hash the stack location and a timestamp.
uint32_t rand = ((uint32_t)(size_t)&collisions + (uint32_t)GetCheapTimeStamp()) * 2654435769u;
uint32_t spins = rand >> (uint8_t)((uint32_t)32 - min(collisions, MaxExponentialBackoffBits));
collisions = min(collisions, MaxExponentialBackoffBits);
// we will back off for some random number of iterations that roughly grows as 2^collisions
// no need for much randomness here, randomness is "good to have", we could do without it,
// so we will just hash in the stack location.
uint32_t rand = (uint32_t)(size_t)&collisions * 2654435769u;
// set the topmost bit to ensure the minimum number of spins is exponentially increasing
// it basically guarantees that we spin at least 1, 2, 4, 8, 16 times, and so on
rand |= (1u << 31);
uint32_t spins = rand >> (uint8_t)(32 - collisions);
for (int i = 0; i < (int)spins; i++)
{
YieldProcessor();
@ -236,12 +213,12 @@ private:
return (uint16_t)GCToOSInterface::GetLowPrecisionTimeStamp();
}
// same idea as in CollisionBackoff, but with guaranteed minimum wait
// same idea as in CollisionBackoff, but with expected small range
static void IterationBackoff(int iteration)
{
_ASSERTE(iteration > 0 && iteration < MaxExponentialBackoffBits);
uint32_t rand = ((uint32_t)(size_t)&iteration + (uint32_t)GetCheapTimeStamp()) * 2654435769u;
uint32_t rand = (uint32_t)(size_t)&iteration * 2654435769u;
// set the topmost bit to ensure the minimum number of spins is exponentially increasing
// it basically guarantees that we spin at least 1, 2, 4, 8, 16 times, and so on
rand |= (1u << 31);
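
Editor's note: the reworked backoff no longer reads a timestamp; it hashes a stack address, forces the top bit, and shifts by 32 - collisions, which yields a randomized spin count in roughly [2^(n-1), 2^n). A standalone sketch of that math (the cap mirrors MaxExponentialBackoffBits = 12 from the diff; the helper itself is illustrative). CollisionBackoff then executes that many YieldProcessor() calls.

#include <stdint.h>
#include <algorithm>

// Returns a randomized spin count in roughly [2^(n-1), 2^n) for n collisions,
// capped at 12 bits (i.e. at most a few thousand spin-waits).
static uint32_t BackoffSpins(int collisions)
{
    const int maxBits = 12;
    int n = std::min(collisions, maxBits);
    // Cheap randomness: hash a stack address with a Fibonacci-hash multiplier.
    uint32_t rand = (uint32_t)(uintptr_t)&collisions * 2654435769u;
    // Force the top bit so the shift below never yields fewer than 2^(n-1) spins:
    // n=1 -> 1, n=2 -> 2..3, n=3 -> 4..7, and so on.
    rand |= (1u << 31);
    return rand >> (uint8_t)(32 - n);
}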

View file

@ -788,7 +788,8 @@ tryAgain:
!m_concurrentCardsDone ||
m_ccStackMarkState != CC_MARK_STATE_DONE ||
m_concurrentCleaningState == CC_CLEAN_STATE_CLEANING ||
m_condemnedGeneration == 0;
m_condemnedGeneration == 0 ||
!m_workList->IsEmpty();
int64_t start = GCToOSInterface::QueryPerformanceCounter();
if (moreWork)
@ -796,15 +797,20 @@ tryAgain:
HelpOnceCore(/*minQuantum*/ false);
m_noWorkSince = GCToOSInterface::QueryPerformanceCounter();
// if we did not use all the quantum,
// consume it here in Low Latency mode for pacing reasons.
// if we did not use all the quantum in Low Latency mode,
// consume what roughly remains for pacing reasons.
if (IsLowLatencyMode())
{
int64_t deadline = start + HelpQuantum();
int64_t deadline = start + HelpQuantum() / 2;
int iters = 1;
while (GCToOSInterface::QueryPerformanceCounter() < deadline &&
m_ccStackMarkState != CC_MARK_STATE_SUSPENDING_EE)
{
YieldProcessor();
iters *= 2;
for (int i = 0; i < iters; i++)
{
YieldProcessor();
}
}
}
}
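
Editor's note: the pacing loop now doubles the number of YieldProcessor() calls between clock checks, so QueryPerformanceCounter is consulted exponentially less often while still respecting the (halved) deadline. A minimal sketch of that polling shape; now() and cpuPause() stand in for the real calls.

#include <stdint.h>

static void PaceUntil(int64_t deadline, int64_t (*now)(), void (*cpuPause)())
{
    int iters = 1;
    while (now() < deadline)
    {
        for (int i = 0; i < iters; i++)
            cpuPause();          // cheap pause: no syscall, no clock read
        iters *= 2;              // check the clock half as often on each pass
    }
}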
@ -940,18 +946,16 @@ bool SatoriRecycler::IsBlockingPhase()
}
//TUNING: We use a very simplistic approach for GC triggering here.
// By default:
// SatoriUtil::Gen2Target() triggers Gen2 GC when heap size doubles.
// SatoriUtil::Gen1Target() triggers Gen1 GC when ephemeral size quadruples.
//
// There could be a lot of room to improve in this area:
// - could consider current CPU/memory load and adjust accordingly
// - could collect and use past history of the program behavior
// - could consider user input as to favor latency or throughput
// - ??
// we target 1/EPH_SURV_TARGET ephemeral survival rate
#define EPH_SURV_TARGET 4
// do gen2 when total doubles
#define GEN2_SURV_TARGET 2
void SatoriRecycler::MaybeTriggerGC(gc_reason reason)
{
int generation = 0;
@ -969,7 +973,7 @@ void SatoriRecycler::MaybeTriggerGC(gc_reason reason)
// gen2 allocations are rare and do not count towards gen1 budget since gen1 will not help with that
// they can be big though, so check if by any chance gen2 allocs alone pushed us over the limit
if (m_gen2AddedSinceLastCollection * GEN2_SURV_TARGET > m_totalLimit)
if (m_gen2AddedSinceLastCollection * SatoriUtil::Gen2Target() / 100 > m_totalLimit)
{
generation = 2;
}
@ -1014,11 +1018,11 @@ void SatoriRecycler::AdjustHeuristics()
if (m_prevCondemnedGeneration == 2)
{
m_totalLimit = occupancy * GEN2_SURV_TARGET;
m_totalLimit = occupancy * SatoriUtil::Gen2Target() / 100;
}
// we look for 1 / EPH_SURV_TARGET ephemeral survivorship, thus budget is the diff
size_t newGen1Budget = max(MIN_GEN1_BUDGET, ephemeralOccupancy * (EPH_SURV_TARGET - 1));
// we trigger GC when ephemeral size grows to SatoriUtil::Gen1Target(), thus budget is the diff
size_t newGen1Budget = max(MIN_GEN1_BUDGET, ephemeralOccupancy * (SatoriUtil::Gen2Target() - 100) / 100);
// alternatively we allow gen1 allocs up to 1/8 of total limit.
size_t altNewGen1Budget = max(MIN_GEN1_BUDGET, m_totalLimit / 8);
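
Editor's note: with the defaults this preserves the old behavior: a 200% gen2 target reproduces GEN2_SURV_TARGET = 2 (collect gen2 when total occupancy doubles), and a 400% gen1 target matches the old 1/EPH_SURV_TARGET = 1/4 survival goal (budget is three times the ephemeral occupancy). A sketch of the limit math following the comments above; names and the budget floor value are illustrative.

#include <stddef.h>
#include <algorithm>

static const size_t MIN_GEN1_BUDGET_SKETCH = 8u << 20;   // placeholder floor

// Next gen2 GC triggers when total occupancy reaches targetPct percent of the
// occupancy measured after the last gen2 GC (200 -> when the heap doubles).
static size_t TotalLimit(size_t occupancy, int targetPct)
{
    return occupancy * (size_t)targetPct / 100;
}

// Gen1 budget: let the ephemeral generations grow by (targetPct - 100) percent
// before the next gen1 GC (400 -> budget is 3x the last ephemeral occupancy).
static size_t Gen1Budget(size_t ephemeralOccupancy, int targetPct)
{
    return std::max(MIN_GEN1_BUDGET_SKETCH,
                    ephemeralOccupancy * (size_t)(targetPct - 100) / 100);
}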
@ -4359,7 +4363,7 @@ bool SatoriRecycler::DrainDeferredSweepQueueConcurrent(int64_t deadline)
}
YieldProcessor();
if ((++cycles % 127) == 0)
if ((++cycles & 127) == 0)
{
GCToOSInterface::YieldThread(0);
}
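
Editor's note: the modulus test fired every 127 iterations and needs a division; the mask form fires on every 128th pass (when the low seven bits are all zero) and compiles to a single AND. A sketch of the throttling idiom, with the three callbacks standing in for the calls shown above.

#include <stdint.h>

static void DrainWithThrottledYields(bool (*workPending)(), void (*cpuPause)(), void (*osYield)())
{
    uint32_t cycles = 0;
    while (workPending())
    {
        cpuPause();
        // True on every 128th pass; unlike "% 127" it needs no division and
        // the period is an exact power of two.
        if ((++cycles & 127) == 0)
            osYield();
    }
}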

View file

@ -211,7 +211,13 @@ SatoriRegion* SatoriRegionQueue::TryDequeueIfHasFreeSpaceInTopBucket()
SatoriRegionQueue* SatoriRegionQueue::AllocAligned(QueueKind kind)
{
const size_t align = 64;
const size_t align =
#if defined(TARGET_AMD64)
64;
#else
128;
#endif
#ifdef _MSC_VER
void* buffer = _aligned_malloc(sizeof(SatoriRegionQueue), align);
#else
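
Editor's note: some arm64 parts use 128-byte cache lines (or prefetch adjacent 64-byte lines in pairs), so a 64-byte-aligned queue header can still share a line pair with unrelated data; non-AMD64 targets therefore align to 128 bytes. A sketch of the same allocation pattern outside MSVC; the type name is illustrative.

#include <stdlib.h>
#include <new>

struct QueueHeaderSketch { /* hot, CAS-contended fields live here */ };

static QueueHeaderSketch* AllocAlignedSketch()
{
#if defined(__x86_64__) || defined(_M_X64)
    const size_t align = 64;     // x64: one cache line
#else
    const size_t align = 128;    // arm64: 128B lines / adjacent-line prefetch
#endif
    void* buffer = nullptr;
    if (posix_memalign(&buffer, align, sizeof(QueueHeaderSketch)) != 0)
        return nullptr;
    return new (buffer) QueueHeaderSketch();
}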

View file

@ -46,12 +46,26 @@ SatoriTrimmer::SatoriTrimmer(SatoriHeap* heap)
m_event = new (nothrow) GCEvent;
m_event->CreateAutoEventNoThrow(false);
m_sleepGate = new (nothrow) SatoriGate();
if (SatoriUtil::IsTrimmingEnabled())
{
GCToEEInterface::CreateThread(LoopFn, this, false, "Satori GC Trimmer Thread");
}
}
void SatoriTrimmer::Pause(int msec)
{
m_paused = true;
m_sleepGate->TimedWait(msec);
m_paused = false;
}
void SatoriTrimmer::Unpause()
{
m_sleepGate->WakeOne();
}
void SatoriTrimmer::LoopFn(void* inst)
{
SatoriTrimmer* ths = (SatoriTrimmer*)inst;
@ -76,7 +90,7 @@ void SatoriTrimmer::Loop()
Interlocked::CompareExchange(&m_state, TRIMMER_STATE_STOPPED, TRIMMER_STATE_RUNNING);
// we are not running here, so we can sleep a bit before continuing.
GCToOSInterface::Sleep(5000);
Pause(5000);
StopAndWait();
}
@ -84,7 +98,7 @@ void SatoriTrimmer::Loop()
[&](SatoriPage* page)
{
// limit the rate of scanning to 1 page/msec.
GCToOSInterface::Sleep(1);
Pause(1);
if (m_state != TRIMMER_STATE_RUNNING)
{
StopAndWait();
@ -116,22 +130,23 @@ void SatoriTrimmer::Loop()
if (didSomeWork)
{
// limit the decommit/coalesce rate to 1 region/10 msec.
GCToOSInterface::Sleep(10);
Pause(10);
if (m_state != TRIMMER_STATE_RUNNING)
{
StopAndWait();
}
}
}
}
}
// this is a low priority task, if something needs to run, yield
GCToOSInterface::YieldThread(0);
// also we will pause for 1 sec if there was a GC - to further reduce the churn
// if the app is allocation-active.
int64_t newGen1 = m_heap->Recycler()->GetCollectionCount(1);
if (newGen1 != lastGen1)
{
lastGen1 = newGen1;
GCToOSInterface::Sleep(1000);
Pause(1000);
}
if (m_state != TRIMMER_STATE_RUNNING)
@ -151,8 +166,6 @@ void SatoriTrimmer::StopAndWait()
{
tryAgain:
// this is a low priority task, if something needs to run, yield
GCToOSInterface::YieldThread(0);
int state = m_state;
switch (state)
{
@ -165,7 +178,7 @@ void SatoriTrimmer::StopAndWait()
case TRIMMER_STATE_STOPPED:
for (int i = 0; i < 10; i++)
{
GCToOSInterface::Sleep(100);
Pause(100);
if (m_state != state)
{
goto tryAgain;
@ -215,17 +228,20 @@ void SatoriTrimmer::SetStopSuggested()
case TRIMMER_STATE_OK_TO_RUN:
if (Interlocked::CompareExchange(&m_state, TRIMMER_STATE_STOPPED, state) == state)
{
Unpause();
return;
}
break;
case TRIMMER_STATE_RUNNING:
if (Interlocked::CompareExchange(&m_state, TRIMMER_STATE_STOP_SUGGESTED, state) == state)
{
Unpause();
return;
}
break;
default:
_ASSERTE(m_state <= TRIMMER_STATE_STOP_SUGGESTED);
Unpause();
return;
}
}
@ -238,8 +254,13 @@ void SatoriTrimmer::WaitForStop()
int cycles = 0;
while (m_state == TRIMMER_STATE_STOP_SUGGESTED)
{
if (m_paused)
{
Unpause();
}
YieldProcessor();
if ((++cycles % 127) == 0)
if ((++cycles & 127) == 0)
{
GCToOSInterface::YieldThread(0);
}
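
Editor's note: the trimmer used to sleep in fixed GCToOSInterface::Sleep chunks, so SetStopSuggested could have to wait out a multi-second nap; Pause(msec) now blocks on a SatoriGate and Unpause() cuts the wait short, with m_paused telling WaitForStop whether a wake is needed. A minimal sketch of that interruptible-sleep shape, using std::condition_variable as a stand-in for SatoriGate.

#include <chrono>
#include <condition_variable>
#include <mutex>

class PauseGate
{
    std::mutex m_mutex;
    std::condition_variable m_cv;
    bool m_signaled = false;

public:
    // Sleep for up to msec milliseconds, or until WakeOne() is called.
    void TimedWait(int msec)
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        m_cv.wait_for(lock, std::chrono::milliseconds(msec),
                      [this] { return m_signaled; });
        m_signaled = false;
    }

    // Cut a pending TimedWait short.
    void WakeOne()
    {
        { std::lock_guard<std::mutex> lock(m_mutex); m_signaled = true; }
        m_cv.notify_one();
    }
};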

View file

@ -52,12 +52,16 @@ private:
SatoriHeap* m_heap;
GCEvent* m_event;
SatoriGate* m_sleepGate;
size_t m_lastGen2Count;
volatile int m_state;
volatile bool m_paused;
static void LoopFn(void* inst);
void Loop();
void StopAndWait();
void Pause(int msec);
void Unpause();
};
#endif

View file

@ -285,6 +285,34 @@ public:
return gcSpin;
}
// DOTNET_gcGen2Target
static int Gen2Target()
{
int target = (int)GCConfig::GetGen2Target();
if (target < 100)
{
// target must be > 100%
// if we see less, just default to triggering GC when the heap doubles
target = 200;
}
return target;
}
// DOTNET_gcGen1Target
static int Gen1Target()
{
int target = (int)GCConfig::GetGen1Target();
if (target < 100)
{
// target must be > 100%
// if we see less, just default to triggering GC when the ephemeral heap quadruples
target = 400;
}
return target;
}
static size_t CommitGranularity()
{
// we can support sizes that are > OS page and binary fractions of REGION_SIZE_GRANULARITY.
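
Editor's note: both accessors clamp bad input back to the defaults rather than honoring a value that would trigger a GC on any growth at all. A small usage example of the resulting behavior; the helper and values are illustrative.

#include <assert.h>

// Illustrative clamp matching the accessors above.
static int ClampTargetPct(int configured, int fallback)
{
    return (configured < 100) ? fallback : configured;   // targets must exceed 100%
}

int main()
{
    assert(ClampTargetPct(150, 200) == 150);  // DOTNET_gcGen2Target=150: gen2 when the heap grows 1.5x
    assert(ClampTargetPct(50,  200) == 200);  // below 100%: fall back to "heap doubles"
    assert(ClampTargetPct(90,  400) == 400);  // invalid gen1 target: "ephemeral quadruples"
    return 0;
}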

View file

@ -34,11 +34,16 @@ NOINLINE
void SatoriWorkList::PushSlow(SatoriWorkChunk* item)
{
uint32_t collisions = 1;
SatoriWorkChunk* head;
while (true)
{
SatoriWorkList orig = *this;
item->m_next = orig.m_head;
if (Cas128((int64_t*)this, orig.m_aba + 1, (int64_t)item, (int64_t*)&orig))
head = this->m_head;
size_t aba = this->m_aba;
item->m_next = head;
SatoriWorkList orig(head, aba);
if (Cas128((int64_t*)this, aba + 1, (int64_t)item, (int64_t*)&orig))
break;
SatoriLock::CollisionBackoff(collisions++);
@ -53,16 +58,19 @@ NOINLINE
SatoriWorkChunk* SatoriWorkList::TryPopSlow()
{
uint32_t collisions = 1;
SatoriWorkList orig;
SatoriWorkChunk* head;
while (true)
{
orig = *this;
if (orig.m_head == nullptr)
head = this->m_head;
size_t aba = this->m_aba;
if (head == nullptr)
{
return nullptr;
}
if (Cas128((int64_t*)this, orig.m_aba + 1, (int64_t)orig.m_head->m_next, (int64_t*)&orig))
SatoriWorkList orig(head, aba);
if (Cas128((int64_t*)this, aba + 1, (int64_t)head->m_next, (int64_t*)&orig))
break;
SatoriLock::CollisionBackoff(collisions++);
@ -72,7 +80,6 @@ SatoriWorkChunk* SatoriWorkList::TryPopSlow()
Interlocked::Decrement(&m_count);
#endif
SatoriWorkChunk* result = orig.m_head;
result->m_next = nullptr;
return result;
head->m_next = nullptr;
return head;
}
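
Editor's note: the slow paths now read m_head and m_aba as individual fields and build the comparand explicitly instead of copying the whole 16-byte object; a stale or even torn pair is harmless because the 128-bit CAS simply fails and the loop retries. A standalone sketch of the push pattern on an ABA-counted stack using a 128-bit CAS; the types mirror the diff but the snippet is illustrative.

#include <stdint.h>
#include <stddef.h>

struct ChunkSketch { ChunkSketch* m_next = nullptr; };

// Strong 16-byte CAS, same shape as the non-Windows Cas128 in the diff
// (clang/gcc builtin; needs -mcx16 on x86-64).
static inline uint8_t Cas128Sketch(int64_t volatile* pDst, int64_t hi, int64_t lo, int64_t* pCmp)
{
    __int128_t desired = ((__int128_t)hi << 64) + (uint64_t)lo;
    return __sync_bool_compare_and_swap_16((__int128_t*)pDst, *(__int128_t*)pCmp, desired);
}

// Head of an ABA-counted lock-free stack: pointer in the low word, counter in the high word.
struct alignas(16) WorkListSketch
{
    ChunkSketch* m_head = nullptr;   // low 64 bits
    size_t       m_aba  = 0;         // high 64 bits, bumped on every successful CAS

    void Push(ChunkSketch* item)
    {
        while (true)
        {
            // Read the fields individually; a stale pair only makes the CAS fail.
            ChunkSketch* head = m_head;
            size_t       aba  = m_aba;
            item->m_next = head;
            int64_t cmp[2] = { (int64_t)head, (int64_t)aba };   // expected (low, high)
            if (Cas128Sketch((int64_t*)this, (int64_t)(aba + 1), (int64_t)item, cmp))
                break;                       // item published as the new head
            // else another thread won the race; retry (the real code backs off here)
        }
    }
};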

View file

@ -33,34 +33,46 @@
#if defined(TARGET_WINDOWS)
FORCEINLINE uint8_t Cas128(int64_t volatile *pDst, int64_t iValueHigh, int64_t iValueLow, int64_t *pComparandAndResult)
FORCEINLINE uint8_t Cas128(int64_t volatile *pDst, int64_t iValueHigh, int64_t iValueLow, int64_t *pComparand)
{
return _InterlockedCompareExchange128(pDst, iValueHigh, iValueLow, pComparandAndResult);
return _InterlockedCompareExchange128(pDst, iValueHigh, iValueLow, pComparand);
}
#else
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Watomic-alignment"
FORCEINLINE uint8_t Cas128(int64_t volatile *pDst, int64_t iValueHigh, int64_t iValueLow, int64_t *pComparandAndResult)
FORCEINLINE uint8_t Cas128(int64_t volatile *pDst, int64_t iValueHigh, int64_t iValueLow, int64_t *pComparand)
{
__int128_t iValue = ((__int128_t)iValueHigh << 64) + (uint64_t)iValueLow;
return __atomic_compare_exchange_n ((__int128_t*)pDst, (__int128_t*)pComparandAndResult, iValue, /*weak*/ true, /* success_memorder */ __ATOMIC_SEQ_CST, /* failure_memorder */ __ATOMIC_RELAXED);
return __sync_bool_compare_and_swap_16((__int128_t*)pDst, *(__int128_t*)pComparand, iValue);
}
#pragma clang diagnostic pop
#endif // HOST_AMD64
class SatoriWorkList
class DECLSPEC_ALIGN(128) SatoriWorkList
{
public:
SatoriWorkList() :
m_head(), m_aba()
#ifdef _DEBUG
, m_count()
#endif
SatoriWorkList(nullptr, 0)
{}
SatoriWorkList(SatoriWorkChunk* head, size_t aba)
{
m_head = head;
m_aba = aba;
#ifdef _DEBUG
m_count = 0;
#endif
}
static SatoriWorkList* AllocAligned()
{
const size_t align = 64;
const size_t align =
#if defined(TARGET_AMD64)
64;
#else
128;
#endif
#ifdef _MSC_VER
void* buffer = _aligned_malloc(sizeof(SatoriWorkList), align);
#else
@ -80,9 +92,13 @@ public:
{
_ASSERTE(item->m_next == nullptr);
SatoriWorkList orig = *this;
item->m_next = orig.m_head;
if (Cas128((int64_t*)this, orig.m_aba + 1, (int64_t)item, (int64_t*)&orig))
SatoriWorkChunk* head = this->m_head;
size_t aba = this->m_aba;
item->m_next = head;
SatoriWorkList orig(head, aba);
if (Cas128((int64_t*)this, aba + 1, (int64_t)item, (int64_t*)&orig))
{
#ifdef _DEBUG
Interlocked::Increment(&m_count);
@ -96,20 +112,22 @@ public:
FORCEINLINE
SatoriWorkChunk* TryPop()
{
SatoriWorkList orig = *this;
if (orig.m_head == nullptr)
SatoriWorkChunk* head = this->m_head;
size_t aba = this->m_aba;
if (head == nullptr)
{
return nullptr;
}
if (Cas128((int64_t*)this, orig.m_aba + 1, (int64_t)orig.m_head->m_next, (int64_t*)&orig))
SatoriWorkList orig(head, aba);
if (Cas128((int64_t*)this, aba + 1, (int64_t)head->m_next, (int64_t*)&orig))
{
#ifdef _DEBUG
Interlocked::Decrement(&m_count);
#endif
SatoriWorkChunk* result = orig.m_head;
result->m_next = nullptr;
return result;
head->m_next = nullptr;
return head;
}
return TryPopSlow();

View file

@ -3700,7 +3700,7 @@ YieldProcessor()
"rep\n"
"nop");
#elif defined(HOST_ARM) || defined(HOST_ARM64)
__asm__ __volatile__( "yield");
__asm__ __volatile__( "dmb ishst\n yield");
#elif defined(HOST_LOONGARCH64)
__asm__ volatile( "dbar 0; \n");
#elif defined(HOST_RISCV64)