
Low Latency Mode TODOs (#35)

* Gate and new Lock, also GC thread spin/rate

* barrier tweaks

* GetGCSafeMethodTable

* Suspend SpinWait tweak

* GetMemoryInfo throttling

* collection heuristics

* avail ram

* recycler and env changes

* barriers

* bulk copy

* gc spin tweak

* TryPopWithTryEnter

* small commit is back

* m_helperWoken

* AskForHelp

* tweaks

* do stacks last

* only lock changes

* AllocAligned

* misalign TLABs

* prefer concurrent in LowLat mode

* worklist

* spin in TryEnter

* rationalizing mark chunk size

* shorter card groups

* a few todos

* Gen0 Gen1 switches

* gcTHP switch

* tweak conc block time

* more conservative trimming

* fixes for OSX

* fix for arm64

* tweak helper counts

* round up

* helper --> worker
Vladimir Sadov 2024-12-28 18:01:13 -08:00 committed by vsadov
parent 7f52e2c11d
commit 9755128719
53 changed files with 3411 additions and 1505 deletions


@ -213,6 +213,11 @@ if(CLR_CMAKE_HOST_UNIX)
add_compile_options($<$<COMPILE_LANG_AND_ID:CXX,GNU>:-Wno-conversion-null>)
add_compile_options($<$<COMPILE_LANG_AND_ID:CXX,GNU>:-Wno-pointer-arith>)
if(CLR_CMAKE_TARGET_ARCH_AMD64 OR CLR_CMAKE_TARGET_ARCH_I386 OR CLR_CMAKE_TARGET_ARCH_ARM64)
# Allow 16 byte compare-exchange
add_compile_options(-mcx16)
endif(CLR_CMAKE_TARGET_ARCH_AMD64 OR CLR_CMAKE_TARGET_ARCH_I386 OR CLR_CMAKE_TARGET_ARCH_ARM64)
set (NATIVE_RESOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/nativeresources)
include_directories(${NATIVE_RESOURCE_DIR})
set (PROCESS_RC_SCRIPT ${NATIVE_RESOURCE_DIR}/processrc.sh)
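The -mcx16 hunk above lets GCC/Clang emit cmpxchg16b for 16-byte compare-exchange on x86-64, as the "Allow 16 byte compare-exchange" comment says. A minimal standalone sketch, not from this commit, of a double-width CAS that this flag allows the compiler to inline:

#include <atomic>
#include <cstdint>
#include <cstdio>

// A 16-byte value: pointer plus version counter, a common ABA-avoidance pattern.
struct alignas(16) VersionedPtr
{
    uint64_t ptr;
    uint64_t version;
};

int main()
{
    std::atomic<VersionedPtr> slot{ VersionedPtr{ 0, 0 } };

    VersionedPtr expected = slot.load();
    VersionedPtr desired  = { 0x1000, expected.version + 1 };

    // With -mcx16, GCC/Clang can lower this to a single lock cmpxchg16b
    // instead of routing it through libatomic's fallback path.
    bool ok = slot.compare_exchange_strong(expected, desired);
    std::printf("cas ok=%d\n", (int)ok);
    return 0;
}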


@ -133,6 +133,7 @@ if(CLR_CMAKE_TARGET_WIN32)
bcrypt.lib
RuntimeObject.lib
delayimp.lib
Synchronization.lib
)
else()
list(APPEND CORECLR_LIBRARIES


@ -39,6 +39,8 @@ set(GC_SOURCES
satori/SatoriAllocationContext.cpp
satori/SatoriUtil.cpp
satori/SatoriLock.cpp
satori/SatoriWorkList.cpp
satori/SatoriGate.cpp
)
if(CLR_CMAKE_HOST_UNIX)
@ -110,6 +112,7 @@ if (CLR_CMAKE_TARGET_WIN32)
satori/SatoriAllocationContext.h
satori/SatoriUtil.h
satori/SatoriLock.h
satori/SatoriGate.h
)
endif(CLR_CMAKE_TARGET_WIN32)
@ -118,6 +121,7 @@ if(CLR_CMAKE_HOST_WIN32)
${STATIC_MT_CRT_LIB}
${STATIC_MT_VCRT_LIB}
kernel32.lib
Synchronization.lib
advapi32.lib)
endif(CLR_CMAKE_HOST_WIN32)


@ -173,11 +173,16 @@ public:
MethodTable * GetGCSafeMethodTable() const
{
#if !defined(FEATURE_SATORI_GC)
#ifdef HOST_64BIT
return (MethodTable *)((uintptr_t)m_pMethTab & ~7);
#else
return (MethodTable *)((uintptr_t)m_pMethTab & ~3);
#endif //HOST_64BIT
#else
// Satori does not mess up MT pointers.
return RawGetMethodTable();
#endif
}
void RawSetMethodTable(MethodTable * pMT)
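For context on the hunk above: the non-Satori GC can temporarily stash flag bits in the low bits of the MethodTable pointer, so a "GC-safe" read has to mask them off (~7 on 64-bit, ~3 on 32-bit); Satori never tags the pointer, so the raw read is already safe. A tiny illustrative sketch of that masking, with made-up addresses:

#include <cstdint>
#include <cstdio>

int main()
{
    uintptr_t mt     = 0x7ffce38b40;            // hypothetical 8-byte-aligned MethodTable address
    uintptr_t tagged = mt | 0x3;                // GC stashed two flag bits in the low bits
    uintptr_t safe   = tagged & ~(uintptr_t)7;  // 64-bit mask, same as the ~7 above

    std::printf("tagged=%#zx safe=%#zx restored=%d\n",
                (size_t)tagged, (size_t)safe, (int)(safe == mt));
    return 0;
}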


@ -252,7 +252,7 @@ public:
// granularity.
static void* VirtualReserve(size_t size, size_t alignment, uint32_t flags, uint16_t node = NUMA_NODE_UNDEFINED);
static void* VirtualReserve(void* location, size_t size);
static void* VirtualReserve(void* location, size_t size, bool useTHP = false);
// Release virtual memory range previously reserved using VirtualReserve
// Parameters:


@ -149,8 +149,12 @@ public:
BOOL_CONFIG (RelocatingInGen1, "gcRelocatingGen1", NULL, true, "Specifies whether GC can relocate objects in Gen1 GC") \
BOOL_CONFIG (RelocatingInGen2, "gcRelocatingGen2", NULL, true, "Specifies whether GC can relocate objects in Gen2 GC") \
INT_CONFIG (ParallelGC, "gcParallel", NULL, -1, "Specifies max number of additional GC threads. 0 - no helpers, -1 - default") \
BOOL_CONFIG (ThreadLocalGC, "gcThreadLocal", NULL, true, "Specifies whether thread-local GC can be performed") \
BOOL_CONFIG (Gen0GC, "gcGen0", NULL, true, "Specifies whether Gen0 GC can be performed") \
BOOL_CONFIG (Gen1GC, "gcGen1", NULL, true, "Specifies whether Gen1 GC can be performed") \
BOOL_CONFIG (UseTHP, "gcTHP", NULL, true, "Specifies whether Transparent Huge Pages can be used. (Linux only)") \
BOOL_CONFIG (TrimmigGC, "gcTrim", NULL, true, "Specifies whether background trimming is enabled") \
INT_CONFIG (GCRate, "gcRate", NULL, -1, "Specifies soft min limit for time between GCs in milliseconds. -1 - default") \
INT_CONFIG (GCSpin, "gcSpin", NULL, -1, "Spin") \
// This class is responsible for retrieving configuration information
// for how the GC should operate.


@ -48,22 +48,22 @@ void SatoriAllocator::Initialize(SatoriHeap* heap)
for (int i = 0; i < Satori::ALLOCATOR_BUCKET_COUNT; i++)
{
m_queues[i] = new (nothrow) SatoriRegionQueue(QueueKind::Allocator);
m_queues[i] = SatoriRegionQueue::AllocAligned(QueueKind::Allocator);
}
m_workChunks = new (nothrow) SatoriWorkList();
m_workChunks = SatoriWorkList::AllocAligned();
m_immortalRegion = nullptr;
m_immortalAlocLock.Initialize();
m_immortalAllocLock.Initialize();
m_pinnedRegion = nullptr;
m_pinnedAlocLock.Initialize();
m_pinnedAllocLock.Initialize();
m_largeRegion = nullptr;
m_largeAlocLock.Initialize();
m_largeAllocLock.Initialize();
m_regularRegion = nullptr;
m_regularAlocLock.Initialize();
m_regularAllocLock.Initialize();
m_singePageAdders = 0;
}
@ -127,7 +127,7 @@ tryAgain:
{
// Reserving a regular-sized Page.
// We will often come here on multiple threads and we do not want all threads to reserve a page.
// If someone alse is reserving, we will allow 1 msec of retrying before reserving a page eagerly.
// If someone else is reserving, we will allow 1 msec of retrying before reserving a page eagerly.
if (newPageDeadline == 0)
{
newPageDeadline = GCToOSInterface::QueryPerformanceCounter() + GCToOSInterface::QueryPerformanceFrequency() / 1000;
@ -216,8 +216,21 @@ void SatoriAllocator::ReturnRegion(SatoriRegion* region)
m_queues[SizeToBucket(region->Size())]->Push(region);
}
void SatoriAllocator::ReturnRegionNoLock(SatoriRegion* region)
{
_ASSERTE(region->IsAttachedToAllocatingOwner() == false);
_ASSERTE(region->Generation() == -1);
_ASSERTE(m_heap->Recycler()->IsBlockingPhase());
m_queues[SizeToBucket(region->Size())]->PushNoLock(region);
}
void SatoriAllocator::AllocationTickIncrement(AllocationTickKind allocationTickKind, size_t totalAdded, SatoriObject* obj, size_t objSize)
{
if (!EVENT_ENABLED(GCAllocationTick_V4))
{
return;
}
size_t& tickAmout = allocationTickKind == AllocationTickKind::Small ?
m_smallAllocTickAmount :
@ -242,6 +255,11 @@ void SatoriAllocator::AllocationTickIncrement(AllocationTickKind allocationTickK
void SatoriAllocator::AllocationTickDecrement(size_t totalUnused)
{
if (!EVENT_ENABLED(GCAllocationTick_V4))
{
return;
}
Interlocked::ExchangeAdd64(&m_smallAllocTickAmount, (size_t)(-(int64_t)totalUnused));
}
@ -292,6 +310,15 @@ Object* SatoriAllocator::Alloc(SatoriAllocationContext* context, size_t size, ui
return nullptr;
}
thread_local
size_t lastSharedRegularAllocUsec;
#ifdef _DEBUG
const size_t minSharedAllocDelay = 1024;
#else
const size_t minSharedAllocDelay = 128;
#endif
SatoriObject* SatoriAllocator::AllocRegular(SatoriAllocationContext* context, size_t size, uint32_t flags)
{
@ -302,17 +329,23 @@ SatoriObject* SatoriAllocator::AllocRegular(SatoriAllocationContext* context, si
SatoriObject* freeObj = context->alloc_ptr != 0 ? context->FinishAllocFromShared() : nullptr;
//m_regularAlocLock.Enter();
if (m_regularAlocLock.TryEnter())
size_t usecNow = m_heap->Recycler()->GetNowUsecs();
if (usecNow - lastSharedRegularAllocUsec > minSharedAllocDelay)
{
if (freeObj && freeObj->ContainingRegion() == m_regularRegion)
{
size_t size = freeObj->Size();
m_regularRegion->SetOccupancy(m_regularRegion->Occupancy() - size);
m_regularRegion->AddFreeSpace(freeObj, size);
}
lastSharedRegularAllocUsec = usecNow;
return AllocRegularShared(context, size, flags);
//m_regularAllocLock.Enter();
if (m_regularAllocLock.TryEnter())
{
if (freeObj && freeObj->ContainingRegion() == m_regularRegion)
{
size_t size = freeObj->FreeObjSize();
m_regularRegion->SetOccupancy(m_regularRegion->Occupancy() - size);
m_regularRegion->ReturnFreeSpace(freeObj, size);
}
return AllocRegularShared(context, size, flags);
}
}
}
else
@ -343,9 +376,22 @@ SatoriObject* SatoriAllocator::AllocRegular(SatoriAllocationContext* context, si
if (moreSpace <= allocRemaining)
{
bool zeroInitialize = !(flags & GC_ALLOC_ZEROING_OPTIONAL);
if (zeroInitialize && moreSpace < SatoriUtil::MinZeroInitSize())
if (zeroInitialize)
{
moreSpace = min(allocRemaining, SatoriUtil::MinZeroInitSize());
if (moreSpace < SatoriUtil::MinZeroInitSize())
{
moreSpace = min(allocRemaining, SatoriUtil::MinZeroInitSize());
}
// " +/- sizeof(size_t)" here is to intentionally misalign alloc_limit on the index granularity
// to improve chances that the object that is allocated here will be indexed
size_t misAlignedOnIndexEnd = ALIGN_UP(region->GetAllocStart() + moreSpace + sizeof(size_t), Satori::INDEX_GRANULARITY) - sizeof(size_t);
size_t misAlignedMoreSpace = misAlignedOnIndexEnd - region->GetAllocStart();
if (misAlignedMoreSpace <= allocRemaining)
{
moreSpace = misAlignedMoreSpace;
}
}
if (region->Allocate(moreSpace, zeroInitialize))
@ -414,9 +460,10 @@ SatoriObject* SatoriAllocator::AllocRegular(SatoriAllocationContext* context, si
region->DetachFromAlocatingOwnerRelease();
m_heap->Recycler()->AddEphemeralRegion(region);
// if we got this far with region not detached, get another one
// TUNING: we could force trying to allocate from shared based on some heuristic
// goto tryAgain;
// if we got this far with region not detached, get another one
}
TryGetRegularRegion(region);
@ -435,7 +482,7 @@ SatoriObject* SatoriAllocator::AllocRegular(SatoriAllocationContext* context, si
// 4) (optional: clear escape tag) Detach
region->AttachToAllocatingOwner(&context->RegularRegion());
if (SatoriUtil::IsThreadLocalGCEnabled())
if (SatoriUtil::IsGen0Enabled())
{
switch (region->ReusableFor())
{
@ -477,9 +524,22 @@ SatoriObject* SatoriAllocator::AllocRegularShared(SatoriAllocationContext* conte
{
// we have enough free space in the region to continue
bool zeroInitialize = !(flags & GC_ALLOC_ZEROING_OPTIONAL);
if (zeroInitialize && moreSpace < SatoriUtil::MinZeroInitSize())
if (zeroInitialize)
{
moreSpace = min(allocRemaining, SatoriUtil::MinZeroInitSize());
if (moreSpace < SatoriUtil::MinZeroInitSize())
{
moreSpace = min(allocRemaining, SatoriUtil::MinZeroInitSize());
}
// " +/- sizeof(size_t)" here is to intentionally misalign alloc_limit on the index granularity
// to improve chances that the object that is allocated here will be indexed
size_t misAlignedOnIndexEnd = ALIGN_UP(region->GetAllocStart() + moreSpace + sizeof(size_t), Satori::INDEX_GRANULARITY) - sizeof(size_t);
size_t misAlignedMoreSpace = misAlignedOnIndexEnd - region->GetAllocStart();
if (misAlignedMoreSpace <= allocRemaining)
{
moreSpace = misAlignedMoreSpace;
}
}
// do not zero-initialize just yet, we will do that after leaving the lock.
@ -487,7 +547,7 @@ SatoriObject* SatoriAllocator::AllocRegularShared(SatoriAllocationContext* conte
if (!result)
{
//OOM, nothing to undo.
m_regularAlocLock.Leave();
m_regularAllocLock.Leave();
return nullptr;
}
@ -497,7 +557,7 @@ SatoriObject* SatoriAllocator::AllocRegularShared(SatoriAllocationContext* conte
{
// OOM, undo the allocation
region->StopAllocating(result->Start());
m_regularAlocLock.Leave();
m_regularAllocLock.Leave();
return nullptr;
}
}
@ -512,7 +572,7 @@ SatoriObject* SatoriAllocator::AllocRegularShared(SatoriAllocationContext* conte
region->IncrementUnfinishedAlloc();
// done with region modifications.
m_regularAlocLock.Leave();
m_regularAllocLock.Leave();
context->alloc_ptr = (uint8_t*)result + size;
context->alloc_bytes += moreSpace;
@ -521,6 +581,7 @@ SatoriObject* SatoriAllocator::AllocRegularShared(SatoriAllocationContext* conte
context->alloc_limit = (uint8_t*)result + moreSpace;
result->CleanSyncBlock();
region->SetIndicesForObject(result, result->Start() + size);
if (zeroInitialize)
{
memset((uint8_t*)result + sizeof(size_t), 0, moreSpace - 2 * sizeof(size_t));
@ -623,8 +684,8 @@ tryAgain:
{
m_heap->Recycler()->MaybeTriggerGC(gc_reason::reason_alloc_loh);
//m_largeAlocLock.Enter();
if (m_largeAlocLock.TryEnter())
//m_largeAllocLock.Enter();
if (m_largeAllocLock.TryEnter())
{
return AllocLargeShared(context, size, flags);
}
@ -688,7 +749,7 @@ tryAgain:
}
// try get from the free list
if (region->StartAllocating(size))
if (region->StartAllocatingBestFit(size))
{
// we have enough free space in the region to continue
continue;
@ -746,7 +807,7 @@ SatoriObject* SatoriAllocator::AllocLargeShared(SatoriAllocationContext* context
if (!result)
{
//OOM, nothing to undo
m_largeAlocLock.Leave();
m_largeAllocLock.Leave();
return nullptr;
}
@ -756,7 +817,7 @@ SatoriObject* SatoriAllocator::AllocLargeShared(SatoriAllocationContext* context
{
// OOM, undo the allocation
region->StopAllocating(result->Start());
m_largeAlocLock.Leave();
m_largeAllocLock.Leave();
return nullptr;
}
@ -766,10 +827,11 @@ SatoriObject* SatoriAllocator::AllocLargeShared(SatoriAllocationContext* context
// region stays unparsable until allocation is complete.
region->IncrementUnfinishedAlloc();
// done with region modifications.
m_largeAlocLock.Leave();
m_largeAllocLock.Leave();
context->alloc_bytes_uoh += size;
result->CleanSyncBlockAndSetUnfinished();
region->SetIndicesForObject(result, result->Start() + size);
if (!(flags & GC_ALLOC_ZEROING_OPTIONAL))
{
memset((uint8_t*)result + sizeof(size_t), 0, size - 2 * sizeof(size_t));
@ -785,7 +847,7 @@ SatoriObject* SatoriAllocator::AllocLargeShared(SatoriAllocationContext* context
}
// try get from the free list
if (region->StartAllocating(size))
if (region->StartAllocatingBestFit(size))
{
// we have enough free space in the region to continue
continue;
@ -811,7 +873,7 @@ SatoriObject* SatoriAllocator::AllocLargeShared(SatoriAllocationContext* context
if (!region)
{
//OOM
m_largeAlocLock.Leave();
m_largeAllocLock.Leave();
return nullptr;
}
@ -850,6 +912,7 @@ SatoriObject* SatoriAllocator::AllocHuge(SatoriAllocationContext* context, size_
}
result->CleanSyncBlock();
hugeRegion->SetIndicesForObject(result, hugeRegion->Start() + Satori::REGION_SIZE_GRANULARITY);
// huge regions are not attached to contexts and in gen0+ would appear parseable,
// but this one is not parseable yet since the new object has no MethodTable
@ -881,7 +944,7 @@ SatoriObject* SatoriAllocator::AllocPinned(SatoriAllocationContext* context, siz
m_heap->Recycler()->MaybeTriggerGC(gc_reason::reason_alloc_soh);
// if can't get a lock, let AllocLarge handle this.
if (!m_pinnedAlocLock.TryEnter())
if (!m_pinnedAllocLock.TryEnter())
{
return AllocLarge(context, size, flags);
}
@ -898,7 +961,7 @@ SatoriObject* SatoriAllocator::AllocPinned(SatoriAllocationContext* context, siz
if (!result)
{
//OOM, nothing to undo
m_pinnedAlocLock.Leave();
m_pinnedAllocLock.Leave();
return nullptr;
}
@ -908,7 +971,7 @@ SatoriObject* SatoriAllocator::AllocPinned(SatoriAllocationContext* context, siz
{
// OOM, undo the allocation
region->StopAllocating(result->Start());
m_pinnedAlocLock.Leave();
m_pinnedAllocLock.Leave();
return nullptr;
}
@ -918,10 +981,11 @@ SatoriObject* SatoriAllocator::AllocPinned(SatoriAllocationContext* context, siz
// region stays unparsable until allocation is complete.
region->IncrementUnfinishedAlloc();
// done with region modifications.
m_pinnedAlocLock.Leave();
m_pinnedAllocLock.Leave();
context->alloc_bytes_uoh += size;
result->CleanSyncBlockAndSetUnfinished();
region->SetIndicesForObject(result, result->Start() + size);
if (!(flags & GC_ALLOC_ZEROING_OPTIONAL))
{
memset((uint8_t*)result + sizeof(size_t), 0, size - 2 * sizeof(size_t));
@ -963,7 +1027,7 @@ SatoriObject* SatoriAllocator::AllocPinned(SatoriAllocationContext* context, siz
if (!region)
{
//OOM
m_pinnedAlocLock.Leave();
m_pinnedAllocLock.Leave();
return nullptr;
}
@ -983,7 +1047,7 @@ SatoriObject* SatoriAllocator::AllocImmortal(SatoriAllocationContext* context, s
// immortal allocs should be way less than region size.
_ASSERTE(size < Satori::REGION_SIZE_GRANULARITY / 2);
SatoriLockHolder<SatoriLock> holder(&m_immortalAlocLock);
SatoriLockHolder holder(&m_immortalAllocLock);
SatoriRegion* region = m_immortalRegion;
while (true)
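The "misalign TLABs" change above extends moreSpace so that alloc_limit lands sizeof(size_t) short of an index-granularity boundary, which makes it more likely the object allocated at the limit gets an index entry. A standalone sketch of the arithmetic with an assumed INDEX_GRANULARITY (the real constant lives in the Satori headers):

#include <cstddef>
#include <cstdio>

constexpr size_t INDEX_GRANULARITY = 0x800;     // assumption for illustration only

constexpr size_t AlignUp(size_t value, size_t alignment)
{
    return (value + alignment - 1) & ~(alignment - 1);
}

int main()
{
    size_t allocStart = 0x100020;               // hypothetical region->GetAllocStart()
    size_t moreSpace  = 0x1000;                 // space we intended to hand to the TLAB

    size_t misAlignedOnIndexEnd =
        AlignUp(allocStart + moreSpace + sizeof(size_t), INDEX_GRANULARITY) - sizeof(size_t);
    size_t misAlignedMoreSpace = misAlignedOnIndexEnd - allocStart;

    std::printf("plain limit      = %#zx\n", allocStart + moreSpace);
    std::printf("misaligned limit = %#zx (%zu bytes before an index boundary)\n",
                misAlignedOnIndexEnd,
                INDEX_GRANULARITY - misAlignedOnIndexEnd % INDEX_GRANULARITY);
    std::printf("moreSpace grows from %#zx to %#zx\n", moreSpace, misAlignedMoreSpace);
    return 0;
}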


@ -56,6 +56,7 @@ public:
SatoriRegion* GetRegion(size_t minSize);
void AddRegion(SatoriRegion* region);
void ReturnRegion(SatoriRegion* region);
void ReturnRegionNoLock(SatoriRegion * region);
void AllocationTickIncrement(AllocationTickKind isSmall, size_t totalAdded, SatoriObject* obj, size_t obj_size);
void AllocationTickDecrement(size_t totalUnused);
@ -72,25 +73,37 @@ private:
SatoriRegionQueue* m_queues[Satori::ALLOCATOR_BUCKET_COUNT];
SatoriWorkList* m_workChunks;
SatoriRegion* m_immortalRegion;
SatoriLock m_immortalAlocLock;
SatoriRegion* m_pinnedRegion;
SatoriSpinLock m_pinnedAlocLock;
SatoriRegion* m_largeRegion;
SatoriSpinLock m_largeAlocLock;
SatoriRegion* m_regularRegion;
SatoriSpinLock m_regularAlocLock;
SatoriRegion* m_largeRegion;
SatoriRegion* m_pinnedRegion;
SatoriRegion* m_immortalRegion;
volatile int32_t m_singePageAdders;
DECLSPEC_ALIGN(64)
SatoriLock m_regularAllocLock;
DECLSPEC_ALIGN(64)
SatoriLock m_immortalAllocLock;
DECLSPEC_ALIGN(64)
SatoriLock m_largeAllocLock;
DECLSPEC_ALIGN(64)
SatoriLock m_pinnedAllocLock;
// for event trace
size_t m_smallAllocTickAmount;
DECLSPEC_ALIGN(64)
size_t m_largeAllocTickAmount;
DECLSPEC_ALIGN(64)
size_t m_pinnedAllocTickAmount;
DECLSPEC_ALIGN(64)
size_t m_smallAllocTickAmount;
private:
DECLSPEC_ALIGN(64)
volatile int32_t m_singePageAdders;
SatoriObject* AllocRegular(SatoriAllocationContext* context, size_t size, uint32_t flags);
SatoriObject* AllocRegularShared(SatoriAllocationContext* context, size_t size, uint32_t flags);
SatoriObject* AllocLarge(SatoriAllocationContext* context, size_t size, uint32_t flags);
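The field reshuffle above puts each allocation lock on its own 64-byte-aligned slot so threads contending on different locks do not bounce the same cache line. A generic false-sharing sketch using standard alignas(64) rather than the runtime's DECLSPEC_ALIGN macro:

#include <atomic>
#include <cstdint>
#include <cstdio>

struct AllocLocks
{
    alignas(64) std::atomic<uint32_t> regularAllocLock{0};
    alignas(64) std::atomic<uint32_t> largeAllocLock{0};
    alignas(64) std::atomic<uint32_t> pinnedAllocLock{0};
};

int main()
{
    AllocLocks locks;
    // Each lock starts a new cache line, so the deltas are multiples of 64.
    std::printf("regular->large delta = %td bytes\n",
                (char*)&locks.largeAllocLock - (char*)&locks.regularAllocLock);
    std::printf("large->pinned  delta = %td bytes\n",
                (char*)&locks.pinnedAllocLock - (char*)&locks.largeAllocLock);
    return 0;
}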


@ -653,17 +653,20 @@ size_t SatoriGC::GetPromotedBytes(int heap_index)
return 0;
}
static uint64_t g_totalLimit;
void SatoriGC::GetMemoryInfo(uint64_t* highMemLoadThresholdBytes, uint64_t* totalAvailableMemoryBytes, uint64_t* lastRecordedMemLoadBytes, uint64_t* lastRecordedHeapSizeBytes, uint64_t* lastRecordedFragmentationBytes, uint64_t* totalCommittedBytes, uint64_t* promotedBytes, uint64_t* pinnedObjectCount, uint64_t* finalizationPendingCount, uint64_t* index, uint32_t* generation, uint32_t* pauseTimePct, bool* isCompaction, bool* isConcurrent, uint64_t* genInfoRaw, uint64_t* pauseInfoRaw, int kind)
{
LastRecordedGcInfo* lastGcInfo = m_heap->Recycler()->GetLastGcInfo((gc_kind)kind);
uint64_t totalLimit = GCToOSInterface::GetPhysicalMemoryLimit();
if (g_totalLimit == 0)
g_totalLimit = GCToOSInterface::GetPhysicalMemoryLimit();
uint64_t totalLimit = g_totalLimit;
*highMemLoadThresholdBytes = totalLimit * 99 / 100; // just say 99% for now
*totalAvailableMemoryBytes = totalLimit;
uint32_t memLoad;
uint64_t availPhysical, availPage;
GCToOSInterface::GetMemoryStatus(totalLimit, &memLoad, &availPhysical, &availPage);
uint32_t memLoad = GetMemoryLoad();
*lastRecordedMemLoadBytes = memLoad * totalLimit / 100;
*lastRecordedHeapSizeBytes = GetTotalBytesInUse();
@ -686,13 +689,22 @@ void SatoriGC::GetMemoryInfo(uint64_t* highMemLoadThresholdBytes, uint64_t* tota
}
}
static uint32_t g_memLoad;
static size_t g_memLoadMsec;
uint32_t SatoriGC::GetMemoryLoad()
{
uint32_t memLoad;
uint64_t availPhysical, availPage;
GCToOSInterface::GetMemoryStatus(0, &memLoad, &availPhysical, &availPage);
size_t time = GetNow();
return memLoad;
// limit querying frequency to once per 16 msec.
if ((time >> 4) != (g_memLoadMsec >> 4))
{
uint64_t availPhysical, availPage;
GCToOSInterface::GetMemoryStatus(0, &g_memLoad, &availPhysical, &availPage);
g_memLoadMsec = time;
}
return g_memLoad;
}
void SatoriGC::DiagGetGCSettings(EtwGCSettingsInfo* etw_settings)
@ -847,7 +859,8 @@ void SatoriGC::BulkMoveWithWriteBarrier(void* dst, const void* src, size_t byteC
memmove(dst, src, byteCount);
if (byteCount >= sizeof(size_t) &&
(!localAssignment || m_heap->Recycler()->IsBarrierConcurrent()))
(!(localAssignment || m_heap->Recycler()->IsNextGcFullGc()) ||
m_heap->Recycler()->IsBarrierConcurrent()))
{
SetCardsAfterBulkCopy((size_t)dst, (size_t)src, byteCount);
}
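In GetMemoryLoad above, the OS memory status is re-queried only when the current millisecond timestamp falls into a different 16 ms bucket than the cached one, which is what the >> 4 comparison does. A toy sketch of the bucketing:

#include <cstdint>
#include <cstdio>

static bool SameBucket(uint64_t msecA, uint64_t msecB)
{
    return (msecA >> 4) == (msecB >> 4);   // buckets are 16 ms wide
}

int main()
{
    std::printf("%d\n", (int)SameBucket(992, 1007));   // 1: same bucket, reuse the cached load
    std::printf("%d\n", (int)SameBucket(1007, 1008));  // 0: boundary crossed, re-query the OS
    return 0;
}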


@ -0,0 +1,87 @@
// Copyright (c) 2024 Vladimir Sadov
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// SatoriGate.cpp
//
#ifdef TARGET_WINDOWS
#include "common.h"
#include "windows.h"
#include "synchapi.h"
#include "SatoriGate.h"
SatoriGate::SatoriGate()
{
m_state = s_blocking;
}
// If this gate is in blocking state, the thread will block
// until woken up, possibly spuriously.
void SatoriGate::Wait()
{
uint32_t blocking = s_blocking;
BOOL result = WaitOnAddress(&m_state, &blocking, sizeof(uint32_t), INFINITE);
_ASSERTE(result == TRUE);
m_state = s_blocking;
}
// If this gate is in blocking state, the thread will block
// until woken up, possibly spuriously.
// or until the wait times out. (in a case of timeout returns false)
bool SatoriGate::TimedWait(int timeout)
{
uint32_t blocking = s_blocking;
BOOL result = WaitOnAddress(&m_state, &blocking, sizeof(uint32_t), timeout);
_ASSERTE(result == TRUE || GetLastError() == ERROR_TIMEOUT);
bool woken = result == TRUE;
if (woken)
{
// consume the wake
m_state = s_blocking;
}
return woken;
}
// After this call at least one thread will go through the gate, either by waking up,
// or by going through Wait without blocking.
// If there are several racing wakes, one or more may take effect,
// but all wakes will see at least one thread going through the gate.
void SatoriGate::WakeOne()
{
m_state = s_open;
WakeByAddressSingle((PVOID)&m_state);
}
// Same as WakeOne, but if there are multiple waiters sleeping,
// all will be woken up and go through the gate.
void SatoriGate::WakeAll()
{
m_state = s_open;
WakeByAddressAll((PVOID)&m_state);
}
#endif
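SatoriGate is a thin futex-style gate: waiters block while the state says "blocking", must tolerate spurious wakes, and re-arm the gate after consuming a wake. A portable C++20 analogue, not the runtime's code, using std::atomic wait/notify in place of WaitOnAddress/WakeByAddress:

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>

static std::atomic<uint32_t> gate{0};        // 0 = blocking, 1 = open
static std::atomic<bool>     haveWork{false};

static void Worker()
{
    while (!haveWork.load())
    {
        gate.wait(0);                        // returns when state != 0, possibly spuriously
        gate.store(0);                       // consume the wake and re-arm, like SatoriGate::Wait
    }
    std::printf("worker observed the published work\n");
}

int main()
{
    std::thread t(Worker);
    haveWork.store(true);                    // publish the work first...
    gate.store(1);                           // ...then open the gate
    gate.notify_one();                       // WakeOne analogue
    t.join();
    return 0;
}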


@ -0,0 +1,60 @@
// Copyright (c) 2024 Vladimir Sadov
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// SatoriGate.h
//
#ifndef __SATORI_GATE_H__
#define __SATORI_GATE_H__
#include <stdint.h>
class SatoriGate
{
private:
static const uint32_t s_open = 1;
static const uint32_t s_blocking = 0;
volatile uint32_t m_state;
#if defined(_INC_PTHREADS)
pthread_mutex_t* m_cs;
pthread_cond_t* m_cv;
#else
size_t* dummy1;
size_t* dummy2;
#endif
public:
SatoriGate();
void Wait();
bool TimedWait(int timeout);
void WakeOne();
void WakeAll();
};
#endif


@ -116,7 +116,7 @@ bool SatoriHeap::CommitMoreMap(size_t currentCommittedMapSize)
void* commitFrom = (void*)((size_t)&m_pageMap + currentCommittedMapSize);
size_t commitSize = SatoriUtil::CommitGranularity();
SatoriLockHolder<SatoriSpinLock> holder(&m_mapLock);
SatoriLockHolder holder(&m_mapLock);
if (currentCommittedMapSize <= m_committedMapSize)
{
if (GCToOSInterface::VirtualCommit(commitFrom, commitSize))


@ -132,7 +132,7 @@ private:
size_t m_committedMapSize;
size_t m_usedMapLength;
size_t m_nextPageIndex;
SatoriSpinLock m_mapLock;
SatoriLock m_mapLock;
SatoriPage* m_pageMap[1];
static int8_t* s_pageByteMap;


@ -30,3 +30,189 @@
#include "../env/gcenv.os.h"
#include "SatoriLock.h"
NOINLINE
bool SatoriLock::EnterSlow(bool noBlock)
{
bool hasWaited = false;
// we will retry after waking up
while (true)
{
int iteration = 1;
// We will count when we failed to change the state of the lock and increase pauses
// so that bursts of activity are better tolerated. This should not happen often.
int collisions = 0;
// We will track the changes of ownership while we are trying to acquire the lock.
size_t oldOwner = _owningThreadId;
uint32_t ownerChanged = 0;
int iterationLimit = _spinCount >> SpinCountScaleShift;
// inner loop where we try acquiring the lock or registering as a waiter
while (true)
{
//
// Try to grab the lock. We may take the lock here even if there are existing waiters. This creates the possibility
// of starvation of waiters, but it also prevents lock convoys and preempted waiters from destroying perf.
// However, if we do not see _wakeWatchDog cleared for long enough, we go into YieldToWaiters mode to ensure some
// waiter progress.
//
uint32_t oldState = _state;
bool canAcquire = ((oldState & Locked) == Unlocked) &&
(hasWaited || ((oldState & YieldToWaiters) == 0));
if (canAcquire)
{
uint32_t newState = oldState | Locked;
if (hasWaited)
newState = (newState - WaiterCountIncrement) & ~(WaiterWoken | YieldToWaiters);
if (CompareExchangeAcq(&_state, newState, oldState))
{
// GOT THE LOCK!!
_ASSERTE((_state | Locked) != 0);
_ASSERTE(_owningThreadId == 0);
_owningThreadId = SatoriUtil::GetCurrentThreadTag();
if (hasWaited)
_wakeWatchDog = 0;
// now we can estimate how busy the lock is and adjust spinning accordingly
uint16_t spinLimit = _spinCount;
if (ownerChanged != 0)
{
// The lock has changed ownership while we were trying to acquire it.
// It is a signal that we might want to spin less next time.
// Pursuing a lock that is being "stolen" by other threads is inefficient
// due to cache misses and unnecessary sharing of state that keeps invalidating.
if (spinLimit > DefaultMinSpinCount)
{
_spinCount = (uint16_t)(spinLimit - 1);
}
}
else if (spinLimit < DefaultMaxSpinCount &&
iteration >= (spinLimit >> SpinCountScaleShift))
{
// we used all of the allowed iterations, but the lock does not look very contested,
// we can allow a bit more spinning.
//
// NB: if we acquired the lock while registering a waiter, and owner did not change it still counts.
// (however iteration does not grow beyond the iterationLimit)
_spinCount = (uint16_t)(spinLimit + 1);
}
return true;
}
}
size_t newOwner = _owningThreadId;
if (newOwner != 0 && newOwner != oldOwner)
{
if (oldOwner != 0)
ownerChanged++;
oldOwner = newOwner;
}
if (iteration < iterationLimit)
{
// We failed to acquire the lock and want to retry after a pause.
// Ideally we will retry right when the lock becomes free, but we cannot know when that will happen.
// We will use a pause that doubles up on every iteration. It will not be more than 2x worse
// than the ideal guess, while minimizing the number of retries.
// We will allow pauses up to 64~128 spinwaits.
IterationBackoff(min(iteration, 6));
iteration++;
continue;
}
else if (!canAcquire)
{
// We reached our spin limit, and need to wait.
if (noBlock)
return false;
// If a waiter was woken spuriously, it may acquire the lock before the wake watchdog is set.
// If there are no more waiters for a long time, the watchdog could hang around for a while too.
// When a new waiter enters the system, it may look like we had no waiter progress for all that time.
// To avoid this, if it looks like we have no waiters and will be the first new one,
// clear the watchdog.
// It is ok to clear even if we will not end up the first one.
// We will self-correct on the next wake and reestablish a new watchdog.
if (oldState < WaiterCountIncrement && _wakeWatchDog !=0)
_wakeWatchDog = 0;
// Increment the waiter count.
// Note that we do not do any overflow checking on this increment. In order to overflow,
// we'd need to have about 1 billion waiting threads, which is inconceivable anytime in the
// foreseeable future.
uint32_t newState = oldState + WaiterCountIncrement;
if (hasWaited)
newState = (newState - WaiterCountIncrement) & ~WaiterWoken;
if (Interlocked::CompareExchange(&_state, newState, oldState) == oldState)
break;
}
CollisionBackoff(++collisions);
}
//
// Now we wait.
//
_ASSERTE(_state >= WaiterCountIncrement);
_gate->Wait();
_ASSERTE(_state >= WaiterCountIncrement);
// this was either real or spurious wake.
// either way try acquire again.
hasWaited = true;
}
}
NOINLINE
void SatoriLock::AwakeWaiterIfNeeded()
{
int collisions = 0;
while (true)
{
uint32_t oldState = _state;
if ((int32_t)oldState >= (int32_t)WaiterCountIncrement) // false if WaiterWoken is set
{
// there are waiters, and nobody has woken one.
uint32_t newState = oldState | WaiterWoken;
uint16_t lastWakeTicks = _wakeWatchDog;
if (lastWakeTicks != 0)
{
uint16_t currentTicks = GetTickCount();
if ((int16_t)currentTicks - (int16_t)lastWakeTicks > (int16_t)WaiterWatchdogTicks)
{
//printf("Last: %i ", (int)lastWakeTicks);
//printf("Current: %i \n", (int)currentTicks);
newState |= YieldToWaiters;
}
}
if (Interlocked::CompareExchange(&_state, newState, oldState) == oldState)
{
if (lastWakeTicks == 0)
{
// Sometimes timestamp will be 0.
// It is harmless. We will try again on the next wake
_wakeWatchDog = GetTickCount();
}
_gate->WakeOne();
return;
}
}
else
{
// no need to wake a waiter.
return;
}
CollisionBackoff(++collisions);
}
}
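The CollisionBackoff used above (declared in SatoriLock.h below) spreads retrying threads out with a cheap pseudo-random spin whose upper bound doubles with each failed attempt, capped at 2^10 spin-waits. A standalone sketch of that arithmetic; the timestamp source here is a simplification, not the runtime's GetCheapTimeStamp:

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdio>

static void CollisionBackoff(uint32_t collisions)
{
    const uint32_t MaxExponentialBackoffBits = 10;

    // Cheap randomness: hash a stack address and a timestamp with the
    // Fibonacci-hashing constant, then keep only the top `collisions` bits.
    uint64_t now   = (uint64_t)std::chrono::steady_clock::now().time_since_epoch().count();
    uint32_t rand  = ((uint32_t)(size_t)&collisions + (uint32_t)now) * 2654435769u;
    uint32_t shift = 32 - std::min(collisions, MaxExponentialBackoffBits);
    uint32_t spins = rand >> shift;

    volatile uint32_t sink = 0;
    for (uint32_t i = 0; i < spins; i++)
        sink++;                              // stands in for YieldProcessor()

    std::printf("collisions=%u -> spins this time: %u (bound: %u)\n",
                collisions, spins, (1u << (32 - shift)) - 1);
}

int main()
{
    for (uint32_t c = 1; c <= 5; c++)
        CollisionBackoff(c);
    return 0;
}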


@ -30,125 +30,251 @@
#include "common.h"
#include "../gc.h"
#include "SatoriUtil.h"
#include "SatoriGate.h"
#if defined(TARGET_OSX)
#include <time.h>
#endif
class SatoriLock
{
private:
CLRCriticalSection m_cs;
// m_state layout:
//
// bit 0: True if the lock is held, false otherwise.
//
// bit 1: True if nonwaiters must not get ahead of waiters when acquiring a contended lock.
//
// sign bit: True if we've set the event to wake a waiting thread. The waiter resets this to false when it
// wakes up. This avoids the overhead of setting the event multiple times.
//
// everything else: A count of the number of threads waiting on the event.
static const uint32_t Unlocked = 0;
static const uint32_t Locked = 1;
static const uint32_t YieldToWaiters = 2;
static const uint32_t WaiterCountIncrement = 4;
static const uint32_t WaiterWoken = 1u << 31;
public:
void Initialize()
{
m_cs.Initialize();
}
volatile uint32_t _state;
volatile uint16_t _spinCount;
volatile uint16_t _wakeWatchDog;
volatile size_t _owningThreadId;
void Destroy()
{
m_cs.Destroy();
}
void Enter()
{
m_cs.Enter();
}
void Leave()
{
m_cs.Leave();
}
};
class SatoriSpinLock
{
private:
int m_backoff;
public:
void Initialize()
{
m_backoff = 0;
}
void Enter()
{
if (!CompareExchangeAcq(&m_backoff, 1, 0))
{
EnterSpin();
}
}
bool TryEnter()
{
return CompareExchangeAcq(&m_backoff, 1, 0);
}
void Leave()
{
_ASSERTE(m_backoff);
VolatileStore(&m_backoff, 0);
}
SatoriGate* _gate;
private:
NOINLINE
void EnterSpin()
{
int localBackoff = 0;
while (VolatileLoadWithoutBarrier(&m_backoff) ||
!CompareExchangeAcq(&m_backoff, 1, 0))
{
localBackoff = Backoff(localBackoff);
}
}
int Backoff(int backoff)
{
// TUNING: do we care about 1-proc machines?
for (int i = 0; i < backoff; i++)
{
YieldProcessor();
if ((i & 0x3FF) == 0x3FF)
{
GCToOSInterface::YieldThread(0);
}
}
return (backoff * 2 + 1) & 0x3FFF;
}
static bool CompareExchangeAcq(int volatile* destination, int exchange, int comparand)
FORCEINLINE
static bool CompareExchangeAcq(uint32_t volatile* destination, uint32_t exchange, uint32_t comparand)
{
#ifdef _MSC_VER
#if defined(TARGET_AMD64)
return _InterlockedCompareExchange((long*)destination, exchange, comparand) == comparand;
return _InterlockedCompareExchange((long*)destination, exchange, comparand) == (long)comparand;
#else
return _InterlockedCompareExchange_acq((long*)destination, exchange, comparand) == comparand;
return _InterlockedCompareExchange_acq((long*)destination, exchange, comparand) == (long)comparand;
#endif
#else
return __atomic_compare_exchange_n(destination, &comparand, exchange, true, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
#endif
}
FORCEINLINE
static uint32_t InterlockedDecRel(volatile uint32_t* arg)
{
#ifdef _MSC_VER
#if defined(TARGET_AMD64)
return (uint32_t)_InterlockedDecrement((long*)arg);
#else
return (uint32_t)_InterlockedDecrement_rel((long*)arg);
#endif
#else
return __atomic_sub_fetch(arg, 1, __ATOMIC_RELEASE);
#endif
}
FORCEINLINE
static int64_t GetCheapTimeStamp()
{
#if defined(TARGET_AMD64)
#ifdef _MSC_VER
return __rdtsc();
#else
ptrdiff_t cycles;
ptrdiff_t cyclesHi;
__asm__ __volatile__
("rdtsc":"=a" (cycles), "=d" (cyclesHi));
return (cyclesHi << 32) | cycles;
#endif
#elif defined(TARGET_ARM64)
// On arm64 just read timer register instead
#ifdef _MSC_VER
#define ARM64_CNTVCT_EL0 ARM64_SYSREG(3,3,14,0,2)
return _ReadStatusReg(ARM64_CNTVCT_EL0);
#elif defined(TARGET_LINUX) || defined(TARGET_OSX)
int64_t timerTicks;
asm volatile("mrs %0, cntvct_el0" : "=r"(timerTicks));
return timerTicks;
#else
Unsupported platform?
#endif
#else
Unsupported architecture?
#endif
}
static const uint16_t SpinCountNotInitialized = INT16_MIN;
// While spinning is parameterized in terms of iterations,
// the internal tuning operates with spin count at a finer scale.
// One iteration is mapped to 64 spin count units.
static const int SpinCountScaleShift = 6;
static const uint16_t DefaultMaxSpinCount = 22 << SpinCountScaleShift;
static const uint16_t DefaultMinSpinCount = 1 << SpinCountScaleShift;
// We will use exponential backoff in rare cases when we need to change state atomically and cannot
// make progress due to concurrent state changes by other threads.
// While we cannot know the ideal amount of wait needed before making a successful attempt,
// the exponential backoff will generally be not more than 2X worse than the perfect guess and
// will make far fewer attempts than a simple retry. On a multiprocessor machine, fruitless attempts
// will cause unnecessary sharing of the contended state which may make modifying the state more expensive.
// To protect against degenerate cases we will cap the per-iteration wait to 1024 spinwaits.
static const uint32_t MaxExponentialBackoffBits = 10;
// This lock is unfair and permits acquiring a contended lock by a nonwaiter in the presence of waiters.
// It is possible for one thread to keep holding the lock long enough that waiters go to sleep and
// then release and reacquire fast enough that waiters have no chance to get the lock.
// In extreme cases one thread could keep retaking the lock starving everybody else.
// If we see woken waiters not able to take the lock for too long we will ask nonwaiters to wait.
static const uint32_t WaiterWatchdogTicks = 60;
public:
void Initialize()
{
_state = 0;
_spinCount = DefaultMinSpinCount;
_wakeWatchDog = 0;
_owningThreadId = 0;
_gate = new (nothrow) SatoriGate();
}
FORCEINLINE
bool TryEnterOneShot()
{
uint32_t origState = _state;
if ((origState & (YieldToWaiters | Locked)) == 0)
{
uint32_t newState = origState + Locked;
if (CompareExchangeAcq(&_state, newState, origState))
{
_ASSERTE(_owningThreadId == 0);
_owningThreadId = SatoriUtil::GetCurrentThreadTag();
return true;
}
}
return false;
}
FORCEINLINE
bool TryEnter()
{
return TryEnterOneShot() ||
EnterSlow(/*noBlock*/true);
}
FORCEINLINE
void Enter()
{
if (!TryEnterOneShot())
{
bool entered = EnterSlow();
_ASSERTE(entered);
}
}
bool IsLocked()
{
return (_state & Locked) != 0;
}
FORCEINLINE
void Leave()
{
_ASSERTE(IsLocked());
_ASSERTE(_owningThreadId == SatoriUtil::GetCurrentThreadTag());
_owningThreadId = 0;
uint32_t state = InterlockedDecRel(&_state);
if ((int32_t)state < (int32_t)WaiterCountIncrement) // true if have no waiters or WaiterWoken is set
{
return;
}
//
// We have waiters; take the slow path.
//
AwakeWaiterIfNeeded();
}
static void CollisionBackoff(uint32_t collisions)
{
_ASSERTE(collisions > 0);
// no need for much randomness here, we will just hash the stack location and a timestamp.
uint32_t rand = ((uint32_t)(size_t)&collisions + (uint32_t)GetCheapTimeStamp()) * 2654435769u;
uint32_t spins = rand >> (uint8_t)((uint32_t)32 - min(collisions, MaxExponentialBackoffBits));
for (int i = 0; i < (int)spins; i++)
{
YieldProcessor();
}
}
private:
static uint16_t GetTickCount()
{
return (uint16_t)GCToOSInterface::GetLowPrecisionTimeStamp();
}
// same idea as in CollisionBackoff, but with guaranteed minimum wait
static void IterationBackoff(int iteration)
{
_ASSERTE(iteration > 0 && iteration < MaxExponentialBackoffBits);
uint32_t rand = ((uint32_t)(size_t)&iteration + (uint32_t)GetCheapTimeStamp()) * 2654435769u;
// set the highest bit to ensure the minimum number of spins is exponentially increasing
// it basically guarantees that we spin at least 1, 2, 4, 8, 16 times, and so on
rand |= (1u << 31);
uint32_t spins = rand >> (uint8_t)(32 - iteration);
for (int i = 0; i < (int)spins; i++)
{
YieldProcessor();
}
}
NOINLINE
bool EnterSlow(bool noBlock = false);
NOINLINE
void AwakeWaiterIfNeeded();
};
template <typename T>
class SatoriLockHolder : public Satori::StackOnly {
private:
T* const m_lock;
SatoriLock* const m_lock;
public:
// Disallow copying
SatoriLockHolder& operator=(const SatoriLockHolder&) = delete;
SatoriLockHolder(const SatoriLockHolder&) = delete;
SatoriLockHolder(T* lock)
SatoriLockHolder(SatoriLock* lock)
: m_lock(lock)
{
m_lock->Enter();
}
SatoriLockHolder(T* lock, bool isLocked)
SatoriLockHolder(SatoriLock* lock, bool isLocked)
: m_lock(lock)
{
if (!isLocked)


@ -54,6 +54,7 @@ public:
SatoriObject* Next();
size_t Size();
size_t FreeObjSize();
bool SameRegion(SatoriRegion* otherRegion);
bool IsFree();
bool IsExternal();
@ -95,6 +96,9 @@ public:
template <typename F>
void ForEachObjectRef(F lambda, bool includeCollectibleAllocator = false);
template <typename F>
void ForEachObjectRef(F lambda, size_t size, bool includeCollectibleAllocator = false);
template <typename F>
void ForEachObjectRef(F lambda, size_t start, size_t end);


@ -48,6 +48,15 @@ FORCEINLINE size_t SatoriObject::Size()
return size;
}
FORCEINLINE size_t SatoriObject::FreeObjSize()
{
_ASSERTE(IsFree());
size_t size = Satori::MIN_FREE_SIZE;
size += (size_t)((ArrayBase*)this)->GetNumComponents();
size = ALIGN_UP(size, Satori::OBJECT_ALIGNMENT);
return size;
}
inline size_t SatoriObject::Start()
{
return (size_t)this;
@ -303,6 +312,72 @@ inline void SatoriObject::ForEachObjectRef(F lambda, bool includeCollectibleAllo
}
}
template <typename F>
inline void SatoriObject::ForEachObjectRef(F lambda, size_t size, bool includeCollectibleAllocator)
{
MethodTable* mt = RawGetMethodTable();
if (includeCollectibleAllocator && mt->Collectible())
{
uint8_t* loaderAllocator = GCToEEInterface::GetLoaderAllocatorObjectForGC(this);
// NB: Allocator ref location is fake. The actual location is a handle.
// For that same reason relocation callers should not care about the location.
lambda((SatoriObject**)&loaderAllocator);
}
if (!mt->ContainsPointers())
{
return;
}
CGCDesc* map = CGCDesc::GetCGCDescFromMT(mt);
CGCDescSeries* cur = map->GetHighestSeries();
// GetNumSeries is actually signed.
// Negative value means the pattern repeats componentNum times (struct arrays)
ptrdiff_t numSeries = (ptrdiff_t)map->GetNumSeries();
if (numSeries >= 0)
{
CGCDescSeries* last = map->GetLowestSeries();
do
{
size_t refPtr = (size_t)this + cur->GetSeriesOffset();
// series size is offset by the object size, so need to compensate for that.
size_t refPtrStop = refPtr + size + cur->GetSeriesSize();
// top check loop. this could be a zero-element array
while (refPtr < refPtrStop)
{
lambda((SatoriObject**)refPtr);
refPtr += sizeof(size_t);
}
cur--;
} while (cur >= last);
}
else
{
// repeating pattern - an array
size_t refPtr = (size_t)this + cur->GetSeriesOffset();
uint32_t componentNum = ((ArrayBase*)this)->GetNumComponents();
while (componentNum-- > 0)
{
for (ptrdiff_t i = 0; i > numSeries; i--)
{
val_serie_item item = *(cur->val_serie + i);
size_t refPtrStop = refPtr + item.nptrs * sizeof(size_t);
do
{
lambda((SatoriObject**)refPtr);
refPtr += sizeof(size_t);
} while (refPtr < refPtrStop);
refPtr += item.skip;
}
}
}
}
template <typename F>
inline void SatoriObject::ForEachObjectRef(F lambda, size_t start, size_t end)
{


@ -39,7 +39,7 @@ SatoriPage* SatoriPage::InitializeAt(size_t address, size_t pageSize, SatoriHeap
{
_ASSERTE(pageSize % Satori::PAGE_SIZE_GRANULARITY == 0);
SatoriPage* result = (SatoriPage*)GCToOSInterface::VirtualReserve((void*)address, pageSize);
SatoriPage* result = (SatoriPage*)GCToOSInterface::VirtualReserve((void*)address, pageSize, SatoriUtil::UseTHP());
if (result == nullptr)
{
return result;
@ -70,16 +70,23 @@ SatoriPage* SatoriPage::InitializeAt(size_t address, size_t pageSize, SatoriHeap
// conservatively assume the first useful card word to cover the start of the first region.
size_t cardTableStart = (result->m_firstRegion - address) / Satori::BYTES_PER_CARD_BYTE;
// this is also region map size
size_t regionNumber = pageSize >> Satori::REGION_BITS;
size_t cardGroupSize = regionNumber * 2;
size_t regionCount = pageSize >> Satori::REGION_BITS;
size_t cardGroupSize = pageSize / Satori::BYTES_PER_CARD_GROUP * 2;
// initializing to EPHEMERAL is optional. 0 is ok too.
// initializing cards to EPHEMERAL is optional. 0 is ok too.
// for huge pages it is not as useful and may get expensive.
// also if the huge obj contains references, its region will go to gen2 anyways
if (pageSize == Satori::PAGE_SIZE_GRANULARITY)
{
#if _DEBUG
// in debug we initialize only half the cards, since it is optional...
memset(&result->m_cardTable[cardTableStart], Satori::CardState::EPHEMERAL, (cardTableSize - cardTableStart) / 2);
#else
memset(&result->m_cardTable[cardTableStart], Satori::CardState::EPHEMERAL, cardTableSize - cardTableStart);
memset(&result->m_cardGroups, Satori::CardState::EPHEMERAL, cardGroupSize);
#endif
// We leave card groups blank as we do not want to look at them when they may not even be covered by regions.
// We maintain the invariant that groups are not set for allocator regions.
// We do not use EPHEMERAL for groups.
}
result->m_cardTableStart = cardTableStart;
@ -90,7 +97,7 @@ SatoriPage* SatoriPage::InitializeAt(size_t address, size_t pageSize, SatoriHeap
result->m_regionMap = (uint8_t*)(address + 128 + cardGroupSize);
// make sure the first useful card word is beyond the header.
_ASSERTE(result->Start() + cardTableStart > (size_t)(result->m_regionMap) + regionNumber);
_ASSERTE(result->Start() + cardTableStart > (size_t)(result->m_regionMap) + regionCount);
return result;
}
@ -157,7 +164,7 @@ SatoriRegion* SatoriPage::NextInPage(SatoriRegion* region)
SatoriRegion* SatoriPage::RegionForCardGroup(size_t group)
{
size_t mapIndex = group;
size_t mapIndex = group * Satori::BYTES_PER_CARD_GROUP / Satori::REGION_SIZE_GRANULARITY;
while (RegionMap()[mapIndex] > 1)
{
mapIndex -= ((size_t)1 << (RegionMap()[mapIndex] - 2));
@ -205,8 +212,8 @@ void SatoriPage::SetCardsForRange(size_t start, size_t end)
memset((void*)(m_cardTable + firstCard), Satori::CardState::REMEMBERED, lastCard - firstCard + 1);
size_t firstGroup = firstByteOffset / Satori::REGION_SIZE_GRANULARITY;
size_t lastGroup = lastByteOffset / Satori::REGION_SIZE_GRANULARITY;
size_t firstGroup = firstByteOffset / Satori::BYTES_PER_CARD_GROUP;
size_t lastGroup = lastByteOffset / Satori::BYTES_PER_CARD_GROUP;
for (size_t i = firstGroup; i <= lastGroup; i++)
{
if (!m_cardGroups[i * 2])
@ -235,7 +242,7 @@ void SatoriPage::DirtyCardForAddress(size_t address)
// so the card dirtying write can be unordered.
m_cardTable[cardByteOffset] = Satori::CardState::DIRTY;
size_t cardGroup = offset / Satori::REGION_SIZE_GRANULARITY;
size_t cardGroup = offset / Satori::BYTES_PER_CARD_GROUP;
VolatileStore(&this->m_cardGroups[cardGroup * 2], Satori::CardState::DIRTY);
VolatileStore(&this->m_cardState, Satori::CardState::DIRTY);
}
@ -262,8 +269,8 @@ void SatoriPage::DirtyCardsForRange(size_t start, size_t end)
// cleaning will read in the opposite order
VolatileStoreBarrier();
size_t firstGroup = firstByteOffset / Satori::REGION_SIZE_GRANULARITY;
size_t lastGroup = lastByteOffset / Satori::REGION_SIZE_GRANULARITY;
size_t firstGroup = firstByteOffset / Satori::BYTES_PER_CARD_GROUP;
size_t lastGroup = lastByteOffset / Satori::BYTES_PER_CARD_GROUP;
for (size_t i = firstGroup; i <= lastGroup; i++)
{
this->m_cardGroups[i * 2] = Satori::CardState::DIRTY;
@ -294,8 +301,8 @@ void SatoriPage::DirtyCardsForRangeConcurrent(size_t start, size_t end)
// we do not clean groups concurrently, so these can be conditional and unordered
// only the eventual final state matters
size_t firstGroup = firstByteOffset / Satori::REGION_SIZE_GRANULARITY;
size_t lastGroup = lastByteOffset / Satori::REGION_SIZE_GRANULARITY;
size_t firstGroup = firstByteOffset / Satori::BYTES_PER_CARD_GROUP;
size_t lastGroup = lastByteOffset / Satori::BYTES_PER_CARD_GROUP;
for (size_t i = firstGroup; i <= lastGroup; i++)
{
if (m_cardGroups[i * 2] != Satori::CardState::DIRTY)
@ -310,7 +317,17 @@ void SatoriPage::DirtyCardsForRangeConcurrent(size_t start, size_t end)
}
}
void SatoriPage::WipeCardsForRange(size_t start, size_t end, bool isTenured)
void SatoriPage::WipeGroupsForRange(size_t start, size_t end)
{
size_t firstByteOffset = start - Start();
size_t lastByteOffset = end - Start() - 1;
size_t firstGroup = firstByteOffset / Satori::BYTES_PER_CARD_GROUP;
size_t lastGroup = lastByteOffset / Satori::BYTES_PER_CARD_GROUP;
memset((void*)&m_cardGroups[firstGroup * 2], Satori::CardState::BLANK, (lastGroup - firstGroup + 1) * 2);
}
void SatoriPage::ResetCardsForRange(size_t start, size_t end, bool isTenured)
{
size_t firstByteOffset = start - Start();
size_t lastByteOffset = end - Start() - 1;
@ -325,7 +342,5 @@ void SatoriPage::WipeCardsForRange(size_t start, size_t end, bool isTenured)
int8_t resetValue = isTenured ? Satori::CardState::BLANK : Satori::CardState::EPHEMERAL;
memset((void*)(m_cardTable + firstCard), resetValue, lastCard - firstCard + 1);
size_t firstGroup = firstByteOffset / Satori::REGION_SIZE_GRANULARITY;
size_t lastGroup = lastByteOffset / Satori::REGION_SIZE_GRANULARITY;
memset((void*)&m_cardGroups[firstGroup * 2], resetValue, (lastGroup - firstGroup + 1) * 2);
WipeGroupsForRange(start, end);
}
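With the "shorter card groups" change above, group indices are computed from BYTES_PER_CARD_GROUP rather than one group per region granule, so a dirtied range can span several groups inside a single region. A sketch of the index arithmetic with an assumed group size (the real constant is defined in the Satori headers):

#include <cstddef>
#include <cstdio>

constexpr size_t BYTES_PER_CARD_GROUP = 1 << 20;    // assumed value, for illustration only

int main()
{
    size_t pageStart  = 0x200000000;                // hypothetical page start
    size_t rangeStart = pageStart + 0x340000;
    size_t rangeEnd   = pageStart + 0x560000;       // exclusive end of the dirtied range

    size_t firstGroup = (rangeStart - pageStart) / BYTES_PER_CARD_GROUP;
    size_t lastGroup  = (rangeEnd - pageStart - 1) / BYTES_PER_CARD_GROUP;

    // DirtyCardsForRange-style loops touch every group in [firstGroup, lastGroup].
    std::printf("groups %zu..%zu cover the range\n", firstGroup, lastGroup);
    return 0;
}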


@ -69,7 +69,7 @@ public:
{
m_cardTable[cardByteOffset] = Satori::CardState::REMEMBERED;
size_t cardGroup = offset / Satori::REGION_SIZE_GRANULARITY;
size_t cardGroup = offset / Satori::BYTES_PER_CARD_GROUP;
if (!m_cardGroups[cardGroup * 2])
{
m_cardGroups[cardGroup * 2] = Satori::CardState::REMEMBERED;
@ -100,7 +100,7 @@ public:
// we do not clean groups concurrently, so these can be conditional and unordered
// only the eventual final state matters
size_t cardGroup = offset / Satori::REGION_SIZE_GRANULARITY;
size_t cardGroup = offset / Satori::BYTES_PER_CARD_GROUP;
if (m_cardGroups[cardGroup * 2] != Satori::CardState::DIRTY)
{
m_cardGroups[cardGroup * 2] = Satori::CardState::DIRTY;
@ -117,7 +117,8 @@ public:
void DirtyCardsForRange(size_t start, size_t length);
void DirtyCardsForRangeConcurrent(size_t start, size_t end);
void WipeCardsForRange(size_t start, size_t end, bool isTenured);
void WipeGroupsForRange(size_t start, size_t end);
void ResetCardsForRange(size_t start, size_t end, bool isTenured);
volatile int8_t& CardState();
volatile int8_t& ScanTicket();
@ -162,14 +163,14 @@ private:
// ----- we can have a few more fields above as long as m_cardGroups starts at offset 128.
// that can be adjusted if needed
// computed size,
// computed size, located after card groups
// 1byte per region
// 512 bytes per 1Gb
uint8_t* m_regionMap;
// computed size,
// 2byte per region
// 1024 bytes per 1Gb
// 2 byte per card group (4 per region granule)
// 2048 bytes per 1Gb
DECLSPEC_ALIGN(128)
int8_t m_cardGroups[1];
};


@ -82,7 +82,7 @@ inline volatile int8_t& SatoriPage::CardGroupScanTicket(size_t i)
inline size_t SatoriPage::CardGroupCount()
{
return (End() - Start()) >> Satori::REGION_BITS;
return (End() - Start()) / Satori::BYTES_PER_CARD_GROUP;
}
inline int8_t* SatoriPage::CardsForGroup(size_t i)


@ -75,7 +75,7 @@ public:
_ASSERTE(item->m_prev == nullptr);
_ASSERTE(item->m_containingQueue == nullptr);
SatoriLockHolder<SatoriSpinLock> holder(&m_lock);
SatoriLockHolder holder(&m_lock);
m_count++;
item->m_containingQueue = this;
if (m_head == nullptr)
@ -92,6 +92,31 @@ public:
m_head = item;
}
void PushNoLock(T* item)
{
_ASSERTE(item->m_next == nullptr);
_ASSERTE(item->m_prev == nullptr);
_ASSERTE(item->m_containingQueue == nullptr);
size_t oldCount = m_count;
Interlocked::Increment(&m_count);
T* head = Interlocked::ExchangePointer(&m_head, item);
if (head == nullptr)
{
_ASSERTE(m_tail == nullptr);
m_tail = item;
}
else
{
item->m_next = head;
head->m_prev = item;
}
item->m_containingQueue = this;
_ASSERTE(m_count > oldCount);
}
T* TryPop()
{
if (IsEmpty())
@ -101,23 +126,24 @@ public:
T* result;
{
SatoriLockHolder<SatoriSpinLock> holder(&m_lock);
SatoriLockHolder holder(&m_lock);
result = m_head;
if (result == nullptr)
{
return nullptr;
}
T* next = result->m_next;
m_count--;
m_head = next;
result->m_containingQueue = nullptr;
m_head = result->m_next;
if (m_head == nullptr)
if (next == nullptr)
{
m_tail = nullptr;
}
else
{
m_head->m_prev = nullptr;
next->m_prev = nullptr;
}
}
@ -127,13 +153,56 @@ public:
return result;
}
T* TryPopWithTryEnter()
{
if (IsEmpty())
{
return nullptr;
}
T* result;
{
if (!m_lock.TryEnter())
{
return nullptr;
}
result = m_head;
if (result == nullptr)
{
m_lock.Leave();
return nullptr;
}
T* next = result->m_next;
m_count--;
m_head = next;
result->m_containingQueue = nullptr;
if (next == nullptr)
{
m_tail = nullptr;
}
else
{
next->m_prev = nullptr;
}
m_lock.Leave();
}
_ASSERTE(result->m_prev == nullptr);
result->m_next = nullptr;
return result;
}
void Enqueue(T* item)
{
_ASSERTE(item->m_next == nullptr);
_ASSERTE(item->m_prev == nullptr);
_ASSERTE(item->m_containingQueue == nullptr);
SatoriLockHolder<SatoriSpinLock> holder(&m_lock);
SatoriLockHolder holder(&m_lock);
m_count++;
item->m_containingQueue = this;
if (m_tail == nullptr)
@ -181,7 +250,7 @@ public:
bool TryRemove(T* item)
{
{
SatoriLockHolder<SatoriSpinLock> holder(&m_lock);
SatoriLockHolder holder(&m_lock);
if (!Contains(item))
{
return false;
@ -246,7 +315,7 @@ public:
protected:
QueueKind m_kind;
SatoriSpinLock m_lock;
SatoriLock m_lock;
T* m_head;
T* m_tail;
size_t m_count;
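TryPopWithTryEnter above lets a caller bail out instead of blocking when the queue lock is contended; it returns null both for "empty" and for "busy", and the caller is expected to go find other work. A standalone analogue (not the runtime's SatoriWorkList) built on std::mutex::try_lock:

#include <cstdio>
#include <mutex>
#include <vector>

struct Chunk { int id; };

class WorkList
{
    std::mutex m_lock;
    std::vector<Chunk*> m_items;
public:
    void Push(Chunk* c)
    {
        std::lock_guard<std::mutex> g(m_lock);
        m_items.push_back(c);
    }

    Chunk* TryPopWithTryEnter()
    {
        if (!m_lock.try_lock())
            return nullptr;                 // contended: caller retries later or does other work
        Chunk* result = nullptr;
        if (!m_items.empty())
        {
            result = m_items.back();
            m_items.pop_back();
        }
        m_lock.unlock();
        return result;
    }
};

int main()
{
    WorkList list;
    Chunk a{1};
    list.Push(&a);
    if (Chunk* c = list.TryPopWithTryEnter())
        std::printf("popped chunk %d\n", c->id);
    return 0;
}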

File diff suppressed because it is too large.


@ -31,6 +31,7 @@
#include "../gc.h"
#include "SatoriRegionQueue.h"
#include "SatoriWorkList.h"
#include "SatoriGate.h"
class SatoriHeap;
class SatoriTrimmer;
@ -56,7 +57,9 @@ public:
void AddEphemeralRegion(SatoriRegion* region);
void AddTenuredRegion(SatoriRegion* region);
// TODO: VS should be moved to Heap?
size_t GetNowMillis();
size_t GetNowUsecs();
bool& IsLowLatencyMode();
@ -69,11 +72,13 @@ public:
void TryStartGC(int generation, gc_reason reason);
void HelpOnce();
void MaybeTriggerGC(gc_reason reason);
bool IsBlockingPhase();
void ConcurrentHelp();
bool ShouldDoConcurrent(int generation);
void ConcurrentWorkerFn();
void ShutDown();
void BlockingMarkForConcurrentHelper();
void BlockingMarkForConcurrentImpl();
void BlockingMarkForConcurrent();
void MaybeAskForHelp();
@ -99,6 +104,11 @@ public:
return m_isBarrierConcurrent;
}
inline bool IsNextGcFullGc()
{
return m_nextGcIsFullGc;
}
bool IsReuseCandidate(SatoriRegion* region);
bool IsRelocationCandidate(SatoriRegion* region);
bool IsPromotionCandidate(SatoriRegion* region);
@ -109,7 +119,7 @@ public:
return &m_lastEphemeralGcInfo;
if (kind == gc_kind_full_blocking)
return GetLastGcInfo(gc_kind_any); // no concept of blocking GC, every GC has blocking part.
return &m_lastTenuredGcInfo; // no concept of background GC, every GC has blocking part.
if (kind == gc_kind_background)
return GetLastGcInfo(gc_kind_any); // no concept of background GC, cant have 2 GCs at a time.
@ -124,7 +134,7 @@ private:
SatoriHeap* m_heap;
int m_rootScanTicket;
uint8_t m_cardScanTicket;
int8_t m_cardScanTicket;
SatoriWorkList* m_workList;
SatoriTrimmer* m_trimmer;
@ -166,6 +176,11 @@ private:
static const int CC_MARK_STATE_MARKING = 2;
static const int CC_MARK_STATE_DONE = 3;
static const int CC_CLEAN_STATE_NOT_READY = 0;
static const int CC_CLEAN_STATE_SETTING_UP = 1;
static const int CC_CLEAN_STATE_CLEANING = 2;
static const int CC_CLEAN_STATE_DONE = 3;
volatile int m_ccStackMarkState;
volatile int m_ccStackMarkingThreadsNum;
@ -175,6 +190,7 @@ private:
bool m_concurrentCardsDone;
bool m_concurrentHandlesDone;
volatile int m_concurrentCleaningState;
bool m_isRelocating;
bool m_isLowLatencyMode;
@ -188,13 +204,15 @@ private:
int64_t m_gcDurationMillis[3];
size_t m_gen1Budget;
size_t m_totalBudget;
size_t m_totalLimit;
size_t m_nextGcIsFullGc;
size_t m_condemnedRegionsCount;
size_t m_deferredSweepCount;
size_t m_gen1AddedSinceLastCollection;
size_t m_gen2AddedSinceLastCollection;
size_t m_gen1CountAtLastGen2;
size_t m_gcNextTimeTarget;
size_t m_occupancy[3];
size_t m_occupancyAcc[3];
@ -210,12 +228,14 @@ private:
int64_t m_perfCounterTicksPerMilli;
int64_t m_perfCounterTicksPerMicro;
GCEvent* m_helpersGate;
volatile int m_gateSignaled;
volatile int m_activeHelpers;
volatile int m_totalHelpers;
SatoriGate* m_workerGate;
void(SatoriRecycler::* volatile m_activeHelperFn)();
volatile int m_gateSignaled;
volatile int m_workerWoken;
volatile int m_activeWorkers;
volatile int m_totalWorkers;
void(SatoriRecycler::* volatile m_activeWorkerFn)();
int64_t m_noWorkSince;
@ -224,9 +244,6 @@ private:
LastRecordedGcInfo* m_CurrentGcInfo;
private:
bool IsBlockingPhase();
size_t Gen1RegionCount();
size_t Gen2RegionCount();
size_t RegionCount();
@ -243,12 +260,12 @@ private:
template <bool isConservative>
static void MarkFnConcurrent(PTR_PTR_Object ppObject, ScanContext* sc, uint32_t flags);
static void HelperThreadFn(void* param);
int MaxHelpers();
static void WorkerThreadMainLoop(void* param);
int MaxWorkers();
int64_t HelpQuantum();
void AskForHelp();
void RunWithHelp(void(SatoriRecycler::* method)());
bool HelpOnceCore();
bool HelpOnceCore(bool minQuantum);
void PushToEphemeralQueues(SatoriRegion* region);
void PushToTenuredQueues(SatoriRegion* region);
@ -258,7 +275,7 @@ private:
void IncrementRootScanTicket();
void IncrementCardScanTicket();
uint8_t GetCardScanTicket();
int8_t GetCardScanTicket();
void MarkOwnStack(gc_alloc_context* aContext, MarkContext* markContext);
void MarkThroughCards();
@ -271,10 +288,11 @@ private:
void MarkOwnStackAndDrainQueues();
void MarkOwnStackOrDrainQueuesConcurrent(int64_t deadline);
bool MarkDemotedAndDrainQueuesConcurrent(int64_t deadline);
void PushOrReturnWorkChunk(SatoriWorkChunk * srcChunk);
bool DrainMarkQueuesConcurrent(SatoriWorkChunk* srcChunk = nullptr, int64_t deadline = 0);
bool HasDirtyCards();
bool ScanDirtyCardsConcurrent(int64_t deadline);
bool CleanCardsConcurrent(int64_t deadline);
void CleanCards();
bool MarkHandles(int64_t deadline = 0);
void ShortWeakPtrScan();
@ -316,7 +334,7 @@ private:
void Relocate();
void RelocateWorker();
void RelocateRegion(SatoriRegion* region);
void FreeRelocatedRegion(SatoriRegion* curRegion);
void FreeRelocatedRegion(SatoriRegion* curRegion, bool noLock);
void FreeRelocatedRegionsWorker();
void PromoteHandlesAndFreeRelocatedRegions();
@ -333,7 +351,7 @@ private:
void KeepRegion(SatoriRegion* curRegion);
void DrainDeferredSweepQueue();
bool DrainDeferredSweepQueueConcurrent(int64_t deadline = 0);
void DrainDeferredSweepQueueHelp();
void DrainDeferredSweepQueueWorkerFn();
void SweepAndReturnRegion(SatoriRegion* curRegion);
void ASSERT_NO_WORK();
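The renamed worker machinery above (m_activeWorkerFn, m_activeWorkers, WorkerThreadMainLoop, AskForHelp) suggests a dispatch shape where the GC publishes a phase function, wakes parked workers, and retracts the function when the phase completes. A hedged standalone sketch in standard C++; the names and the yield-based parking are simplifications, and the runtime parks on a SatoriGate instead:

#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>

using PhaseFn = void(*)();

static std::atomic<PhaseFn> g_activeWorkerFn{nullptr};
static std::atomic<bool>    g_shutdown{false};
static std::atomic<int>     g_helpCount{0};

static void WorkerThreadMainLoop()
{
    while (!g_shutdown.load())
    {
        PhaseFn fn = g_activeWorkerFn.load();
        if (!fn)
        {
            std::this_thread::yield();       // simplification: the runtime parks on a gate
            continue;
        }
        fn();                                // help with the currently published phase
    }
}

static void MarkPhase() { g_helpCount.fetch_add(1); }

int main()
{
    std::thread worker(WorkerThreadMainLoop);

    g_activeWorkerFn.store(&MarkPhase);      // AskForHelp analogue: publish work for workers to run
    std::this_thread::sleep_for(std::chrono::milliseconds(5));
    g_activeWorkerFn.store(nullptr);         // phase done: retract the function

    g_shutdown.store(true);
    worker.join();
    std::printf("worker helped %d times\n", g_helpCount.load());
    return 0;
}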


@ -113,7 +113,7 @@ SatoriRecycler* SatoriRegion::Recycler()
void SatoriRegion::RearmCardsForTenured()
{
_ASSERTE(Generation() == 2);
m_containingPage->WipeCardsForRange(Start(), End(), /* tenured */ true);
m_containingPage->ResetCardsForRange(Start(), End(), /* tenured */ true);
HasUnmarkedDemotedObjects() = false;
FreeDemotedTrackers();
@ -135,42 +135,72 @@ void SatoriRegion::FreeDemotedTrackers()
void SatoriRegion::ResetCardsForEphemeral()
{
_ASSERTE(Generation() == 2);
m_containingPage->WipeCardsForRange(Start(), End(), /* tenured */ false);
m_containingPage->ResetCardsForRange(Start(), End(), /* tenured */ false);
}
void SatoriRegion::MakeBlank()
{
_ASSERTE(!m_hasPendingFinalizables);
_ASSERTE(!m_finalizableTrackers);
_ASSERTE(!m_acceptedPromotedObjects);
_ASSERTE(!m_gen2Objects);
_ASSERTE(NothingMarked());
if (m_generation == 2)
{
this->ResetCardsForEphemeral();
}
else
{
m_containingPage->WipeGroupsForRange(Start(), End());
}
m_generation = -1;
m_ownerThreadTag = 0;
m_escapeFunc = EscapeFn;
m_generation = -1;
m_occupancyAtReuse = 0;
// m_end stays the same
// m_containingPage stays the same
m_reusableFor = ReuseLevel::None;
_ASSERTE(!m_allocatingOwnerAttachmentPoint);
_ASSERTE(!m_gen2Objects);
m_allocStart = (size_t)&m_firstObject;
m_allocEnd = End();
m_occupancy = m_allocEnd - m_allocStart;
m_occupancyAtReuse = 0;
m_sweepsSinceLastAllocation = 0;
m_unfinishedAllocationCount = 0;
m_markStack = 0;
// m_used stays the same
// m_committed stays the same
m_escapedSize = 0;
m_objCount = 0;
_ASSERTE(!m_markStack);
m_allocBytesAtCollect = 0;
m_hasFinalizables = false;
_ASSERTE(!m_finalizableTrackers);
_ASSERTE(!m_finalizableTrackersLock);
m_sweepsSinceLastAllocation = 0;
// m_prev
// m_next
// m_containingQueue all stay the same
// assume all space reserved to allocations will be used
// (we will revert what will be unused)
m_occupancy = m_allocEnd - m_allocStart;
m_objCount = 0;
m_unfinishedAllocationCount = 0;
m_hasPinnedObjects = false;
m_hasMarksSet = false;
m_hasFinalizables = false;
_ASSERTE(!m_hasPendingFinalizables);
m_doNotSweep = false;
m_reusableFor = ReuseLevel::None;
m_hasUnmarkedDemotedObjects = false;
_ASSERTE(!m_acceptedPromotedObjects);
_ASSERTE(!m_individuallyPromoted);
_ASSERTE(!m_hasUnmarkedDemotedObjects);
#if _DEBUG
m_hasMarksSet = false;
#endif
//clear index and free list
ClearFreeLists();
@ -249,38 +279,94 @@ bool SatoriRegion::ValidateIndexEmpty()
static const int FREE_LIST_NEXT_OFFSET = sizeof(ArrayBase);
// prefers leftmost bucket that fits to improve locality, possibly at cost to fragmentation
size_t SatoriRegion::StartAllocating(size_t minAllocSize)
{
_ASSERTE(!IsAllocating());
// skip buckets that certainly will not fit.
DWORD bucket;
BitScanReverse64(&bucket, minAllocSize);
// when minAllocSize is not a power of two we could search through the current bucket,
// which may have a large enough obj,
// but we will just use the next bucket, which guarantees it fits
if (minAllocSize & (minAllocSize - 1))
{
bucket++;
}
bucket = bucket > Satori::MIN_FREELIST_SIZE_BITS ?
bucket - Satori::MIN_FREELIST_SIZE_BITS :
0;
// we will check the first free obj in the bucket, but will not dig through the rest.
// if the first obj does not fit, we will switch to the next bucket where everything will fit.
size_t minFreeObjSize = minAllocSize + Satori::MIN_FREE_SIZE;
DWORD selectedBucket = Satori::FREELIST_COUNT;
SatoriObject* freeObj = m_freeLists[bucket];
if (freeObj)
{
if (freeObj->FreeObjSize() >= minFreeObjSize)
{
selectedBucket = bucket;
}
}
// in higher buckets everything will fit
// prefer free objects that start earlier
bucket++;
for (; bucket < Satori::FREELIST_COUNT; bucket++)
{
SatoriObject* freeObjCandidate = m_freeLists[bucket];
if (freeObjCandidate &&
(selectedBucket == Satori::FREELIST_COUNT || freeObjCandidate->Start() < freeObj->Start()))
{
selectedBucket = bucket;
freeObj = freeObjCandidate;
}
}
if (selectedBucket < Satori::FREELIST_COUNT)
{
m_freeLists[selectedBucket] = *(SatoriObject**)(freeObj->Start() + FREE_LIST_NEXT_OFFSET);
m_allocStart = freeObj->Start();
m_allocEnd = m_allocStart + freeObj->FreeObjSize();
SetOccupancy(m_occupancy + m_allocEnd - m_allocStart);
ClearIndicesForAllocRange();
_ASSERTE(GetAllocRemaining() >= minAllocSize);
m_sweepsSinceLastAllocation = 0;
return m_allocStart;
}
return 0;
}
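
Editor's note: the bucket math above hinges on BitScanReverse64 returning the index of the highest set bit, so `bucket` starts out as floor(log2(minAllocSize)); a size that is not a power of two is bumped to the next bucket, where even the smallest resident free object is guaranteed to fit. A standalone sketch of that mapping (the MIN_FREELIST_SIZE_BITS value is made up for the example, and a GCC/Clang intrinsic stands in for BitScanReverse64):

#include <cstdint>

static const uint32_t MIN_FREELIST_SIZE_BITS = 5;   // value chosen only for this example

uint32_t BucketFor(uint64_t minAllocSize)
{
    // index of the highest set bit; BitScanReverse64 in the code above,
    // a GCC/Clang intrinsic here (minAllocSize must be non-zero)
    uint32_t bucket = 63 - (uint32_t)__builtin_clzll(minAllocSize);

    // a non power of two may not fit the first object of its own bucket,
    // so bump to the next bucket, where any resident object is large enough
    if (minAllocSize & (minAllocSize - 1))
        bucket++;

    return bucket > MIN_FREELIST_SIZE_BITS ? bucket - MIN_FREELIST_SIZE_BITS : 0;
}

// with the example value 5: 32 -> bucket 0, 48 -> bucket 1, 64 -> bucket 1
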
// prefers smallest bucket that fits to reduce fragmentation, possibly at cost to locality
size_t SatoriRegion::StartAllocatingBestFit(size_t minAllocSize)
{
_ASSERTE(!IsAllocating());
// skip buckets that certainly will not fit.
DWORD bucket;
BitScanReverse64(&bucket, minAllocSize);
bucket = bucket > Satori::MIN_FREELIST_SIZE_BITS ?
bucket - Satori::MIN_FREELIST_SIZE_BITS :
0;
// we will check the first free obj in the bucket, but will not dig through the rest.
// if the first obj does not fit, we will switch to the next bucket where everything will fit.
size_t minFreeObjSize = minAllocSize + Satori::MIN_FREE_SIZE;
for (; bucket < Satori::FREELIST_COUNT; bucket++)
{
SatoriObject* freeObj = m_freeLists[bucket];
if (freeObj)
{
m_freeLists[bucket] = *(SatoriObject**)(freeObj->Start() + FREE_LIST_NEXT_OFFSET);
m_allocStart = freeObj->Start();
m_allocEnd = freeObj->End();
SetOccupancy(m_occupancy + m_allocEnd - m_allocStart);
ClearIndicesForAllocRange();
_ASSERTE(GetAllocRemaining() >= minAllocSize);
m_sweepsSinceLastAllocation = 0;
return m_allocStart;
size_t size = freeObj->FreeObjSize();
if (size >= minFreeObjSize)
{
m_freeLists[bucket] = *(SatoriObject**)(freeObj->Start() + FREE_LIST_NEXT_OFFSET);
m_allocStart = freeObj->Start();
m_allocEnd = m_allocStart + size;
SetOccupancy(m_occupancy + m_allocEnd - m_allocStart);
ClearIndicesForAllocRange();
_ASSERTE(GetAllocRemaining() >= minAllocSize);
m_sweepsSinceLastAllocation = 0;
return m_allocStart;
}
}
}
@ -300,7 +386,7 @@ void SatoriRegion::StopAllocating(size_t allocPtr)
_ASSERTE(m_occupancy >= unused);
SetOccupancy(m_occupancy - unused);
SatoriObject* freeObj = SatoriObject::FormatAsFree(allocPtr, unused);
AddFreeSpace(freeObj, unused);
ReturnFreeSpace(freeObj, unused);
}
m_allocStart = m_allocEnd = 0;
@ -328,10 +414,47 @@ void SatoriRegion::AddFreeSpace(SatoriObject* freeObj, size_t size)
_ASSERTE(bucket >= 0);
_ASSERTE(bucket < Satori::FREELIST_COUNT);
*(SatoriObject**)(freeObj->Start() + FREE_LIST_NEXT_OFFSET) = m_freeLists[bucket];
m_freeLists[bucket] = freeObj;
// insert at the tail
*(SatoriObject**)(freeObj->Start() + FREE_LIST_NEXT_OFFSET) = nullptr;
if (m_freeLists[bucket] == nullptr)
{
m_freeLists[bucket] = m_freeListTails[bucket] = freeObj;
return;
}
SatoriObject* tailObj = m_freeListTails[bucket];
_ASSERTE(tailObj);
*(SatoriObject**)(tailObj->Start() + FREE_LIST_NEXT_OFFSET) = freeObj;
m_freeListTails[bucket] = freeObj;
}
void SatoriRegion::ReturnFreeSpace(SatoriObject* freeObj, size_t size)
{
_ASSERTE(freeObj->Size() == size);
// allocSize is smaller than size to make sure the span can always be made parseable
// after allocating objects in it.
ptrdiff_t allocSize = size - Satori::MIN_FREE_SIZE;
if (allocSize < Satori::MIN_FREELIST_SIZE)
{
return;
}
DWORD bucket;
BitScanReverse64(&bucket, allocSize);
bucket -= (Satori::MIN_FREELIST_SIZE_BITS);
_ASSERTE(bucket >= 0);
_ASSERTE(bucket < Satori::FREELIST_COUNT);
// insert at the head, since we are returning what we recently took.
*(SatoriObject**)(freeObj->Start() + FREE_LIST_NEXT_OFFSET) = m_freeLists[bucket];
if (m_freeLists[bucket] == nullptr)
{
m_freeListTails[bucket] = freeObj;
}
m_freeLists[bucket] = freeObj;
}
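
Editor's note: with the head+tail pair introduced above, AddFreeSpace appends (entries added during a front-to-back sweep keep their address order, which fits the "prefer free objects that start earlier" policy in StartAllocating; that rationale is an inference, not stated in the diff), while ReturnFreeSpace pushes at the head because it is handing back space that was just taken. A minimal sketch of the two insertion paths on a singly linked list with a tail pointer:

struct FreeObj { FreeObj* next; };

struct Bucket
{
    FreeObj* head = nullptr;
    FreeObj* tail = nullptr;

    // AddFreeSpace: append, so older (typically lower-address) entries stay in front
    void Append(FreeObj* o)
    {
        o->next = nullptr;
        if (head == nullptr) { head = tail = o; return; }
        tail->next = o;
        tail = o;
    }

    // ReturnFreeSpace: push in front, since the caller returns what it just took
    void PushFront(FreeObj* o)
    {
        o->next = head;
        if (head == nullptr) tail = o;
        head = o;
    }
};
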
bool SatoriRegion::HasFreeSpaceInTopBucket()
{
@ -689,7 +812,9 @@ size_t SatoriRegion::AllocateHuge(size_t size, bool zeroInitialize)
// can give refs pointing to Free. (because of card granularity)
SatoriObject* SatoriRegion::FindObject(size_t location)
{
_ASSERTE(m_generation >= 0 && location >= Start() && location < End());
_ASSERTE(m_generation >= 0);
_ASSERTE(location >= Start());
_ASSERTE(location < End());
_ASSERTE(m_unfinishedAllocationCount == 0);
location = min(location, Start() + Satori::REGION_SIZE_GRANULARITY);
@ -843,7 +968,7 @@ void SatoriRegion::EscapeRecursively(SatoriObject* o)
}
SetEscaped(o);
m_escapedSize += o->Size();
m_escapedSize += (int32_t)o->Size();
// now recursively mark all the objects reachable from escaped object.
do
@ -864,7 +989,7 @@ void SatoriRegion::EscapeRecursively(SatoriObject* o)
if (child->SameRegion(this) && !IsEscaped(child))
{
SetEscaped(child);
m_escapedSize += child->Size();
m_escapedSize += (int32_t)child->Size();
PushToMarkStackIfHasPointers(child);
}
}
@ -877,18 +1002,21 @@ void SatoriRegion::EscapeRecursively(SatoriObject* o)
void SatoriRegion::EscsapeAll()
{
size_t objLimit = Start() + Satori::REGION_SIZE_GRANULARITY;
for (SatoriObject* o = FirstObject(); o->Start() < objLimit; o = o->Next())
for (SatoriObject* o = FirstObject(); o->Start() < objLimit;)
{
size_t size = o->Size();
if (!o->IsFree())
{
EscapeShallow(o);
EscapeShallow(o, size);
}
o = (SatoriObject*)(o->Start() + size);
}
}
// do not recurse into children
// used when escaping all objects in the region anyways
void SatoriRegion::EscapeShallow(SatoriObject* o)
void SatoriRegion::EscapeShallow(SatoriObject* o, size_t size)
{
_ASSERTE(o->SameRegion(this));
_ASSERTE(!IsEscaped(o));
@ -899,7 +1027,7 @@ void SatoriRegion::EscapeShallow(SatoriObject* o)
// typically objects have died and we have fewer escapes than before the GC,
// so we do not bother to check
SetEscaped(o);
m_escapedSize += o->Size();
m_escapedSize += (int32_t)size;
o->ForEachObjectRef(
[&](SatoriObject** ref)
@ -910,11 +1038,12 @@ void SatoriRegion::EscapeShallow(SatoriObject* o)
// mark ref location as exposed
SetExposed(ref);
}
},
size
);
}
void SatoriRegion::SetOccupancy(size_t occupancy, size_t objCount)
void SatoriRegion::SetOccupancy(size_t occupancy, int32_t objCount)
{
_ASSERTE(objCount == 0 || occupancy != 0);
_ASSERTE(occupancy <= (Size() - offsetof(SatoriRegion, m_firstObject)));
@ -996,9 +1125,8 @@ void SatoriRegion::ThreadLocalMark()
m_bitmap[bitmapIndex + (markBitOffset >> 6)] |= ((size_t)1 << (markBitOffset & 63));
SatoriObject* o = ObjectForMarkBit(bitmapIndex, markBitOffset);
o->Validate();
#ifdef _DEBUG
o->Validate();
escaped += o->Size();
#endif
@ -1130,7 +1258,7 @@ void SatoriRegion::ThreadLocalPlan()
// stats
size_t occupancy = 0;
size_t objCount = 0;
int32_t objCount = 0;
// moveable: starts at first movable and reachable, as long as there is any free space to slide in
size_t lastMarkedEnd = FirstObject()->Start();
@ -1410,6 +1538,7 @@ void SatoriRegion::ThreadLocalCompact()
{
size_t freeSpace = d2->Start() - d1->Start();
SatoriObject* freeObj = SatoriObject::FormatAsFree(d1->Start(), freeSpace);
SetIndicesForObject(freeObj, d2->Start());
AddFreeSpace(freeObj, freeSpace);
foundFree += freeSpace;
@ -1777,7 +1906,7 @@ void SatoriRegion::UpdateFinalizableTrackers()
}
}
void SatoriRegion::UpdatePointersInObject(SatoriObject* o)
void SatoriRegion::UpdatePointersInObject(SatoriObject* o, size_t size)
{
// if the containing region is large, do not engage with the entire object,
// schedule update of separate ranges.
@ -1797,7 +1926,8 @@ void SatoriRegion::UpdatePointersInObject(SatoriObject* o)
*ppObject = (SatoriObject*)-ptr;
}
}
}
},
size
);
}
}
@ -1810,8 +1940,9 @@ void SatoriRegion::UpdatePointers()
SatoriObject* o = FirstObject();
do
{
UpdatePointersInObject(o);
o = o->Next();
size_t size = o->Size();
UpdatePointersInObject(o, size);
o = (SatoriObject*)(o->Start() + size);
} while (o->Start() < objLimit);
}
@ -1929,7 +2060,7 @@ bool SatoriRegion::NothingMarked()
void SatoriRegion::ClearMarks()
{
_ASSERTE(this->HasUnmarkedDemotedObjects() == false);
memset(&m_bitmap[BITMAP_START], 0, (BITMAP_LENGTH - BITMAP_START) * sizeof(size_t));
memset((void*)&m_bitmap[BITMAP_START], 0, (BITMAP_LENGTH - BITMAP_START) * sizeof(size_t));
}
void SatoriRegion::ClearIndex()
@ -1939,7 +2070,8 @@ void SatoriRegion::ClearIndex()
void SatoriRegion::ClearFreeLists()
{
memset(m_freeLists, 0, sizeof(m_freeLists));
// clear free lists and free list tails
memset(m_freeLists, 0, sizeof(m_freeLists) * 2);
}
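
Editor's note: the single doubled memset relies on the tails array being laid out directly after the heads array, which the header change further down in this diff (m_freeLists followed by m_freeListTails) does provide. A simplified illustration of that layout contract, with made-up names and an arbitrary count:

#include <cstddef>
#include <cstring>

struct FreeListPair
{
    void* heads[16];   // stands in for m_freeLists (count is illustrative)
    void* tails[16];   // stands in for m_freeListTails
};

static_assert(offsetof(FreeListPair, tails) ==
              offsetof(FreeListPair, heads) + sizeof(((FreeListPair*)nullptr)->heads),
              "tails must immediately follow heads for the doubled memset to be valid");

void ClearBoth(FreeListPair& f)
{
    memset(f.heads, 0, sizeof(f.heads) * 2);   // wipes heads and tails in one call
}
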
void SatoriRegion::Verify(bool allowMarked)

View file

@ -76,11 +76,13 @@ public:
size_t AllocateHuge(size_t size, bool zeroInitialize);
size_t StartAllocating(size_t minSize);
size_t StartAllocatingBestFit(size_t minAllocSize);
void StopAllocating(size_t allocPtr);
void StopAllocating();
bool IsAllocating();
void AddFreeSpace(SatoriObject* freeObj, size_t size);
void ReturnFreeSpace(SatoriObject * freeObj, size_t size);
bool HasFreeSpaceInTopBucket();
bool HasFreeSpaceInTopNBuckets(int n);
@ -132,8 +134,8 @@ public:
void IndividuallyPromote();
void UpdateFinalizableTrackers();
void UpdatePointers();
void UpdatePointersInObject(SatoriObject* o);
void SetCardsForObject(SatoriObject* o);
void UpdatePointersInObject(SatoriObject* o, size_t size);
void SetCardsForObject(SatoriObject* o, size_t size);
template <bool promotingAllRegions>
void UpdatePointersInPromotedObjects();
@ -145,7 +147,7 @@ public:
bool AnyExposed(size_t from, size_t length);
void EscapeRecursively(SatoriObject* obj);
void EscsapeAll();
void EscapeShallow(SatoriObject* o);
void EscapeShallow(SatoriObject* o, size_t size);
template <typename F>
void ForEachFinalizable(F lambda);
@ -161,18 +163,18 @@ public:
bool HasFinalizables();
bool& HasPendingFinalizables();
void SetOccupancy(size_t occupancy, size_t objCount);
void SetOccupancy(size_t occupancy, int32_t objCount);
void SetOccupancy(size_t occupancy);
size_t Occupancy();
size_t& OccupancyAtReuse();
size_t ObjCount();
int32_t& OccupancyAtReuse();
int32_t ObjCount();
bool& HasPinnedObjects();
bool& DoNotSweep();
bool& AcceptedPromotedObjects();
bool& IndividuallyPromoted();
size_t SweepsSinceLastAllocation();
uint32_t SweepsSinceLastAllocation();
enum class ReuseLevel : uint8_t
{
@ -221,8 +223,8 @@ private:
//
// we will overlap the map and the header for simplicity of map operations.
// it is ok because the first BITMAP_START elements of the map cover the header/map itself and thus will not be used.
// +1 to include End(), it will always be 0, but it is conveninet to make it legal map index.
size_t m_bitmap[BITMAP_LENGTH + 1];
// +1 to include End(), it will always be 0, but it is convenient to make it legal map index.
volatile size_t m_bitmap[BITMAP_LENGTH + 1];
// Header.(can be up to 72 size_t)
struct
@ -232,54 +234,66 @@ private:
size_t m_ownerThreadTag;
void (*m_escapeFunc)(SatoriObject**, SatoriObject*, SatoriRegion*);
int m_generation;
ReuseLevel m_reusableFor;
SatoriRegion** m_allocatingOwnerAttachmentPoint;
// above fields are accessed from asm helpers
// the following 5 fields change rarely or not at all.
size_t m_end;
size_t m_committed;
size_t m_used;
SatoriPage* m_containingPage;
SatoriRegion* m_prev;
SatoriRegion* m_next;
SatoriQueue<SatoriRegion>* m_containingQueue;
ReuseLevel m_reusableFor;
int32_t m_occupancyAtReuse;
SatoriRegion** m_allocatingOwnerAttachmentPoint;
SatoriWorkChunk* m_gen2Objects;
// ===== 64 bytes boundary
// Active allocation may happen in the following range.
// The range may not be parseable as sequence of objects
// The range is in terms of objects, there is embedded off-by-one error for syncblocks.
size_t m_allocStart;
size_t m_allocEnd;
// dirty and comitted watermarks
size_t m_used;
size_t m_committed;
// counting escaped objects
// when size goes too high, we stop escaping and do not do local GC.
int32_t m_escapedSize;
// misc uses in thread-local regions
int32_t m_markStack;
// alloc bytes at last threadlocal collect
size_t m_allocBytesAtCollect;
SatoriWorkChunk* m_finalizableTrackers;
int m_finalizableTrackersLock;
// active allocation may happen in the following range.
// the range may not be parseable as sequence of objects
// NB: the range is in terms of objects,
// there is embedded off-by-one error for syncblocks
size_t m_allocStart;
size_t m_allocEnd;
uint32_t m_sweepsSinceLastAllocation;
int32_t m_markStack;
// ===== 128 bytes boundary
SatoriRegion* m_prev;
SatoriRegion* m_next;
SatoriQueue<SatoriRegion>* m_containingQueue;
// counting escaped objects
// when size goes too high, we stop escaping and do not do local GC.
size_t m_escapedSize;
size_t m_allocBytesAtCollect;
size_t m_objCount;
size_t m_occupancy;
size_t m_occupancyAtReuse;
size_t m_sweepsSinceLastAllocation;
int32_t m_objCount;
size_t m_unfinishedAllocationCount;
int32_t m_unfinishedAllocationCount;
bool m_hasPinnedObjects;
bool m_hasMarksSet;
bool m_doNotSweep;
bool m_hasFinalizables;
bool m_hasPendingFinalizables;
bool m_doNotSweep;
bool m_acceptedPromotedObjects;
bool m_individuallyPromoted;
bool m_hasUnmarkedDemotedObjects;
// when demoted, we remember our gen2 objects here
SatoriWorkChunk* m_gen2Objects;
#if _DEBUG
bool m_hasMarksSet;
#endif
SatoriObject* m_freeLists[Satori::FREELIST_COUNT];
SatoriObject* m_freeListTails[Satori::FREELIST_COUNT];
};
};

View file

@ -162,7 +162,7 @@ inline void SatoriRegion::StopEscapeTracking()
}
// Used to simulate writes when containing region is individually promoted.
inline void SatoriRegion::SetCardsForObject(SatoriObject* o)
inline void SatoriRegion::SetCardsForObject(SatoriObject* o, size_t size)
{
_ASSERTE(this->Size() == Satori::REGION_SIZE_GRANULARITY);
@ -180,7 +180,8 @@ inline void SatoriRegion::SetCardsForObject(SatoriObject* o)
// for simplicity and call a concurrent helper.
ContainingPage()->DirtyCardForAddressConcurrent((size_t)ppObject);
}
}
},
size
);
}
@ -279,7 +280,7 @@ bool SatoriRegion::Sweep()
m_escapedSize = 0;
bool cannotRecycle = this->IsAttachedToAllocatingOwner();
size_t occupancy = 0;
size_t objCount = 0;
int32_t objCount = 0;
bool hasFinalizables = false;
SatoriObject* o = FirstObject();
do
@ -303,19 +304,20 @@ bool SatoriRegion::Sweep()
_ASSERTE(!o->IsFree());
cannotRecycle = true;
size_t size = o->Size();
if (isEscapeTracking)
{
this->EscapeShallow(o);
this->EscapeShallow(o, size);
}
if (updatePointers)
{
UpdatePointersInObject(o);
UpdatePointersInObject(o, size);
}
if (individuallyPromoted)
{
SetCardsForObject(o);
SetCardsForObject(o, size);
}
if (!hasFinalizables && o->RawGetMethodTable()->HasFinalizer())
@ -323,7 +325,6 @@ bool SatoriRegion::Sweep()
hasFinalizables = true;
}
size_t size = o->Size();
objCount++;
occupancy += size;
o = (SatoriObject*)(o->Start() + size);
@ -373,13 +374,13 @@ inline size_t SatoriRegion::Occupancy()
return m_occupancy;
}
inline size_t &SatoriRegion::OccupancyAtReuse()
inline int32_t &SatoriRegion::OccupancyAtReuse()
{
_ASSERTE(!IsAllocating());
return m_occupancyAtReuse;
}
inline size_t SatoriRegion::ObjCount()
inline int32_t SatoriRegion::ObjCount()
{
return m_objCount;
}
@ -411,7 +412,7 @@ inline bool& SatoriRegion::IndividuallyPromoted()
return m_individuallyPromoted;
}
inline size_t SatoriRegion::SweepsSinceLastAllocation()
inline uint32_t SatoriRegion::SweepsSinceLastAllocation()
{
return m_sweepsSinceLastAllocation;
}
@ -550,7 +551,7 @@ inline bool SatoriRegion::CheckAndClearMarked(SatoriObject* o)
size_t bitmapIndex = (word >> 9) & (SatoriRegion::BITMAP_LENGTH - 1);
size_t mask = (size_t)1 << ((word >> 3) & 63);
size_t& bitmapWord = m_bitmap[bitmapIndex];
volatile size_t& bitmapWord = m_bitmap[bitmapIndex];
bool wasMarked = bitmapWord & mask;
bitmapWord &= ~mask;
return wasMarked;
@ -683,6 +684,7 @@ void SatoriRegion::UpdatePointersInPromotedObjects()
_ASSERTE(!relocated->IsFree());
SatoriPage* page = relocated->ContainingRegion()->ContainingPage();
size_t size = relocated->Size();
relocated->ForEachObjectRef(
[&](SatoriObject** ppObject)
{
@ -707,10 +709,11 @@ void SatoriRegion::UpdatePointersInPromotedObjects()
}
}
}
}
},
size
);
o = o->Next();
o = (SatoriObject*)(o->Start() + size);
} while (o->Start() < objLimit);
}

View file

@ -208,3 +208,16 @@ SatoriRegion* SatoriRegionQueue::TryDequeueIfHasFreeSpaceInTopBucket()
result->m_prev = nullptr;
return result;
}
SatoriRegionQueue* SatoriRegionQueue::AllocAligned(QueueKind kind)
{
const size_t align = 64;
#ifdef _MSC_VER
void* buffer = _aligned_malloc(sizeof(SatoriRegionQueue), align);
#else
void* buffer = malloc(sizeof(SatoriRegionQueue) + align);
buffer = (void*)ALIGN_UP((size_t)buffer, align);
#endif
return new(buffer)SatoriRegionQueue(kind);
}
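
Editor's note: the non-MSVC branch above over-allocates by the alignment and rounds the pointer up, discarding the original malloc pointer; that is acceptable here only because these queues live for the process lifetime and are never freed. For reference, C++17 aligned operator new gives the same 64-byte alignment; a sketch with a made-up helper name, not part of this commit:

#include <new>
#include <utility>

template <typename T, typename... Args>
T* AllocCacheAligned(Args&&... args)
{
    void* buffer = ::operator new(sizeof(T), std::align_val_t{64});
    return new (buffer) T(std::forward<Args>(args)...);
}

// usage (illustrative): SatoriRegionQueue* q = AllocCacheAligned<SatoriRegionQueue>(kind);
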

View file

@ -42,6 +42,8 @@ public:
SatoriRegion* TryPopWithSize(size_t regionSize, SatoriRegion* &putBack);
SatoriRegion* TryRemoveWithSize(size_t regionSize, SatoriRegion*& putBack);
SatoriRegion* TryDequeueIfHasFreeSpaceInTopBucket();
static SatoriRegionQueue* AllocAligned(QueueKind kind);
};
#endif

View file

@ -43,8 +43,8 @@ SatoriTrimmer::SatoriTrimmer(SatoriHeap* heap)
m_heap = heap;
m_state = TRIMMER_STATE_STOPPED;
m_gate = new (nothrow) GCEvent;
m_gate->CreateAutoEventNoThrow(false);
m_event = new (nothrow) GCEvent;
m_event->CreateAutoEventNoThrow(false);
if (SatoriUtil::IsTrimmingEnabled())
{
@ -60,18 +60,25 @@ void SatoriTrimmer::LoopFn(void* inst)
void SatoriTrimmer::Loop()
{
int64_t lastGen2 = m_heap->Recycler()->GetCollectionCount(2);
while (true)
{
int64_t curGen2 = m_heap->Recycler()->GetCollectionCount(2);
// limit the trim rate to once per 1 sec + 1 gen2 gc.
do
// limit the re-trim rate to once per 5 sec.
// we would also require that gen2 gc happened since the last round.
while (true)
{
int64_t newGen2 = m_heap->Recycler()->GetCollectionCount(2);
if (lastGen2 != newGen2)
{
lastGen2 = newGen2;
break;
}
Interlocked::CompareExchange(&m_state, TRIMMER_STATE_STOPPED, TRIMMER_STATE_RUNNING);
// we are not running here, so we can sleep a bit before continuing.
GCToOSInterface::Sleep(1000);
GCToOSInterface::Sleep(5000);
StopAndWait();
} while (curGen2 == m_heap->Recycler()->GetCollectionCount(2));
}
m_heap->ForEachPage(
[&](SatoriPage* page)
@ -83,6 +90,8 @@ void SatoriTrimmer::Loop()
StopAndWait();
}
int64_t lastGen1 = m_heap->Recycler()->GetCollectionCount(1);
page->ForEachRegion(
[&](SatoriRegion* region)
{
@ -106,13 +115,25 @@ void SatoriTrimmer::Loop()
if (didSomeWork)
{
// limit the decommit/coalesce rate to 1 region/msec.
GCToOSInterface::Sleep(1);
// limit the decommit/coalesce rate to 1 region/10 msec.
GCToOSInterface::Sleep(10);
}
}
}
}
// this is a low priority task, if something needs to run, yield
GCToOSInterface::YieldThread(0);
// also we will pause for 1 sec if there was a GC - to further reduce the churn
// if the app is allocation-active.
int64_t newGen1 = m_heap->Recycler()->GetCollectionCount(1);
if (newGen1 != lastGen1)
{
lastGen1 = newGen1;
GCToOSInterface::Sleep(1000);
}
if (m_state != TRIMMER_STATE_RUNNING)
{
StopAndWait();
@ -129,6 +150,9 @@ void SatoriTrimmer::StopAndWait()
while (true)
{
tryAgain:
// this is a low priority task, if something needs to run, yield
GCToOSInterface::YieldThread(0);
int state = m_state;
switch (state)
{
@ -150,7 +174,7 @@ void SatoriTrimmer::StopAndWait()
if (Interlocked::CompareExchange(&m_state, TRIMMER_STATE_BLOCKED, state) == state)
{
m_gate->Wait(INFINITE, false);
m_event->Wait(INFINITE, false);
}
continue;
case TRIMMER_STATE_RUNNING:
@ -170,7 +194,7 @@ void SatoriTrimmer::SetOkToRun()
case TRIMMER_STATE_BLOCKED:
// trimmer can't get out of BLOCKED by itself, ordinary assignment is ok
m_state = TRIMMER_STATE_OK_TO_RUN;
m_gate->Set();
m_event->Set();
break;
case TRIMMER_STATE_STOPPED:
Interlocked::CompareExchange(&m_state, TRIMMER_STATE_OK_TO_RUN, state);

View file

@ -51,7 +51,7 @@ private:
static const int TRIMMER_STATE_RUNNING = 3;
SatoriHeap* m_heap;
GCEvent* m_gate;
GCEvent* m_event;
size_t m_lastGen2Count;
volatile int m_state;

View file

@ -64,18 +64,31 @@ namespace Satori
// we use a trivial array object to fill holes, thus this is the size of a shortest array object.
static const size_t MIN_FREE_SIZE = 3 * sizeof(size_t);
// ~1024 items for now, we can fiddle with size a bit later
const static size_t MARK_CHUNK_SIZE = 1024 * sizeof(size_t);
// If a single mark takes very roughly ~50ns (5-20 for CAS + some extra), then 1k objects mark in 50us.
// We set the chunk to roughly 1/2k entries so we expect it to mark in under 20us or so.
const static size_t MARK_CHUNK_COUNT = 512;
// this includes header, so the number of objects is slightly less (by -2)
const static size_t MARK_CHUNK_SIZE = MARK_CHUNK_COUNT * sizeof(size_t);
// objects that are bigger are chunked into ranges when marking.
// the threshold is slightly less than MARK_CHUNK_SIZE, so that object in the range
// could fit into same chunk
const static size_t MARK_RANGE_THRESHOLD = MARK_CHUNK_SIZE - 2 * sizeof(size_t);
// if we have more than twice this much and work list is empty we can share half
const static int SHARE_WORK_THRESHOLD = 4;
// address bits set to track finalizable that needs to be scheduled to F-queue
const static size_t FINALIZATION_PENDING = 1;
static const int BYTES_PER_CARD_BYTE = 512;
static const int CARD_BYTES_IN_CARD_GROUP = Satori::REGION_SIZE_GRANULARITY / BYTES_PER_CARD_BYTE;
static const int BYTES_PER_CARD_GROUP = REGION_SIZE_GRANULARITY / 2;
static const int CARD_BYTES_IN_CARD_GROUP = Satori::BYTES_PER_CARD_GROUP / BYTES_PER_CARD_BYTE;
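
Editor's note on scale (assuming 2 MB REGION_SIZE_GRANULARITY, consistent with the 0xFFFFFFFFFFE00000 region mask in the barriers below, and 64-bit pointers): a work chunk is now a single 4 KB block holding 510 object pointers (see SatoriWorkChunk::Capacity later in this diff), and a card group now covers 1 MB, i.e. 2048 card bytes. A quick sanity-check sketch:

static_assert(512 * sizeof(size_t) == 4096, "MARK_CHUNK_SIZE: one 4 KB block per work chunk");
static_assert(4096 / sizeof(void*) - 2 == 510, "510 object slots per chunk after m_top/m_next");
static_assert((2 * 1024 * 1024) / 2 == 1024 * 1024, "BYTES_PER_CARD_GROUP: each group covers 1 MB");
static_assert((1024 * 1024) / 512 == 2048, "CARD_BYTES_IN_CARD_GROUP: 2048 card bytes per group");
static_assert(1024 * 1024 == (size_t)1 << 20, "group index = offset-in-page >> 20, matching the barrier asm");
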
namespace CardState
{
static const int8_t EPHEMERAL = -128; // 0b10000000
static const int8_t EPHEMERAL = -128; // 0b10000000 only used in cards (not groups or higher)
static const int8_t BLANK = 0;
static const int8_t REMEMBERED = 1;
static const int8_t PROCESSING = 2;
@ -163,23 +176,6 @@ public:
#endif
}
static size_t CommitGranularity()
{
// we can support sizes that are > OS page and binary fractions of REGION_SIZE_GRANULARITY.
// we can also support PAGE_SIZE_GRANULARITY
size_t result = 1024 * 32;
// result = Satori::REGION_SIZE_GRANULARITY;
// result = Satori::PAGE_SIZE_GRANULARITY;
#if defined(TARGET_LINUX) && defined(TARGET_ARM64)
result = max(result, GCToOSInterface::GetPageSize());
#endif
return result;
}
// TUNING: Needs tuning?
// When doing regular allocation we clean this much memory
// if we do cleaning, and if available.
@ -189,43 +185,55 @@ public:
return 16 * 1024;
}
// COMPlus_gcConservative
// DOTNET_gcConservative
static bool IsConservativeMode()
{
return (GCConfig::GetConservativeGC());
}
// COMPlus_gcConcurrent
static bool IsConcurrent()
// DOTNET_gcConcurrent
static bool IsConcurrentEnabled()
{
return (GCConfig::GetConcurrentGC());
}
// COMPlus_gcRelocatingGen1
// DOTNET_gcRelocatingGen1
static bool IsRelocatingInGen1()
{
return (GCConfig::GetRelocatingInGen1());
}
// COMPlus_gcRelocatingGen2
// DOTNET_gcRelocatingGen2
static bool IsRelocatingInGen2()
{
return (GCConfig::GetRelocatingInGen2());
}
// COMPlus_gcThreadLocal
static bool IsThreadLocalGCEnabled()
// DOTNET_gcGen0
static bool IsGen0Enabled()
{
return (GCConfig::GetThreadLocalGC());
return (GCConfig::GetGen0GC());
}
// COMPlus_gcTrim
// DOTNET_gcGen1
static bool IsGen1Enabled()
{
return (GCConfig::GetGen1GC());
}
// DOTNET_gcTHP
static bool UseTHP()
{
return (GCConfig::GetUseTHP());
}
// DOTNET_gcTrim
static bool IsTrimmingEnabled()
{
return (GCConfig::GetTrimmigGC());
}
// COMPlus_GCLatencyMode
// DOTNET_GCLatencyMode
static bool IsLowLatencyMode()
{
return (GCConfig::GetLatencyMode()) >= 2;
@ -242,11 +250,65 @@ public:
return partitionCount;
}
// COMPlus_gcParallel
static int MaxHelpersCount()
// DOTNET_gcParallel
static int MaxWorkersCount()
{
return (int)GCConfig::GetParallelGC();
}
// DOTNET_gcRate
static int GcRate()
{
int gcRate = (int)GCConfig::GetGCRate();
if (gcRate == -1)
{
#if _DEBUG
// minimum rate-limiting in debug
return 0;
#else
return 3;
#endif
}
return gcRate;
}
// DOTNET_gcSpin
static int GcSpin()
{
int gcSpin = (int)GCConfig::GetGCSpin();
if (gcSpin == -1)
{
return 10;
}
return gcSpin;
}
static size_t CommitGranularity()
{
// we can support sizes that are > OS page and binary fractions of REGION_SIZE_GRANULARITY.
// we can also support PAGE_SIZE_GRANULARITY
size_t result = 1024 * 32;
#if defined(TARGET_LINUX)
#if defined(TARGET_ARM64)
result = max(result, GCToOSInterface::GetPageSize());
#endif
if (UseTHP())
{
result = Satori::REGION_SIZE_GRANULARITY;
}
#endif
// result = Satori::REGION_SIZE_GRANULARITY;
// result = Satori::PAGE_SIZE_GRANULARITY;
return result;
}
};
#endif

View file

@ -31,6 +31,7 @@
#include "../gc.h"
#include "SatoriUtil.h"
#include "SatoriQueue.h"
#include "SatoriObject.h"
class SatoriWorkChunk
{
@ -51,11 +52,12 @@ public:
static size_t Capacity()
{
return (Satori::MARK_CHUNK_SIZE - sizeof(SatoriWorkChunk)) / sizeof(SatoriObject*);
return Satori::MARK_CHUNK_SIZE / sizeof(SatoriObject*) - /* m_top, m_next*/ 2;
}
size_t Count()
{
_ASSERTE(!IsRange());
return m_top;
}
@ -102,6 +104,21 @@ public:
return false;
}
void TakeFrom(SatoriWorkChunk* other, size_t count)
{
_ASSERTE(Count() == 0);
_ASSERTE(other->Count() >= count);
m_top = count;
other->m_top -= count;
size_t otherTop = other->m_top;
for (size_t i = 0; i < count; i++)
{
m_data[i] = other->m_data[otherTop + i];
}
}
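
Editor's note: TakeFrom pairs with the new SHARE_WORK_THRESHOLD constant added to SatoriUtil.h in this same commit ("if we have more than twice this much and work list is empty we can share half"). A hypothetical sketch of that hand-off, assuming the SatoriWorkChunk/SatoriWorkList headers from this commit; the names sharedList/spare and the call site are assumptions, not part of the diff:

void MaybeShareWork(SatoriWorkChunk* mine, SatoriWorkList* sharedList, SatoriWorkChunk* spare)
{
    if (sharedList->IsEmpty() && mine->Count() > 2 * Satori::SHARE_WORK_THRESHOLD)
    {
        spare->TakeFrom(mine, mine->Count() / 2);   // move the top half into the spare chunk
        sharedList->Push(spare);                    // publish it so idle workers can pick it up
    }
}
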
void SetNext(SatoriWorkChunk* next)
{
m_next = next;

View file

@ -0,0 +1,78 @@
// Copyright (c) 2024 Vladimir Sadov
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// SatoriWorkList.cpp
//
#include "common.h"
#include "gcenv.h"
#include "../env/gcenv.os.h"
#include "SatoriWorkList.h"
NOINLINE
void SatoriWorkList::PushSlow(SatoriWorkChunk* item)
{
uint32_t collisions = 1;
while (true)
{
SatoriWorkList orig = *this;
item->m_next = orig.m_head;
if (Cas128((int64_t*)this, orig.m_aba + 1, (int64_t)item, (int64_t*)&orig))
break;
SatoriLock::CollisionBackoff(collisions++);
}
#ifdef _DEBUG
Interlocked::Increment(&m_count);
#endif
}
NOINLINE
SatoriWorkChunk* SatoriWorkList::TryPopSlow()
{
uint32_t collisions = 1;
SatoriWorkList orig;
while (true)
{
orig = *this;
if (orig.m_head == nullptr)
{
return nullptr;
}
if (Cas128((int64_t*)this, orig.m_aba + 1, (int64_t)orig.m_head->m_next, (int64_t*)&orig))
break;
SatoriLock::CollisionBackoff(collisions++);
}
#ifdef _DEBUG
Interlocked::Decrement(&m_count);
#endif
SatoriWorkChunk* result = orig.m_head;
result->m_next = nullptr;
return result;
}

View file

@ -31,16 +31,43 @@
#include "../gc.h"
#include "SatoriWorkChunk.h"
#if defined(TARGET_WINDOWS)
FORCEINLINE uint8_t Cas128(int64_t volatile *pDst, int64_t iValueHigh, int64_t iValueLow, int64_t *pComparandAndResult)
{
return _InterlockedCompareExchange128(pDst, iValueHigh, iValueLow, pComparandAndResult);
}
#else
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Watomic-alignment"
FORCEINLINE uint8_t Cas128(int64_t volatile *pDst, int64_t iValueHigh, int64_t iValueLow, int64_t *pComparandAndResult)
{
__int128_t iValue = ((__int128_t)iValueHigh << 64) + (uint64_t)iValueLow;
return __atomic_compare_exchange_n ((__int128_t*)pDst, (__int128_t*)pComparandAndResult, iValue, /*weak*/ true, /* success_memorder */ __ATOMIC_SEQ_CST, /* failure_memorder */ __ATOMIC_RELAXED);
}
#pragma clang diagnostic pop
#endif // TARGET_WINDOWS
class SatoriWorkList
{
public:
SatoriWorkList() :
m_lock(), m_head()
m_head(), m_aba()
#ifdef _DEBUG
, m_count()
#endif
{}
static SatoriWorkList* AllocAligned()
{
m_lock.Initialize();
const size_t align = 64;
#ifdef _MSC_VER
void* buffer = _aligned_malloc(sizeof(SatoriWorkList), align);
#else
void* buffer = malloc(sizeof(SatoriWorkList) + align);
buffer = (void*)ALIGN_UP((size_t)buffer, align);
#endif
return new(buffer)SatoriWorkList();
}
bool IsEmpty()
@ -48,42 +75,44 @@ public:
return m_head == nullptr;
}
FORCEINLINE
void Push(SatoriWorkChunk* item)
{
_ASSERTE(item->m_next == nullptr);
SatoriLockHolder<SatoriSpinLock> holder(&m_lock);
item->m_next = m_head;
m_head = item;
SatoriWorkList orig = *this;
item->m_next = orig.m_head;
if (Cas128((int64_t*)this, orig.m_aba + 1, (int64_t)item, (int64_t*)&orig))
{
#ifdef _DEBUG
m_count++;
Interlocked::Increment(&m_count);
#endif
return;
}
PushSlow(item);
}
FORCEINLINE
SatoriWorkChunk* TryPop()
{
if (IsEmpty())
SatoriWorkList orig = *this;
if (orig.m_head == nullptr)
{
return nullptr;
}
SatoriWorkChunk* result;
if (Cas128((int64_t*)this, orig.m_aba + 1, (int64_t)orig.m_head->m_next, (int64_t*)&orig))
{
SatoriLockHolder<SatoriSpinLock> holder(&m_lock);
result = m_head;
if (result == nullptr)
{
return result;
}
m_head = result->m_next;
#ifdef _DEBUG
m_count--;
#endif
#ifdef _DEBUG
Interlocked::Decrement(&m_count);
#endif
SatoriWorkChunk* result = orig.m_head;
result->m_next = nullptr;
return result;
}
result->m_next = nullptr;
return result;
return TryPopSlow();
}
#ifdef _DEBUG
@ -94,11 +123,20 @@ public:
#endif
private:
SatoriSpinLock m_lock;
SatoriWorkChunk* m_head;
struct
{
SatoriWorkChunk* volatile m_head;
volatile size_t m_aba;
};
#ifdef _DEBUG
size_t m_count;
#endif
NOINLINE
void PushSlow(SatoriWorkChunk* item);
NOINLINE
SatoriWorkChunk* TryPopSlow();
};
#endif

View file

@ -306,3 +306,151 @@ bool GCEvent::CreateOSManualEventNoThrow(bool initialState)
m_impl = event;
return true;
}
#define _INC_PTHREADS
#include "..\satori\SatoriGate.h"
#if defined(TARGET_LINUX)
#include <linux/futex.h> /* Definition of FUTEX_* constants */
#include <sys/syscall.h> /* Definition of SYS_* constants */
#include <unistd.h>
#ifndef INT_MAX
#define INT_MAX 2147483647
#endif
SatoriGate::SatoriGate()
{
m_state = s_blocking;
}
// returns true if was woken up. false if timed out
bool SatoriGate::TimedWait(int timeout)
{
timespec t;
uint64_t nanoseconds = (uint64_t)timeout * tccMilliSecondsToNanoSeconds;
t.tv_sec = nanoseconds / tccSecondsToNanoSeconds;
t.tv_nsec = nanoseconds % tccSecondsToNanoSeconds;
long waitResult = syscall(SYS_futex, &m_state, FUTEX_WAIT_PRIVATE, s_blocking, &t, NULL, 0);
// woken, not blocking, interrupted, timeout
assert(waitResult == 0 || errno == EAGAIN || errno == ETIMEDOUT || errno == EINTR);
bool woken = waitResult == 0 || errno != ETIMEDOUT;
if (woken)
{
// consume the wake
m_state = s_blocking;
}
return woken;
}
void SatoriGate::Wait()
{
syscall(SYS_futex, &m_state, FUTEX_WAIT_PRIVATE, s_blocking, NULL, NULL, 0);
}
void SatoriGate::WakeAll()
{
m_state = s_open;
syscall(SYS_futex, &m_state, FUTEX_WAKE_PRIVATE, s_blocking, INT_MAX , NULL, 0);
}
void SatoriGate::WakeOne()
{
m_state = s_open;
syscall(SYS_futex, &m_state, FUTEX_WAKE_PRIVATE, s_blocking, 1, NULL, 0);
}
#else
SatoriGate::SatoriGate()
{
m_cs = new (nothrow) pthread_mutex_t();
m_cv = new (nothrow) pthread_cond_t();
pthread_mutex_init(m_cs, NULL);
pthread_condattr_t attrs;
pthread_condattr_init(&attrs);
#if HAVE_PTHREAD_CONDATTR_SETCLOCK && !HAVE_CLOCK_GETTIME_NSEC_NP
// Ensure that the pthread_cond_timedwait will use CLOCK_MONOTONIC
pthread_condattr_setclock(&attrs, CLOCK_MONOTONIC);
#endif // HAVE_PTHREAD_CONDATTR_SETCLOCK && !HAVE_CLOCK_GETTIME_NSEC_NP
pthread_cond_init(m_cv, &attrs);
pthread_condattr_destroy(&attrs);
}
// returns true if was woken up
bool SatoriGate::TimedWait(int timeout)
{
timespec endTime;
#if HAVE_CLOCK_GETTIME_NSEC_NP
uint64_t endNanoseconds;
uint64_t nanoseconds = (uint64_t)timeout * tccMilliSecondsToNanoSeconds;
NanosecondsToTimeSpec(nanoseconds, &endTime);
endNanoseconds = clock_gettime_nsec_np(CLOCK_UPTIME_RAW) + nanoseconds;
#elif HAVE_PTHREAD_CONDATTR_SETCLOCK
clock_gettime(CLOCK_MONOTONIC, &endTime);
TimeSpecAdd(&endTime, timeout);
#else
#error "Don't know how to perform timed wait on this platform"
#endif
int waitResult = 0;
pthread_mutex_lock(m_cs);
#if HAVE_CLOCK_GETTIME_NSEC_NP
// Since OSX doesn't support CLOCK_MONOTONIC, we use relative variant of the timed wait.
waitResult = m_state == s_open ?
0 :
pthread_cond_timedwait_relative_np(m_cv, m_cs, &endTime);
#else // HAVE_CLOCK_GETTIME_NSEC_NP
waitResult = m_state == SatoriGate::s_open ?
0 :
pthread_cond_timedwait(m_cv, m_cs, &endTime);
#endif // HAVE_CLOCK_GETTIME_NSEC_NP
pthread_mutex_unlock(m_cs);
assert(waitResult == 0 || waitResult == ETIMEDOUT);
bool woken = waitResult == 0;
if (woken)
{
// consume the wake
m_state = s_blocking;
}
return woken;
}
void SatoriGate::Wait()
{
int waitResult;
pthread_mutex_lock(m_cs);
waitResult = m_state == SatoriGate::s_open ?
0 :
pthread_cond_wait(m_cv, m_cs);
pthread_mutex_unlock(m_cs);
assert(waitResult == 0);
m_state = s_blocking;
}
void SatoriGate::WakeAll()
{
m_state = SatoriGate::s_open;
pthread_mutex_lock(m_cs);
pthread_cond_broadcast(m_cv);
pthread_mutex_unlock(m_cs);
}
void SatoriGate::WakeOne()
{
m_state = SatoriGate::s_open;
pthread_mutex_lock(m_cs);
pthread_cond_signal(m_cv);
pthread_mutex_unlock(m_cs);
}
#endif
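
Editor's note: the gate behaves as a self-resetting signal in both flavors above: WakeOne/WakeAll set the state to open and wake waiters, and a waiter that observes the wake flips the state back to blocking. A usage sketch with hypothetical surrounding names (TryGetWork/DoWork are placeholders, not part of the diff):

#include "SatoriGate.h"   // header added by this commit

bool TryGetWork();        // hypothetical hooks for the sketch
void DoWork();

void WorkerThreadMainLoopSketch(SatoriGate* gate)
{
    for (;;)
    {
        // park; re-check for work every 10 msec in case a wake raced with the check
        while (!TryGetWork())
        {
            gate->TimedWait(10);
        }
        DoWork();
    }
}

void AskForHelpSketch(SatoriGate* gate)
{
    gate->WakeOne();   // opens the gate and releases a parked worker, if any
}
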

View file

@ -628,7 +628,7 @@ void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t fl
return VirtualReserveInner(size, alignment, flags, 0, /* committing */ false);
}
void* GCToOSInterface::VirtualReserve(void* location, size_t size)
void* GCToOSInterface::VirtualReserve(void* location, size_t size, bool useTHP)
{
void* pRetVal = mmap(location, size, PROT_NONE, MAP_ANON | MAP_PRIVATE , -1, 0);
@ -643,10 +643,18 @@ void* GCToOSInterface::VirtualReserve(void* location, size_t size)
return NULL;
}
#ifdef TARGET_LINUX
if (useTHP)
{
madvise(pRetVal, size, MADV_HUGEPAGE);
}
#endif
#ifdef MADV_DONTDUMP
// Do not include reserved memory in coredump.
madvise(pRetVal, size, MADV_DONTDUMP);
#endif
return pRetVal;
}
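
Editor's note: MADV_HUGEPAGE only marks the range as eligible for transparent huge pages; whether the kernel actually backs it with 2 MB pages depends on whole aligned extents being committed and touched, which is presumably why CommitGranularity switches to REGION_SIZE_GRANULARITY when DOTNET_gcTHP is enabled (see SatoriUtil.h in this commit). A trimmed-down sketch of the reserve-then-advise idiom, with error handling and alignment omitted:

#include <sys/mman.h>
#include <cstddef>

void* ReserveWithTHP(size_t size)
{
    void* p = mmap(nullptr, size, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0);
    if (p == MAP_FAILED)
        return nullptr;

#ifdef MADV_HUGEPAGE
    // ask the kernel to back this range with transparent huge pages when possible
    madvise(p, size, MADV_HUGEPAGE);
#endif
    return p;
}
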

View file

@ -701,7 +701,7 @@ void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t fl
}
}
void* GCToOSInterface::VirtualReserve(void* location, size_t size)
void* GCToOSInterface::VirtualReserve(void* location, size_t size, bool useTHP /*unused*/)
{
DWORD memFlags = MEM_RESERVE;
return ::VirtualAlloc(location, size, memFlags, PAGE_READWRITE);

View file

@ -78,6 +78,7 @@ The .NET Foundation licenses this file to you under the MIT license.
<SdkNativeLibrary Include="user32.lib" />
<SdkNativeLibrary Include="version.lib" />
<SdkNativeLibrary Include="ws2_32.lib" />
<SdkNativeLibrary Include="Synchronization.lib" />
</ItemGroup>
<ItemGroup>

View file

@ -2371,3 +2371,10 @@ ucrtbase!memset
ucrtbase!realloc
ucrtbase!wmemcpy_s
ucrtbase!wmemmove_s
#
# Synchronization.lib
#
Synchronization!WaitOnAddress
Synchronization!WakeByAddressSingle
Synchronization!WakeByAddressAll
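
Editor's note: together with Synchronization.lib being added to the link inputs earlier in this commit, these imports suggest the Windows flavor of SatoriGate parks on WaitOnAddress/WakeByAddressSingle. That implementation is not shown in this diff, so the following is only a sketch of the idiom, not the actual code:

#define _WIN32_WINNT 0x0602   // WaitOnAddress needs Windows 8+
#include <windows.h>
#pragma comment(lib, "Synchronization.lib")

static volatile LONG g_gateState = 0;   // 0 - blocking, 1 - open

void GateWait()
{
    LONG blocking = 0;
    while (g_gateState == 0)
    {
        // sleeps only while g_gateState still compares equal to 'blocking'
        WaitOnAddress(&g_gateState, &blocking, sizeof(g_gateState), INFINITE);
    }
    g_gateState = 0;   // consume the wake, as the futex/pthread flavors above do
}

void GateWakeOne()
{
    g_gateState = 1;
    WakeByAddressSingle((PVOID)&g_gateState);
}
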

View file

@ -24,10 +24,10 @@ if(CLR_CMAKE_HOST_UNIX)
add_definitions(-DFEATURE_OBJCMARSHAL)
endif(CLR_CMAKE_TARGET_APPLE)
if(CLR_CMAKE_TARGET_ARCH_AMD64 OR CLR_CMAKE_TARGET_ARCH_I386)
if(CLR_CMAKE_TARGET_ARCH_AMD64 OR CLR_CMAKE_TARGET_ARCH_I386 OR CLR_CMAKE_TARGET_ARCH_ARM64)
# Allow 16 byte compare-exchange
add_compile_options(-mcx16)
endif(CLR_CMAKE_TARGET_ARCH_AMD64 OR CLR_CMAKE_TARGET_ARCH_I386)
endif(CLR_CMAKE_TARGET_ARCH_AMD64 OR CLR_CMAKE_TARGET_ARCH_I386 OR CLR_CMAKE_TARGET_ARCH_ARM64)
endif (CLR_CMAKE_HOST_UNIX)
if(CLR_CMAKE_TARGET_ANDROID)

View file

@ -67,6 +67,8 @@ set(COMMON_RUNTIME_SOURCES
${GC_DIR}/satori/SatoriAllocationContext.cpp
${GC_DIR}/satori/SatoriUtil.cpp
${GC_DIR}/satori/SatoriLock.cpp
${GC_DIR}/satori/SatoriWorkList.cpp
${GC_DIR}/satori/SatoriGate.cpp
)
set(SERVER_GC_SOURCES
@ -149,6 +151,7 @@ if (WIN32)
${GC_DIR}/satori/SatoriAllocationContext.h
${GC_DIR}/satori/SatoriUtil.h
${GC_DIR}/satori/SatoriLock.h
${GC_DIR}/satori/SatoriGate.h
)
include_directories(windows)

View file

@ -45,12 +45,18 @@ class Object
public:
MethodTable * GetMethodTable() const
{ return m_pEEType; }
MethodTable * GetGCSafeMethodTable() const
MethodTable* GetGCSafeMethodTable() const
#if !defined(FEATURE_SATORI_GC)
#ifdef TARGET_64BIT
{ return dac_cast<PTR_EEType>((dac_cast<TADDR>(m_pEEType)) & ~((uintptr_t)7)); }
#else
{ return dac_cast<PTR_EEType>((dac_cast<TADDR>(m_pEEType)) & ~((uintptr_t)3)); }
#endif
#else
// Satori does not mess up MT pointers.
{ return get_EEType(); }
#endif
ObjHeader * GetHeader() { return dac_cast<DPTR(ObjHeader)>(dac_cast<TADDR>(this) - SYNC_BLOCK_SKEW); }
#ifndef DACCESS_COMPILE
void set_EEType(MethodTable * pEEType)

View file

@ -336,13 +336,12 @@ LEAF_END RhpByRefAssignRef, _TEXT
// rsi - object
//
LEAF_ENTRY RhpCheckedAssignRef, _TEXT
// See if this is in GCHeap
mov rax, rdi
shr rax, 30 // round to page size ( >> PAGE_BITS )
add rax, [C_VAR(g_card_bundle_table)] // fetch the page byte map
cmp byte ptr [rax], 0
jne C_FUNC(RhpAssignRef)
// See if dst is in GCHeap
mov rax, [C_VAR(g_card_bundle_table)] // fetch the page byte map
mov r8, rdi
shr r8, 30 // dst page index
cmp byte ptr [rax + r8], 0
jne C_FUNC(CheckedEntry)
NotInHeap:
ALTERNATE_ENTRY RhpCheckedAssignRefAVLocation
@ -354,19 +353,23 @@ LEAF_END RhpCheckedAssignRef, _TEXT
// rdi - dest address
// rsi - object
//
.balign 16
LEAF_ENTRY RhpAssignRef, _TEXT
// check for escaping assignment
// 1) check if we own the source region
#ifdef FEATURE_SATORI_EXTERNAL_OBJECTS
mov rax, rsi
shr rax, 30 // round to page size ( >> PAGE_BITS )
add rax, [C_VAR(g_card_bundle_table)] // fetch the page byte map
cmp byte ptr [rax], 0
je JustAssign // src not in heap
// check if src is in heap
mov rax, [C_VAR(g_card_bundle_table)] // fetch the page byte map
ALTERNATE_ENTRY CheckedEntry
mov r8, rsi
shr r8, 30 // src page index
cmp byte ptr [rax + r8], 0
je JustAssign // src not in heap
#else
ALTERNATE_ENTRY CheckedEntry
#endif
// check for escaping assignment
// 1) check if we own the source region
mov rdx, rsi
and rdx, 0xFFFFFFFFFFE00000 // source region
@ -407,76 +410,86 @@ ALTERNATE_ENTRY RhpAssignRefAVLocationNotHeap
ALTERNATE_ENTRY RhpAssignRefAVLocation
mov [rdi], rsi
// TUNING: barriers in different modes could be separate pieces of code, but barrier switch
// needs to suspend EE, not sure if skipping mode check would worth that much.
mov r11, [C_VAR(g_write_watch_table)]
// set rdi per contract with JIT_ByRefWriteBarrier
mov rax, rdi
add rdi, 8
xor rsi, rdi
shr rsi, 21
// check the barrier state. this must be done after the assignment (in program order)
// if state == 2 we do not set or dirty cards.
cmp r11, 2
jne DoCards
// set rsi per contract with JIT_ByRefWriteBarrier
mov rsi, r10
Exit:
ret
DoCards:
// if same region, just check if barrier is not concurrent
xor rsi, rax
shr rsi, 21
// set rsi per contract with JIT_ByRefWriteBarrier
mov rsi, r10
jz CheckConcurrent // same region, just check if barrier is not concurrent
// if src is in gen2/3 and the barrier is not concurrent we do not need to mark cards
cmp dword ptr [rdx + 16], 2
jl MarkCards
CheckConcurrent:
cmp byte ptr [C_VAR(g_sw_ww_enabled_for_gc_heap)], 0
jne MarkCards
ret
// if concurrent, load card location
cmp r11, 0
je Exit
MarkCards:
// fetch card location for rax (saved rdi)
mov r9 , [C_VAR(g_card_table)] // fetch the page map
mov rdx, rax
mov rdx, rax
shr rax, 30
mov rax, qword ptr [r9 + rax * 8] // page
sub rdx, rax // offset in page
mov r8 ,rdx
mov r8, rdx
shr rdx, 9 // card offset
shr r8 , 21 // group offset
shr r8, 20 // group index
lea r8, [rax + r8 * 2 + 0x80] // group offset
// check if concurrent marking is in progress
cmp byte ptr [C_VAR(g_sw_ww_enabled_for_gc_heap)], 0
// check if concurrent marking is in progress
cmp r11, 0
jne DirtyCard
// SETTING CARD
// SETTING CARD
SetCard:
cmp byte ptr [rax + rdx], 0
jne CardSet
jne Exit
mov byte ptr [rax + rdx], 1
SetGroup:
cmp byte ptr [rax + r8 * 2 + 0x80], 0
cmp byte ptr [r8], 0
jne CardSet
mov byte ptr [rax + r8 * 2 + 0x80], 1
mov byte ptr [r8], 1
SetPage:
cmp byte ptr [rax], 0
jne CardSet
mov byte ptr [rax], 1
CardSet:
// check if concurrent marking is still not in progress
cmp byte ptr [C_VAR(g_sw_ww_enabled_for_gc_heap)], 0
// check if concurrent marking is still not in progress
cmp qword ptr [C_VAR(g_write_watch_table)], 0
jne DirtyCard
ret
// DIRTYING CARD
// DIRTYING CARD
DirtyCard:
mov byte ptr [rax + rdx], 4
DirtyGroup:
cmp byte ptr [rax + r8 * 2 + 0x80], 4
cmp byte ptr [r8], 4
je Exit
mov byte ptr [rax + r8 * 2 + 0x80], 4
mov byte ptr [r8], 4
DirtyPage:
cmp byte ptr [rax], 4
je Exit
mov byte ptr [rax], 4
Exit:
ret
// this is expected to be rare.
@ -484,12 +497,19 @@ ALTERNATE_ENTRY RhpAssignRefAVLocation
// 4) check if the source is escaped
mov rax, rsi
add rax, 8 // escape bit is MT + 1
and rax, 0x1FFFFF
shr rax, 3
bt qword ptr [rdx], rax
jb AssignAndMarkCards // source is already escaped.
// save rdi, rsi, rdx and r10 (possibly preadjusted rsi)
// Align rsp
mov r9, rsp
and rsp, -16
sub rsp, 8
// save rsp, rdi, rsi, rdx and r10 (possibly preadjusted rsi)
push r9
push rdi
push rsi
push rdx
@ -502,6 +522,7 @@ ALTERNATE_ENTRY RhpAssignRefAVLocation
pop rdx
pop rsi
pop rdi
pop rsp
jmp AssignAndMarkCards
LEAF_END RhpAssignRef, _TEXT
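
Editor's note: a rough C rendering of the card path above may help read the rewritten barrier. The group byte is now found at page + (offset >> 20) * 2 + 0x80 (1 MB groups, two bytes each, after the 0x80 page header), and the value loaded from g_write_watch_table acts as the barrier state: 2 appears to mean skip card work entirely, 0 means set cards, and anything else means concurrent marking, so cards are dirtied instead of set. This is an editor's sketch, not runtime code; `page` doubles as the card-table base exactly as rax does in the asm, and the re-check of the concurrent flag that the asm performs after setting cards is omitted:

#include <cstdint>
#include <cstddef>

const uint8_t CARD_SET   = 1;   // "remembered", per the card writes in the asm
const uint8_t CARD_DIRTY = 4;

void MarkCardsFor(uint8_t* page, size_t dst, intptr_t state)
{
    size_t   offset = dst - (size_t)page;
    uint8_t* card   = page + (offset >> 9);                // one card byte per 512 bytes
    uint8_t* group  = page + ((offset >> 20) * 2) + 0x80;  // one 2-byte group per 1 MB, after header

    if (state != 0)                                        // concurrent marking: dirty the whole path
    {
        *card = CARD_DIRTY;
        if (*group != CARD_DIRTY) *group = CARD_DIRTY;
        if (*page  != CARD_DIRTY) *page  = CARD_DIRTY;
        return;
    }

    if (*card == 0)                                        // otherwise: set, checking before writing
    {
        *card = CARD_SET;
        if (*group == 0) *group = CARD_SET;
        if (*page  == 0) *page  = CARD_SET;
    }
}
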
@ -521,14 +542,13 @@ LEAF_ENTRY RhpByRefAssignRef, _TEXT
ALTERNATE_ENTRY RhpByRefAssignRefAVLocation1
mov rsi, [rsi]
// See if assignment is into heap
mov rax, rdi
shr rax, 30 // round to page size ( >> PAGE_BITS )
add rax, [C_VAR(g_card_bundle_table)] // fetch the page byte map
cmp byte ptr [rax], 0
jne C_FUNC(RhpAssignRef)
// See if dst is in GCHeap
mov rax, [C_VAR(g_card_bundle_table)] // fetch the page byte map
mov r8, rdi
shr r8, 30 // dst page index
cmp byte ptr [rax + r8], 0
jne C_FUNC(CheckedEntry)
.balign 16
NotInHeap_RhpByRefAssignRef:
ALTERNATE_ENTRY RhpByRefAssignRefAVLocation2
mov [rdi], rsi
@ -540,13 +560,13 @@ LEAF_END RhpByRefAssignRef, _TEXT
LEAF_ENTRY RhpCheckedLockCmpXchg, _TEXT
// Setup rax with the new object for the exchange, that way it will automatically hold the correct result
// afterwards and we can leave rsi unaltered ready for the GC write barrier below.
mov rax, rdx
mov rax, rdx
mov r11, [C_VAR(g_card_bundle_table)] // fetch the page byte map
// check if dst is in heap
mov rdx, rdi
shr rdx, 30 // round to page size ( >> PAGE_BITS )
add rdx, [C_VAR(g_card_bundle_table)] // fetch the page byte map
cmp byte ptr [rdx], 0
cmp byte ptr [r11 + rdx], 0
je JustAssign_CmpXchg // dst not in heap
// check for escaping assignment
@ -554,8 +574,7 @@ LEAF_ENTRY RhpCheckedLockCmpXchg, _TEXT
#ifdef FEATURE_SATORI_EXTERNAL_OBJECTS
mov rdx, rsi
shr rdx, 30 // round to page size ( >> PAGE_BITS )
add rdx, [C_VAR(g_card_bundle_table)] // fetch the page byte map
cmp byte ptr [rdx], 0
cmp byte ptr [r11 + rdx], 0
je JustAssign_CmpXchg // src not in heap
#endif
@ -597,19 +616,30 @@ ALTERNATE_ENTRY RhpCheckedLockCmpXchgAVLocation
lock cmpxchg [rdi], rsi
jne Exit_CmpXchg
// TUNING: barriers in different modes could be separate pieces of code, but barrier switch
// needs to suspend EE, not sure if skipping mode check would worth that much.
mov r10, [C_VAR(g_write_watch_table)]
// check the barrier state. this must be done after the assignment (in program order)
// if state == 2 we do not set or dirty cards.
cmp r10, 2
jne DoCards_CmpXchg
Exit_CmpXchg:
ret
DoCards_CmpXchg:
// if same region, just check if barrier is not concurrent
xor rsi, rdi
shr rsi, 21
jz CheckConcurrent_CmpXchg // same region, just check if barrier is not concurrent
// TUNING: nonconcurrent and concurrent barriers could be separate pieces of code, but to switch
// need to suspend EE, not sure if skipping concurrent check would worth that much.
// if src is in gen2/3 and the barrier is not concurrent we do not need to mark cards
cmp dword ptr [rdx + 16], 2
jl MarkCards_CmpXchg
CheckConcurrent_CmpXchg:
cmp byte ptr [C_VAR(g_sw_ww_enabled_for_gc_heap)], 0
// if concurrent, load card location
cmp r10, 0
jne MarkCards_CmpXchg
ret
@ -622,29 +652,30 @@ ALTERNATE_ENTRY RhpCheckedLockCmpXchgAVLocation
sub rdx, r11 // offset in page
mov rsi,rdx
shr rdx, 9 // card offset
shr rsi, 21 // group offset
shr rsi, 20 // group index
lea rsi, [r11 + rsi * 2 + 0x80] // group offset
// check if concurrent marking is in progress
cmp byte ptr [C_VAR(g_sw_ww_enabled_for_gc_heap)], 0
// check if concurrent marking is in progress
cmp r10, 0
jne DirtyCard_CmpXchg
// SETTING CARD FOR rdi
SetCard_CmpXchg:
cmp byte ptr [r11 + rdx], 0
jne CardSet_CmpXchg
jne Exit_CmpXchg
mov byte ptr [r11 + rdx], 1
SetGroup_CmpXchg:
cmp byte ptr [r11 + rsi * 2 + 0x80], 0
cmp byte ptr [rsi], 0
jne CardSet_CmpXchg
mov byte ptr [r11 + rsi * 2 + 0x80], 1
mov byte ptr [rsi], 1
SetPage_CmpXchg:
cmp byte ptr [r11], 0
jne CardSet_CmpXchg
mov byte ptr [r11], 1
CardSet_CmpXchg:
// check if concurrent marking is still not in progress
cmp byte ptr [C_VAR(g_sw_ww_enabled_for_gc_heap)], 0
// check if concurrent marking is still not in progress
cmp qword ptr [C_VAR(g_write_watch_table)], 0
jne DirtyCard_CmpXchg
ret
@ -654,15 +685,13 @@ ALTERNATE_ENTRY RhpCheckedLockCmpXchgAVLocation
je Exit_CmpXchg
mov byte ptr [r11 + rdx], 4
DirtyGroup_CmpXchg:
cmp byte ptr [r11 + rsi * 2 + 0x80], 4
cmp byte ptr [rsi], 4
je Exit_CmpXchg
mov byte ptr [r11 + rsi * 2 + 0x80], 4
mov byte ptr [rsi], 4
DirtyPage_CmpXchg:
cmp byte ptr [r11], 4
je Exit_CmpXchg
mov byte ptr [r11], 4
Exit_CmpXchg:
ret
// this is expected to be rare.
@ -670,39 +699,45 @@ ALTERNATE_ENTRY RhpCheckedLockCmpXchgAVLocation
// 4) check if the source is escaped
mov r11, rsi
add r11, 8 // escape bit is MT + 1
and r11, 0x1FFFFF
shr r11, 3
bt qword ptr [rdx], r11
jb AssignAndMarkCards_CmpXchg // source is already escaped.
// save rax, rdi, rsi, rdx and have enough stack for the callee
// Align rsp
mov r9, rsp
and rsp, -16
sub rsp, 8
// save rsp, rax, rdi, rsi, rdx and have enough stack for the callee
push r9
push rax
push rdi
push rsi
push rdx
sub rsp, 0x20
// void SatoriRegion::EscapeFn(SatoriObject** dst, SatoriObject* src, SatoriRegion* region)
call qword ptr [rdx + 8]
add rsp, 0x20
pop rdx
pop rsi
pop rdi
pop rax
pop rsp
jmp AssignAndMarkCards_CmpXchg
LEAF_END RhpCheckedLockCmpXchg, _TEXT
LEAF_ENTRY RhpCheckedXchg, _TEXT
// Setup rax with the new object for the exchange, that way it will automatically hold the correct result
// afterwards and we can leave rsi unaltered ready for the GC write barrier below.
mov rax, rsi
mov rax, rsi
mov r11, [C_VAR(g_card_bundle_table)] // fetch the page byte map
// check if dst is in heap
mov rdx, rdi
shr rdx, 30 // round to page size ( >> PAGE_BITS )
add rdx, [C_VAR(g_card_bundle_table)] // fetch the page byte map
cmp byte ptr [rdx], 0
cmp byte ptr [r11 + rdx], 0
je JustAssign_Xchg // dst not in heap
// check for escaping assignment
@ -710,8 +745,7 @@ LEAF_ENTRY RhpCheckedXchg, _TEXT
#ifdef FEATURE_SATORI_EXTERNAL_OBJECTS
mov rdx, rsi
shr rdx, 30 // round to page size ( >> PAGE_BITS )
add rdx, [C_VAR(g_card_bundle_table)] // fetch the page byte map
cmp byte ptr [rdx], 0
cmp byte ptr [r11 + rdx], 0
je JustAssign_Xchg // src not in heap
#endif
@ -751,19 +785,30 @@ ALTERNATE_ENTRY RhpCheckedXchgAVLocationNotHeap
ALTERNATE_ENTRY RhpCheckedXchgAVLocation
xchg [rdi], rax
// TUNING: barriers in different modes could be separate pieces of code, but barrier switch
// needs to suspend EE, not sure if skipping mode check would worth that much.
mov r10, [C_VAR(g_write_watch_table)]
// check the barrier state. this must be done after the assignment (in program order)
// if state == 2 we do not set or dirty cards.
cmp r10, 2
jne DoCards_Xchg
Exit_Xchg:
ret
DoCards_Xchg:
// if same region, just check if barrier is not concurrent
xor rsi, rdi
shr rsi, 21
jz CheckConcurrent_Xchg // same region, just check if barrier is not concurrent
// TUNING: nonconcurrent and concurrent barriers could be separate pieces of code, but to switch
// need to suspend EE, not sure if skipping concurrent check would worth that much.
// if src is in gen2/3 and the barrier is not concurrent we do not need to mark cards
cmp dword ptr [rdx + 16], 2
jl MarkCards_Xchg
CheckConcurrent_Xchg:
cmp byte ptr [C_VAR(g_sw_ww_enabled_for_gc_heap)], 0
// if concurrent, load card location
cmp r10, 0
jne MarkCards_Xchg
ret
@ -776,29 +821,30 @@ ALTERNATE_ENTRY RhpCheckedXchgAVLocation
sub rdx, r11 // offset in page
mov rsi,rdx
shr rdx, 9 // card offset
shr rsi, 21 // group offset
shr rsi, 20 // group index
lea rsi, [r11 + rsi * 2 + 0x80] // group offset
// check if concurrent marking is in progress
cmp byte ptr [C_VAR(g_sw_ww_enabled_for_gc_heap)], 0
// check if concurrent marking is in progress
cmp r10, 0
jne DirtyCard_Xchg
// SETTING CARD FOR rdi
SetCard_Xchg:
cmp byte ptr [r11 + rdx], 0
jne CardSet_Xchg
jne Exit_Xchg
mov byte ptr [r11 + rdx], 1
SetGroup_Xchg:
cmp byte ptr [r11 + rsi * 2 + 0x80], 0
cmp byte ptr [rsi], 0
jne CardSet_Xchg
mov byte ptr [r11 + rsi * 2 + 0x80], 1
mov byte ptr [rsi], 1
SetPage_Xchg:
cmp byte ptr [r11], 0
jne CardSet_Xchg
mov byte ptr [r11], 1
CardSet_Xchg:
// check if concurrent marking is still not in progress
cmp byte ptr [C_VAR(g_sw_ww_enabled_for_gc_heap)], 0
// check if concurrent marking is still not in progress
cmp qword ptr [C_VAR(g_write_watch_table)], 0
jne DirtyCard_Xchg
ret
@ -808,15 +854,13 @@ ALTERNATE_ENTRY RhpCheckedXchgAVLocation
je Exit_Xchg
mov byte ptr [r11 + rdx], 4
DirtyGroup_Xchg:
cmp byte ptr [r11 + rsi * 2 + 0x80], 4
cmp byte ptr [rsi], 4
je Exit_Xchg
mov byte ptr [r11 + rsi * 2 + 0x80], 4
mov byte ptr [rsi], 4
DirtyPage_Xchg:
cmp byte ptr [r11], 4
je Exit_Xchg
mov byte ptr [r11], 4
Exit_Xchg:
ret
// this is expected to be rare.
@ -824,26 +868,32 @@ ALTERNATE_ENTRY RhpCheckedXchgAVLocation
// 4) check if the source is escaped
mov r11, rsi
add r11, 8 // escape bit is MT + 1
and r11, 0x1FFFFF
shr r11, 3
bt qword ptr [rdx], r11
jb AssignAndMarkCards_Xchg // source is already escaped.
// save rax, rdi, rsi, rdx and have enough stack for the callee
// Align rsp
mov r9, rsp
and rsp, -16
sub rsp, 8
// save rsp, rax, rdi, rsi, rdx and have enough stack for the callee
push r9
push rax
push rdi
push rsi
push rdx
sub rsp, 0x20
// void SatoriRegion::EscapeFn(SatoriObject** dst, SatoriObject* src, SatoriRegion* region)
call qword ptr [rdx + 8]
add rsp, 0x20
pop rdx
pop rsi
pop rdi
pop rax
pop rsp
jmp AssignAndMarkCards_Xchg
LEAF_END RhpCheckedXchg, _TEXT

View file

@ -353,12 +353,12 @@ else ;FEATURE_SATORI_GC
;
LEAF_ENTRY RhpCheckedAssignRef, _TEXT
; See if this is in GCHeap
mov rax, rcx
shr rax, 30 ; round to page size ( >> PAGE_BITS )
add rax, [g_card_bundle_table] ; fetch the page byte map
cmp byte ptr [rax], 0
jne RhpAssignRef
; See if dst is in GCHeap
mov rax, [g_card_bundle_table] ; fetch the page byte map
mov r8, rcx
shr r8, 30 ; dst page index
cmp byte ptr [rax + r8], 0
jne CheckedEntry
NotInHeap:
ALTERNATE_ENTRY RhpCheckedAssignRefAVLocation
@ -371,18 +371,21 @@ LEAF_END RhpCheckedAssignRef, _TEXT
; rdx - object
;
LEAF_ENTRY RhpAssignRef, _TEXT
align 16
; check for escaping assignment
; 1) check if we own the source region
ifdef FEATURE_SATORI_EXTERNAL_OBJECTS
mov rax, rdx
shr rax, 30 ; round to page size ( >> PAGE_BITS )
add rax, [g_card_bundle_table] ; fetch the page byte map
cmp byte ptr [rax], 0
; check if src is in heap
mov rax, [g_card_bundle_table] ; fetch the page byte map
ALTERNATE_ENTRY CheckedEntry
mov r8, rdx
shr r8, 30 ; dst page index
cmp byte ptr [rax + r8], 0
je JustAssign ; src not in heap
else
ALTERNATE_ENTRY CheckedEntry
endif
; check for escaping assignment
; 1) check if we own the source region
mov r8, rdx
and r8, 0FFFFFFFFFFE00000h ; source region
@ -416,21 +419,30 @@ ALTERNATE_ENTRY RhpAssignRefAVLocationNotHeap
ALTERNATE_ENTRY RhpAssignRefAVLocation
mov [rcx], rdx
; TUNING: barriers in different modes could be separate pieces of code, but barrier switch
; needs to suspend EE, not sure if skipping mode check would worth that much.
mov r11, qword ptr [g_write_watch_table]
; check the barrier state. this must be done after the assignment (in program order)
; if state == 2 we do not set or dirty cards.
cmp r11, 2h
jne DoCards
Exit:
ret
DoCards:
; if same region, just check if barrier is not concurrent
xor rdx, rcx
shr rdx, 21
jz CheckConcurrent ; same region, just check if barrier is not concurrent
; TUNING: nonconcurrent and concurrent barriers could be separate pieces of code, but to switch
; need to suspend EE, not sure if skipping concurrent check would worth that much.
jz CheckConcurrent
; if src is in gen2/3 and the barrier is not concurrent we do not need to mark cards
cmp dword ptr [r8 + 16], 2
jl MarkCards
CheckConcurrent:
cmp byte ptr [g_sw_ww_enabled_for_gc_heap], 0h
jne MarkCards
ret
cmp r11, 0h
je Exit
MarkCards:
; fetch card location for rcx
@ -441,21 +453,22 @@ ALTERNATE_ENTRY RhpAssignRefAVLocation
sub r8, rax ; offset in page
mov rdx,r8
shr r8, 9 ; card offset
shr rdx, 21 ; group offset
shr rdx, 20 ; group index
lea rdx, [rax + rdx * 2 + 80h] ; group offset
; check if concurrent marking is in progress
cmp byte ptr [g_sw_ww_enabled_for_gc_heap], 0h
cmp r11, 0h
jne DirtyCard
; SETTING CARD FOR RCX
SetCard:
cmp byte ptr [rax + r8], 0
jne CardSet
jne Exit
mov byte ptr [rax + r8], 1
SetGroup:
cmp byte ptr [rax + rdx * 2 + 80h], 0
cmp byte ptr [rdx], 0
jne CardSet
mov byte ptr [rax + rdx * 2 + 80h], 1
mov byte ptr [rdx], 1
SetPage:
cmp byte ptr [rax], 0
jne CardSet
@ -463,7 +476,7 @@ ALTERNATE_ENTRY RhpAssignRefAVLocation
CardSet:
; check if concurrent marking is still not in progress
cmp byte ptr [g_sw_ww_enabled_for_gc_heap], 0h
cmp qword ptr [g_write_watch_table], 0h
jne DirtyCard
ret
@ -471,15 +484,13 @@ ALTERNATE_ENTRY RhpAssignRefAVLocation
DirtyCard:
mov byte ptr [rax + r8], 4
DirtyGroup:
cmp byte ptr [rax + rdx * 2 + 80h], 4
cmp byte ptr [rdx], 4
je Exit
mov byte ptr [rax + rdx * 2 + 80h], 4
mov byte ptr [rdx], 4
DirtyPage:
cmp byte ptr [rax], 4
je Exit
mov byte ptr [rax], 4
Exit:
ret
; this is expected to be rare.
@ -487,12 +498,18 @@ ALTERNATE_ENTRY RhpAssignRefAVLocation
; 4) check if the source is escaped
mov rax, rdx
add rax, 8 ; escape bit is MT + 1
and rax, 01FFFFFh
shr rax, 3
bt qword ptr [r8], rax
jb AssignAndMarkCards ; source is already escaped.
; save rcx, rdx, r8 and have enough stack for the callee
; Align rsp
mov r9, rsp
and rsp, -16
; save rsp, rcx, rdx, r8 and have enough stack for the callee
push r9
push rcx
push rdx
push r8
@ -505,6 +522,7 @@ ALTERNATE_ENTRY RhpAssignRefAVLocation
pop r8
pop rdx
pop rcx
pop rsp
jmp AssignAndMarkCards
LEAF_END RhpAssignRef, _TEXT
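The MarkCards block above turns the destination address into three byte locations: a card (512 bytes), a card group (1 MB after this change, previously 2 MB, hence the shift by 20 instead of 21), and the page byte itself. Below is a simplified C++ sketch of that arithmetic, with the layout offsets taken from the assembly and the names invented for illustration; the real barrier's re-check for a concurrent transition at the end is omitted here.

    #include <cstdint>

    // g_card_table holds one SatoriPage pointer per 1 GB of address space (index = address >> 30).
    static void MarkCardsSketch(uint8_t** g_card_table, bool concurrent, uintptr_t dst)
    {
        uint8_t*  page   = g_card_table[dst >> 30];           // page covering dst
        uintptr_t offset = dst - (uintptr_t)page;              // offset of dst within the page
        uintptr_t card   = offset >> 9;                        // one card byte per 512 bytes
        uint8_t*  group  = page + 0x80 + (offset >> 20) * 2;   // group bytes start at page + 0x80

        if (!concurrent)                  // blocking GC: "set" state (1)
        {
            if (page[card] != 0) return;  // already set, nothing to do
            page[card] = 1;
            if (*group == 0)  *group  = 1;
            if (page[0] == 0) page[0] = 1;
        }
        else                              // concurrent marking: "dirty" state (4)
        {
            page[card] = 4;               // the real barrier orders this after the field write
            if (*group != 4)  *group  = 4;
            if (page[0] != 4) page[0] = 4;
        }
    }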
@ -526,14 +544,13 @@ ALTERNATE_ENTRY RhpByRefAssignRefAVLocation1
add rdi, 8h
add rsi, 8h
; See if assignment is into heap
mov rax, rcx
shr rax, 30 ; round to page size ( >> PAGE_BITS )
add rax, [g_card_bundle_table] ; fetch the page byte map
cmp byte ptr [rax], 0
jne RhpAssignRef
; See if dst is in GCHeap
mov rax, [g_card_bundle_table] ; fetch the page byte map
mov r8, rcx
shr r8, 30 ; dst page index
cmp byte ptr [rax + r8], 0
jne CheckedEntry
align 16
NotInHeap:
ALTERNATE_ENTRY RhpByRefAssignRefAVLocation2
mov [rcx], rdx
@ -543,13 +560,13 @@ LEAF_END RhpByRefAssignRef, _TEXT
LEAF_ENTRY RhpCheckedLockCmpXchg, _TEXT
;; Setup rax with the new object for the exchange, that way it will automatically hold the correct result
;; afterwards and we can leave rdx unaltered ready for the GC write barrier below.
mov rax, r8
mov rax, r8
mov r11, [g_card_bundle_table] ; fetch the page byte map
; check if dst is in heap
mov r8, rcx
shr r8, 30 ; round to page size ( >> PAGE_BITS )
add r8, [g_card_bundle_table] ; fetch the page byte map
cmp byte ptr [r8], 0
cmp byte ptr [r11 + r8], 0
je JustAssign ; dst not in heap
; check for escaping assignment
@ -557,8 +574,7 @@ LEAF_ENTRY RhpCheckedLockCmpXchg, _TEXT
ifdef FEATURE_SATORI_EXTERNAL_OBJECTS
mov r8, rdx
shr r8, 30 ; round to page size ( >> PAGE_BITS )
add r8, [g_card_bundle_table] ; fetch the page byte map
cmp byte ptr [r8], 0
cmp byte ptr [r11 + r8], 0
je JustAssign ; src not in heap
endif
@ -596,70 +612,78 @@ ALTERNATE_ENTRY RhpCheckedLockCmpXchgAVLocation
lock cmpxchg [rcx], rdx
jne Exit
; TUNING: barriers in different modes could be separate pieces of code, but barrier switch
; needs to suspend EE, not sure if skipping the mode check would be worth that much.
mov r11, qword ptr [g_write_watch_table]
; check the barrier state. this must be done after the assignment (in program order)
; if state == 2 we do not set or dirty cards.
cmp r11, 2h
jne DoCards
Exit:
ret
DoCards:
; if same region, just check if barrier is not concurrent
xor rdx, rcx
shr rdx, 21
jz CheckConcurrent ; same region, just check if barrier is not concurrent
; TUNING: nonconcurrent and concurrent barriers could be separate pieces of code, but to switch
; need to suspend EE, not sure if skipping the concurrent check would be worth that much.
jz CheckConcurrent
; if src is in gen2/3 and the barrier is not concurrent we do not need to mark cards
cmp dword ptr [r8 + 16], 2
jl MarkCards
CheckConcurrent:
cmp byte ptr [g_sw_ww_enabled_for_gc_heap], 0h
jne MarkCards
ret
cmp r11, 0h
je Exit
MarkCards:
; fetch card location for rcx
mov r9 , [g_card_table] ; fetch the page map
mov r8, rcx
shr rcx, 30
mov r11, qword ptr [r9 + rcx * 8] ; page
sub r8, r11 ; offset in page
mov r10, qword ptr [r9 + rcx * 8] ; page
sub r8, r10 ; offset in page
mov rdx,r8
shr r8, 9 ; card offset
shr rdx, 21 ; group offset
shr rdx, 20 ; group index
lea rdx, [r10 + rdx * 2 + 80h] ; group offset
; check if concurrent marking is in progress
cmp byte ptr [g_sw_ww_enabled_for_gc_heap], 0h
cmp r11, 0h
jne DirtyCard
; SETTING CARD FOR RCX
SetCard:
cmp byte ptr [r11 + r8], 0
jne CardSet
mov byte ptr [r11 + r8], 1
cmp byte ptr [r10 + r8], 0
jne Exit
mov byte ptr [r10 + r8], 1
SetGroup:
cmp byte ptr [r11 + rdx * 2 + 80h], 0
cmp byte ptr [rdx], 0
jne CardSet
mov byte ptr [r11 + rdx * 2 + 80h], 1
mov byte ptr [rdx], 1
SetPage:
cmp byte ptr [r11], 0
cmp byte ptr [r10], 0
jne CardSet
mov byte ptr [r11], 1
mov byte ptr [r10], 1
CardSet:
; check if concurrent marking is still not in progress
cmp byte ptr [g_sw_ww_enabled_for_gc_heap], 0h
cmp qword ptr [g_write_watch_table], 0h
jne DirtyCard
ret
; DIRTYING CARD FOR RCX
DirtyCard:
mov byte ptr [r11 + r8], 4
mov byte ptr [r10 + r8], 4
DirtyGroup:
cmp byte ptr [r11 + rdx * 2 + 80h], 4
cmp byte ptr [rdx], 4
je Exit
mov byte ptr [r11 + rdx * 2 + 80h], 4
mov byte ptr [rdx], 4
DirtyPage:
cmp byte ptr [r11], 4
cmp byte ptr [r10], 4
je Exit
mov byte ptr [r11], 4
Exit:
mov byte ptr [r10], 4
ret
; this is expected to be rare.
@ -667,39 +691,46 @@ ALTERNATE_ENTRY RhpCheckedLockCmpXchgAVLocation
; 4) check if the source is escaped
mov r11, rdx
add r11, 8 ; escape bit is MT + 1
and r11, 01FFFFFh
shr r11, 3
bt qword ptr [r8], r11
jb AssignAndMarkCards ; source is already escaped.
; save rax, rcx, rdx, r8 and have enough stack for the callee
; Align rsp
mov r9, rsp
and rsp, -16
; save rsp, rax, rcx, rdx, r8 and have enough stack for the callee
push r9
push rax
push rcx
push rdx
push r8
sub rsp, 20h
sub rsp, 28h
; void SatoriRegion::EscapeFn(SatoriObject** dst, SatoriObject* src, SatoriRegion* region)
call qword ptr [r8 + 8]
add rsp, 20h
add rsp, 28h
pop r8
pop rdx
pop rcx
pop rax
pop rsp
jmp AssignAndMarkCards
LEAF_END RhpCheckedLockCmpXchg, _TEXT
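Stripped of the card-marking details, the helper above is an interlocked compare-exchange that only runs the barrier tail when the exchange actually happened, and hands the previously stored value back in rax. A small C++ sketch of that contract; Object and UpdateCardsSketch are stand-ins, not runtime types.

    #include <atomic>
    #include <cstdint>

    struct Object;

    // stands for the MarkCards/DirtyCard tail shown above
    static void UpdateCardsSketch(uintptr_t dst, Object* value) { (void)dst; (void)value; }

    static Object* CheckedCompareExchangeSketch(std::atomic<Object*>* dst, Object* value, Object* comparand)
    {
        Object* observed = comparand;
        if (dst->compare_exchange_strong(observed, value))   // lock cmpxchg [rcx], rdx
        {
            UpdateCardsSketch((uintptr_t)dst, value);         // cards are needed only if 'value' was published
        }
        return observed;                                      // the value that was in *dst before the call
    }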
LEAF_ENTRY RhpCheckedXchg, _TEXT
;; Setup rax with the new object for the exchange, that way it will automatically hold the correct result
;; afterwards and we can leave rdx unaltered ready for the GC write barrier below.
mov rax, rdx
mov rax, rdx
mov r11, [g_card_bundle_table] ; fetch the page byte map
; check if dst is in heap
mov r8, rcx
shr r8, 30 ; round to page size ( >> PAGE_BITS )
add r8, [g_card_bundle_table] ; fetch the page byte map
cmp byte ptr [r8], 0
cmp byte ptr [r11 + r8], 0
je JustAssign ; dst not in heap
; check for escaping assignment
@ -707,8 +738,7 @@ LEAF_ENTRY RhpCheckedXchg, _TEXT
ifdef FEATURE_SATORI_EXTERNAL_OBJECTS
mov r8, rdx
shr r8, 30 ; round to page size ( >> PAGE_BITS )
add r8, [g_card_bundle_table] ; fetch the page byte map
cmp byte ptr [r8], 0
cmp byte ptr [r11 + r8], 0
je JustAssign ; src not in heap
endif
@ -744,70 +774,78 @@ ALTERNATE_ENTRY RhpCheckedXchgAVLocationNotHeap
ALTERNATE_ENTRY RhpCheckedXchgAVLocation
xchg [rcx], rax
; TUNING: barriers in different modes could be separate pieces of code, but barrier switch
; needs to suspend EE, not sure if skipping the mode check would be worth that much.
mov r11, qword ptr [g_write_watch_table]
; check the barrier state. this must be done after the assignment (in program order)
; if state == 2 we do not set or dirty cards.
cmp r11, 2h
jne DoCards
Exit:
ret
DoCards:
; if same region, just check if barrier is not concurrent
xor rdx, rcx
shr rdx, 21
jz CheckConcurrent ; same region, just check if barrier is not concurrent
; TUNING: nonconcurrent and concurrent barriers could be separate pieces of code, but to switch
; need to suspend EE, not sure if skipping the concurrent check would be worth that much.
jz CheckConcurrent
; if src is in gen2/3 and the barrier is not concurrent we do not need to mark cards
cmp dword ptr [r8 + 16], 2
jl MarkCards
CheckConcurrent:
cmp byte ptr [g_sw_ww_enabled_for_gc_heap], 0h
jne MarkCards
ret
cmp r11, 0h
je Exit
MarkCards:
; fetch card location for rcx
mov r9 , [g_card_table] ; fetch the page map
mov r8, rcx
shr rcx, 30
mov r11, qword ptr [r9 + rcx * 8] ; page
sub r8, r11 ; offset in page
mov r10, qword ptr [r9 + rcx * 8] ; page
sub r8, r10 ; offset in page
mov rdx,r8
shr r8, 9 ; card offset
shr rdx, 21 ; group offset
shr rdx, 20 ; group index
lea rdx, [r10 + rdx * 2 + 80h] ; group offset
; check if concurrent marking is in progress
cmp byte ptr [g_sw_ww_enabled_for_gc_heap], 0h
cmp r11, 0h
jne DirtyCard
; SETTING CARD FOR RCX
SetCard:
cmp byte ptr [r11 + r8], 0
jne CardSet
mov byte ptr [r11 + r8], 1
cmp byte ptr [r10 + r8], 0
jne Exit
mov byte ptr [r10 + r8], 1
SetGroup:
cmp byte ptr [r11 + rdx * 2 + 80h], 0
cmp byte ptr [rdx], 0
jne CardSet
mov byte ptr [r11 + rdx * 2 + 80h], 1
mov byte ptr [rdx], 1
SetPage:
cmp byte ptr [r11], 0
cmp byte ptr [r10], 0
jne CardSet
mov byte ptr [r11], 1
mov byte ptr [r10], 1
CardSet:
; check if concurrent marking is still not in progress
cmp byte ptr [g_sw_ww_enabled_for_gc_heap], 0h
cmp qword ptr [g_write_watch_table], 0h
jne DirtyCard
ret
; DIRTYING CARD FOR RCX
DirtyCard:
mov byte ptr [r11 + r8], 4
mov byte ptr [r10 + r8], 4
DirtyGroup:
cmp byte ptr [r11 + rdx * 2 + 80h], 4
cmp byte ptr [rdx], 4
je Exit
mov byte ptr [r11 + rdx * 2 + 80h], 4
mov byte ptr [rdx], 4
DirtyPage:
cmp byte ptr [r11], 4
cmp byte ptr [r10], 4
je Exit
mov byte ptr [r11], 4
Exit:
mov byte ptr [r10], 4
ret
; this is expected to be rare.
@ -815,26 +853,33 @@ ALTERNATE_ENTRY RhpCheckedXchgAVLocation
; 4) check if the source is escaped
mov r11, rdx
add r11, 8 ; escape bit is MT + 1
and r11, 01FFFFFh
shr r11, 3
bt qword ptr [r8], r11
jb AssignAndMarkCards ; source is already escaped.
; save rax, rcx, rdx, r8 and have enough stack for the callee
; Align rsp
mov r9, rsp
and rsp, -16
; save rsp, rax, rcx, rdx, r8 and have enough stack for the callee
push r9
push rax
push rcx
push rdx
push r8
sub rsp, 20h
sub rsp, 28h
; void SatoriRegion::EscapeFn(SatoriObject** dst, SatoriObject* src, SatoriRegion* region)
call qword ptr [r8 + 8]
add rsp, 20h
add rsp, 28h
pop r8
pop rdx
pop rcx
pop rax
pop rsp
jmp AssignAndMarkCards
LEAF_END RhpCheckedXchg, _TEXT
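Both checked entry points in this file start with the same question: is the destination (and, for external objects, the source) inside the GC heap at all? The answer comes from a byte map with one entry per 1 GB of address space, which is why every check shifts the address right by 30. A sketch of the test, with the table passed in explicitly; the rewritten prologues load the table pointer once and keep it live across the jump to CheckedEntry so the source check can reuse it.

    #include <cstdint>

    // g_card_bundle_table: one byte per 1 GB of address space; nonzero means the GC heap owns a page there.
    static bool IsInGCHeapSketch(const uint8_t* g_card_bundle_table, uintptr_t address)
    {
        return g_card_bundle_table[address >> 30] != 0;   // "shr ..., 30" in the assembly
    }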


@ -434,19 +434,23 @@ LEAF_END RhpByRefAssignRefArm64, _TEXT
// on the managed heap.
//
// On entry:
// x14 : the destination address (LHS of the assignment).
// x14 : the destination address (LHS of the assignment)
// May not be a heap location (hence the checked).
// x15 : the object reference (RHS of the assignment).
// x15 : the object reference (RHS of the assignment)
//
// On exit:
// x12, x17 : trashed
// x14 : incremented by 8
// x12 : trashed
// x14 : trashed (incremented by 8 to implement JIT_ByRefWriteBarrier contract)
// x15 : trashed
// x16 : trashed (ip0)
// x17 : trashed (ip1)
LEAF_ENTRY RhpCheckedAssignRefArm64, _TEXT
PREPARE_EXTERNAL_VAR_INDIRECT g_card_bundle_table, x12
add x12, x12, x14, lsr #30
ldrb w12, [x12]
cbz x12, LOCAL_LABEL(NotInHeap)
b C_FUNC(RhpAssignRefArm64)
// See if dst is in GCHeap
PREPARE_EXTERNAL_VAR_INDIRECT g_card_bundle_table, x16
lsr x17, x14, #30 // dst page index
ldrb w12, [x16, x17]
cbz x12 , LOCAL_LABEL(NotInHeap)
b C_FUNC(CheckedEntry)
LOCAL_LABEL(NotInHeap):
ALTERNATE_ENTRY RhpCheckedAssignRefAVLocation
@ -460,25 +464,30 @@ LEAF_END RhpCheckedAssignRefArm64, _TEXT
// reside on the managed heap.
//
// On entry:
// x14 : the destination address (LHS of the assignment).
// x15 : the object reference (RHS of the assignment).
// x14 : the destination address (LHS of the assignment)
// x15 : the object reference (RHS of the assignment)
//
// On exit:
// x12, x17 : trashed
// x14 : incremented by 8
// x12 : trashed
// x14 : trashed (incremented by 8 to implement JIT_ByRefWriteBarrier contract)
// x15 : trashed
// x16 : trashed (ip0)
// x17 : trashed (ip1)
LEAF_ENTRY RhpAssignRefArm64, _TEXT
// check for escaping assignment
// 1) check if we own the source region
#ifdef FEATURE_SATORI_EXTERNAL_OBJECTS
PREPARE_EXTERNAL_VAR_INDIRECT g_card_bundle_table, x12
add x12, x12, x15, lsr #30
ldrb w12, [x12]
cbz x12, LOCAL_LABEL(JustAssign)
PREPARE_EXTERNAL_VAR_INDIRECT g_card_bundle_table, x16
ALTERNATE_ENTRY CheckedEntry
lsr x17, x15, #30 // source page index
ldrb w12, [x16, x17]
cbz x12, LOCAL_LABEL(JustAssign) // null or external (immutable) object
#else
ALTERNATE_ENTRY CheckedEntry
cbz x15, LOCAL_LABEL(JustAssign) // assigning null
#endif
and x12, x15, #0xFFFFFFFFFFE00000 // source region
ldr x12, [x12] // region tag
and x16, x15, #0xFFFFFFFFFFE00000 // source region
ldr x12, [x16] // region tag
#ifdef TARGET_OSX
mrs x17, TPIDRRO_EL0
and x17, x17, #-8 // thread tag on OSX
@ -489,119 +498,124 @@ LEAF_ENTRY RhpAssignRefArm64, _TEXT
bne LOCAL_LABEL(AssignAndMarkCards) // not local to this thread
// 2) check if the src and dst are from the same region
eor x12, x14, x15
lsr x12, x12, #21
cbnz x12, LOCAL_LABEL(RecordEscape) // cross region assignment. definitely escaping
and x12, x14, #0xFFFFFFFFFFE00000 // target aligned to region
cmp x12, x16
bne LOCAL_LABEL(RecordEscape) // cross region assignment. definitely escaping
// 3) check if the target is exposed
ubfx x17, x14,#9,#12 // word index = (dst >> 9) & 0x1FFFFF
and x12, x15, #0xFFFFFFFFFFE00000 // source region
ldr x17, [x12, x17, lsl #3] // mark word = [region + index * 8]
lsr x12, x14, #3 // bit = (dst >> 3) [& 63]
lsr x17, x17, x12
tbnz x17, #0, LOCAL_LABEL(RecordEscape) // target is exposed. record an escape.
str x15, [x14], #8 // UNORDERED assignment of unescaped object
ret lr
ubfx x17, x14,#9,#12 // word index = (dst >> 9) & 0x1FFFFF
ldr x17, [x16, x17, lsl #3] // mark word = [region + index * 8]
lsr x12, x14, #3 // bit = (dst >> 3) [& 63]
lsr x17, x17, x12
tbnz x17, #0, LOCAL_LABEL(RecordEscape) // target is exposed. record an escape.
// UNORDERED! assignment of unescaped, null or external (immutable) object
LOCAL_LABEL(JustAssign):
ALTERNATE_ENTRY RhpAssignRefAVLocationNotHeap
stlr x15, [x14] // no card marking, src is not a heap object
add x14, x14, 8
ret lr
str x15, [x14], #8
ret lr
LOCAL_LABEL(AssignAndMarkCards):
ALTERNATE_ENTRY RhpAssignRefAVLocation
stlr x15, [x14]
// need couple temps. Save before using.
stp x2, x3, [sp, -16]!
// TUNING: barriers in different modes could be separate pieces of code, but barrier switch
// needs to suspend EE, not sure if skipping the mode check would be worth that much.
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x17
// check the barrier state. this must be done after the assignment (in program order)
// if state == 2 we do not set or dirty cards.
tbz x17, #1, LOCAL_LABEL(DoCards)
eor x12, x14, x15
lsr x12, x12, #21
cbz x12, LOCAL_LABEL(CheckConcurrent) // same region, just check if barrier is not concurrent
LOCAL_LABEL(ExitNoCards):
add x14, x14, 8
ret lr
LOCAL_LABEL(DoCards):
// if same region, just check if barrier is not concurrent
and x12, x14, #0xFFFFFFFFFFE00000 // target aligned to region
cmp x12, x16
beq LOCAL_LABEL(CheckConcurrent) // same region, just check if barrier is not concurrent
// if src is in gen2/3 and the barrier is not concurrent we do not need to mark cards
and x2, x15, #0xFFFFFFFFFFE00000 // source region
ldr w12, [x2, 16]
ldr w12, [x16, 16] // source region + 16 -> generation
tbz x12, #1, LOCAL_LABEL(MarkCards)
LOCAL_LABEL(CheckConcurrent):
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12 // !g_write_watch_table -> !concurrent
cbnz x12, LOCAL_LABEL(MarkCards)
// if not concurrent, exit
cbz x17, LOCAL_LABEL(ExitNoCards)
LOCAL_LABEL(MarkCards):
// need couple temps. Save before using.
stp x2, x3, [sp, -16]!
// fetch card location for x14
PREPARE_EXTERNAL_VAR_INDIRECT g_card_table, x12 // fetch the page map
lsr x16, x14, #30
ldr x16, [x12, x16, lsl #3] // page
sub x2, x14, x16 // offset in page
lsr x15, x2, #20 // group index
lsr x2, x2, #9 // card offset
lsl x15, x15, #1 // group offset (index * 2)
// check if concurrent marking is in progress
cbnz x17, LOCAL_LABEL(DirtyCard)
// SETTING CARD FOR X14
LOCAL_LABEL(SetCard):
ldrb w3, [x16, x2]
cbnz w3, LOCAL_LABEL(Exit)
mov w17, #1
strb w17, [x16, x2]
LOCAL_LABEL(SetGroup):
add x12, x16, #0x80
ldrb w3, [x12, x15]
cbnz w3, LOCAL_LABEL(CardSet)
strb w17, [x12, x15]
LOCAL_LABEL(SetPage):
ldrb w3, [x16]
cbnz w3, LOCAL_LABEL(CardSet)
strb w17, [x16]
LOCAL_LABEL(CardSet):
// check if concurrent marking is still not in progress
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12
cbnz x12, LOCAL_LABEL(DirtyCard)
LOCAL_LABEL(Exit):
ldp x2, x3, [sp], 16
add x14, x14, 8
ret lr
LOCAL_LABEL(MarkCards):
// fetch card location for x14
PREPARE_EXTERNAL_VAR_INDIRECT g_card_table, x12 // fetch the page map
lsr x17, x14, #30
ldr x17, [x12, x17, lsl #3] // page
sub x2, x14, x17 // offset in page
lsr x15, x2, #21 // group index
lsl x15, x15, #1 // group offset (index * 2)
lsr x2, x2, #9 // card offset
// check if concurrent marking is in progress
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12 // !g_write_watch_table -> !concurrent
cbnz x12, LOCAL_LABEL(DirtyCard)
// SETTING CARD FOR X14
LOCAL_LABEL(SetCard):
ldrb w3, [x17, x2]
cbnz w3, LOCAL_LABEL(CardSet)
mov w16, #1
strb w16, [x17, x2]
LOCAL_LABEL(SetGroup):
add x12, x17, #0x80
ldrb w3, [x12, x15]
cbnz w3, LOCAL_LABEL(CardSet)
strb w16, [x12, x15]
LOCAL_LABEL(SetPage):
ldrb w3, [x17]
cbnz w3, LOCAL_LABEL(CardSet)
strb w16, [x17]
LOCAL_LABEL(CardSet):
// check if concurrent marking is still not in progress
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12 // !g_write_watch_table -> !concurrent
cbnz x12, LOCAL_LABEL(DirtyCard)
b LOCAL_LABEL(Exit)
// DIRTYING CARD FOR X14
LOCAL_LABEL(DirtyCard):
mov w16, #4
add x2, x2, x17
mov w17, #4
add x2, x2, x16
// must be after the field write to allow concurrent clean
stlrb w16, [x2]
stlrb w17, [x2]
LOCAL_LABEL(DirtyGroup):
add x12, x17, #0x80
add x12, x16, #0x80
ldrb w3, [x12, x15]
tbnz w3, #2, LOCAL_LABEL(Exit)
strb w16, [x12, x15]
strb w17, [x12, x15]
LOCAL_LABEL(DirtyPage):
ldrb w3, [x17]
ldrb w3, [x16]
tbnz w3, #2, LOCAL_LABEL(Exit)
strb w16, [x17]
strb w17, [x16]
b LOCAL_LABEL(Exit)
// this is expected to be rare.
LOCAL_LABEL(RecordEscape):
// 4) check if the source is escaped
and x12, x15, #0xFFFFFFFFFFE00000 // source region
add x16, x15, #8 // escape bit is MT + 1
ubfx x17, x16, #9,#12 // word index = (dst >> 9) & 0x1FFFFF
ldr x17, [x12, x17, lsl #3] // mark word = [region + index * 8]
lsr x12, x16, #3 // bit = (dst >> 3) [& 63]
// 4) check if the source is escaped (x16 has source region)
add x12, x15, #8 // escape bit is MT + 1
ubfx x17, x12, #9,#12 // word index = (dst >> 9) & 0x1FFFFF
ldr x17, [x16, x17, lsl #3] // mark word = [region + index * 8]
lsr x12, x12, #3 // bit = (dst >> 3) [& 63]
lsr x17, x17, x12
tbnz x17, #0, LOCAL_LABEL(AssignAndMarkCards) // source is already escaped.
// because of the barrier call convention
// we need to preserve caller-saved x0 through x18 and x29/x30
// we need to preserve caller-saved x0 through x15 and x29/x30
stp x29,x30, [sp, -16 * 9]!
stp x0, x1, [sp, 16 * 1]
@ -616,8 +630,8 @@ ALTERNATE_ENTRY RhpAssignRefAVLocation
// void SatoriRegion::EscapeFn(SatoriObject** dst, SatoriObject* src, SatoriRegion* region)
// mov x0, x14 EscapeFn does not use dst, it is just to avoid arg shuffle on x64
mov x1, x15
and x2, x15, #0xFFFFFFFFFFE00000 // source region
ldr x12, [x2, #8] // EscapeFn address
mov x2, x16 // source region
ldr x12, [x16, #8] // EscapeFn address
blr x12
ldp x0, x1, [sp, 16 * 1]
@ -630,6 +644,7 @@ ALTERNATE_ENTRY RhpAssignRefAVLocation
ldp x14,x15, [sp, 16 * 8]
ldp x29,x30, [sp], 16 * 9
and x16, x15, #0xFFFFFFFFFFE00000 // source region
b LOCAL_LABEL(AssignAndMarkCards)
LEAF_END RhpAssignRefArm64, _TEXT
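The "is the target exposed" test above is a bitmap lookup: one bit per 8-byte slot, stored in mark words at the start of the 2 MB region that contains the slot. Here is a C++ reading of the ubfx/lsr sequence; the function name is illustrative. Keeping the region base in x16 is what lets the rewritten code drop the extra "and x12, x15, #0xFFFFFFFFFFE00000" before the lookup.

    #include <cstdint>

    static bool IsExposedSketch(const uint64_t* region /* 2 MB aligned */, uintptr_t dst)
    {
        uintptr_t word = (dst >> 9) & 0xFFF;   // ubfx x17, x14, #9, #12
        uintptr_t bit  = (dst >> 3) & 63;      // lsr x12, x14, #3; the variable shift uses the amount mod 64
        return (region[word] >> bit) & 1;
    }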
@ -670,8 +685,8 @@ LEAF_ENTRY RhpCheckedLockCmpXchg
#else
cbz x1, LOCAL_LABEL(JustAssign_Cmp_Xchg) // assigning null
#endif
and x12, x1, #0xFFFFFFFFFFE00000 // source region
ldr x12, [x12] // region tag
and x16, x1, #0xFFFFFFFFFFE00000 // source region
ldr x12, [x16] // region tag
#ifdef TARGET_OSX
mrs x17, TPIDRRO_EL0
and x17, x17, #-8 // thread tag on OSX
@ -682,14 +697,13 @@ LEAF_ENTRY RhpCheckedLockCmpXchg
bne LOCAL_LABEL(AssignAndMarkCards_Cmp_Xchg) // not local to this thread
// 2) check if the src and dst are from the same region
eor x12, x0, x1
lsr x12, x12, #21
cbnz x12, LOCAL_LABEL(RecordEscape_Cmp_Xchg) // cross region assignment. definitely escaping
and x12, x0, #0xFFFFFFFFFFE00000 // target aligned to region
cmp x12, x16
bne LOCAL_LABEL(RecordEscape_Cmp_Xchg) // cross region assignment. definitely escaping
// 3) check if the target is exposed
ubfx x17, x0,#9,#12 // word index = (dst >> 9) & 0x1FFFFF
and x12, x1, #0xFFFFFFFFFFE00000 // source region
ldr x17, [x12, x17, lsl #3] // mark word = [region + index * 8]
ldr x17, [x16, x17, lsl #3] // mark word = [region + index * 8]
lsr x12, x0, #3 // bit = (dst >> 3) [& 63]
lsr x17, x17, x12
tbnz x17, #0, LOCAL_LABEL(RecordEscape_Cmp_Xchg) // target is exposed. record an escape.
@ -704,8 +718,8 @@ ALTERNATE_ENTRY RhpCheckedLockCmpXchgAVLocationNotHeap
mov x15, x1 // x15 = val
#ifndef LSE_INSTRUCTIONS_ENABLED_BY_DEFAULT
PREPARE_EXTERNAL_VAR_INDIRECT_W g_cpuFeatures, 16
tbz w16, #ARM64_ATOMICS_FEATURE_FLAG_BIT, LOCAL_LABEL(TryAgain1_Cmp_Xchg)
PREPARE_EXTERNAL_VAR_INDIRECT_W g_cpuFeatures, 17
tbz w17, #ARM64_ATOMICS_FEATURE_FLAG_BIT, LOCAL_LABEL(TryAgain1_Cmp_Xchg)
#endif
mov x17, x2
@ -713,7 +727,7 @@ ALTERNATE_ENTRY RhpCheckedLockCmpXchgAVLocation
casal x2, x1, [x0] // exchange
mov x0, x2 // x0 = result
cmp x2, x17
bne LOCAL_LABEL(Exit_Cmp_Xchg)
bne LOCAL_LABEL(Exit_Cmp_XchgNoCards)
#ifndef LSE_INSTRUCTIONS_ENABLED_BY_DEFAULT
b LOCAL_LABEL(SkipLLScCmpXchg)
@ -736,87 +750,99 @@ LOCAL_LABEL(SkipLLScCmpXchg):
#endif
cbnz x10, LOCAL_LABEL(DoCardsCmpXchg)
LOCAL_LABEL(Exit_Cmp_Xchg):
LOCAL_LABEL(Exit_Cmp_XchgNoCards):
ret lr
LOCAL_LABEL(DoCardsCmpXchg):
eor x12, x14, x15
lsr x12, x12, #21
cbz x12, LOCAL_LABEL(CheckConcurrent_Cmp_Xchg) // same region, just check if barrier is not concurrent
// TUNING: barriers in different modes could be separate pieces of code, but barrier switch
// needs to suspend EE, not sure if skipping the mode check would be worth that much.
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x17
// check the barrier state. this must be done after the assignment (in program order)
// if state == 2 we do not set or dirty cards.
tbnz x17, #1, LOCAL_LABEL(Exit_Cmp_XchgNoCards)
LOCAL_LABEL(DoCardsCmpXchg1):
// if same region, just check if barrier is not concurrent
and x12, x14, #0xFFFFFFFFFFE00000 // target aligned to region
cmp x12, x16
beq LOCAL_LABEL(CheckConcurrentCmpXchg) // same region, just check if barrier is not concurrent
// we will trash x2 and x3, this is a regular call, so it is ok
// if src is in gen2/3 and the barrier is not concurrent we do not need to mark cards
and x2, x15, #0xFFFFFFFFFFE00000 // source region
ldr w12, [x2, 16]
tbz x12, #1, LOCAL_LABEL(MarkCards_Cmp_Xchg)
ldr w12, [x16, 16] // source region + 16 -> generation
tbz x12, #1, LOCAL_LABEL(MarkCardsCmpXchg)
LOCAL_LABEL(CheckConcurrentCmpXchg):
// if not concurrent, exit
cbz x17, LOCAL_LABEL(Exit_Cmp_XchgNoCards)
LOCAL_LABEL(MarkCardsCmpXchg):
// need couple temps. Save before using.
stp x2, x3, [sp, -16]!
LOCAL_LABEL(CheckConcurrent_Cmp_Xchg):
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12 // !g_write_watch_table -> !concurrent
cbz x12, LOCAL_LABEL(Exit_Cmp_Xchg)
LOCAL_LABEL(MarkCards_Cmp_Xchg):
// fetch card location for x14
PREPARE_EXTERNAL_VAR_INDIRECT g_card_table, x12 // fetch the page map
lsr x17, x14, #30
ldr x17, [x12, x17, lsl #3] // page
sub x2, x14, x17 // offset in page
lsr x15, x2, #21 // group index
lsl x15, x15, #1 // group offset (index * 2)
lsr x16, x14, #30
ldr x16, [x12, x16, lsl #3] // page
sub x2, x14, x16 // offset in page
lsr x15, x2, #20 // group index
lsr x2, x2, #9 // card offset
lsl x15, x15, #1 // group offset (index * 2)
// check if concurrent marking is in progress
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12 // !g_write_watch_table -> !concurrent
cbnz x12, LOCAL_LABEL(DirtyCard_Cmp_Xchg)
cbnz x17, LOCAL_LABEL(DirtyCardCmpXchg)
// SETTING CARD FOR X14
LOCAL_LABEL(SetCard_Cmp_Xchg):
ldrb w3, [x17, x2]
cbnz w3, LOCAL_LABEL(CardSet_Cmp_Xchg)
mov w16, #1
strb w16, [x17, x2]
LOCAL_LABEL(SetGroup_Cmp_Xchg):
add x12, x17, #0x80
LOCAL_LABEL(SetCardCmpXchg):
ldrb w3, [x16, x2]
cbnz w3, LOCAL_LABEL(ExitCmpXchg)
mov w17, #1
strb w17, [x16, x2]
LOCAL_LABEL(SetGroupCmpXchg):
add x12, x16, #0x80
ldrb w3, [x12, x15]
cbnz w3, LOCAL_LABEL(CardSet_Cmp_Xchg)
strb w16, [x12, x15]
LOCAL_LABEL(SetPage_Cmp_Xchg):
ldrb w3, [x17]
cbnz w3, LOCAL_LABEL(CardSet_Cmp_Xchg)
strb w16, [x17]
cbnz w3, LOCAL_LABEL(CardSetCmpXchg)
strb w17, [x12, x15]
LOCAL_LABEL(SetPageCmpXchg):
ldrb w3, [x16]
cbnz w3, LOCAL_LABEL(CardSetCmpXchg)
strb w17, [x16]
LOCAL_LABEL(CardSet_Cmp_Xchg):
LOCAL_LABEL(CardSetCmpXchg):
// check if concurrent marking is still not in progress
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12 // !g_write_watch_table -> !concurrent
cbnz x12, LOCAL_LABEL(DirtyCard_Cmp_Xchg)
ret lr
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12
cbnz x12, LOCAL_LABEL(DirtyCardCmpXchg)
LOCAL_LABEL(ExitCmpXchg):
ldp x2, x3, [sp], 16
ret lr
// DIRTYING CARD FOR X14
LOCAL_LABEL(DirtyCard_Cmp_Xchg):
mov w16, #4
add x2, x2, x17
LOCAL_LABEL(DirtyCardCmpXchg):
mov w17, #4
add x2, x2, x16
// must be after the field write to allow concurrent clean
stlrb w16, [x2]
LOCAL_LABEL(DirtyGroup_Cmp_Xchg):
add x12, x17, #0x80
stlrb w17, [x2]
LOCAL_LABEL(DirtyGroupCmpXchg):
add x12, x16, #0x80
ldrb w3, [x12, x15]
tbnz w3, #2, LOCAL_LABEL(Exit_Cmp_Xchg)
strb w16, [x12, x15]
LOCAL_LABEL(DirtyPage_Cmp_Xchg):
ldrb w3, [x17]
tbnz w3, #2, LOCAL_LABEL(Exit_Cmp_Xchg)
strb w16, [x17]
ret lr
tbnz w3, #2, LOCAL_LABEL(ExitCmpXchg)
strb w17, [x12, x15]
LOCAL_LABEL(DirtyPageCmpXchg):
ldrb w3, [x16]
tbnz w3, #2, LOCAL_LABEL(ExitCmpXchg)
strb w17, [x16]
b LOCAL_LABEL(ExitCmpXchg)
// this is expected to be rare.
LOCAL_LABEL(RecordEscape_Cmp_Xchg):
// 4) check if the source is escaped
and x12, x1, #0xFFFFFFFFFFE00000 // source region
add x16, x1, #8 // escape bit is MT + 1
ubfx x17, x16, #9,#12 // word index = (dst >> 9) & 0x1FFFFF
ldr x17, [x12, x17, lsl #3] // mark word = [region + index * 8]
lsr x12, x16, #3 // bit = (dst >> 3) [& 63]
add x12, x1, #8 // escape bit is MT + 1
ubfx x17, x12, #9,#12 // word index = (dst >> 9) & 0x1FFFFF
ldr x17, [x16, x17, lsl #3] // mark word = [region + index * 8]
lsr x12, x12, #3 // bit = (dst >> 3) [& 63]
lsr x17, x17, x12
tbnz x17, #0, LOCAL_LABEL(AssignAndMarkCards_Cmp_Xchg) // source is already escaped.
@ -827,8 +853,8 @@ LOCAL_LABEL(DoCardsCmpXchg):
str x2, [sp, 16 * 2]
// void SatoriRegion::EscapeFn(SatoriObject** dst, SatoriObject* src, SatoriRegion* region)
and x2, x1, #0xFFFFFFFFFFE00000 // source region
ldr x12, [x2, #8] // EscapeFn address
mov x2, x16 // source region
ldr x12, [x16, #8] // EscapeFn address
blr x12
ldp x0, x1, [sp, 16 * 1]
@ -837,6 +863,7 @@ LOCAL_LABEL(DoCardsCmpXchg):
// x10 should be not 0 to indicate that can`t skip cards.
mov x10,#1
and x16, x1, #0xFFFFFFFFFFE00000 // source region
b LOCAL_LABEL(AssignAndMarkCards_Cmp_Xchg)
LEAF_END RhpCheckedLockCmpXchg, _TEXT
@ -859,24 +886,23 @@ LEAF_END RhpCheckedLockCmpXchg, _TEXT
//
LEAF_ENTRY RhpCheckedXchg, _TEXT
PREPARE_EXTERNAL_VAR_INDIRECT g_card_bundle_table, x10
// check if dst is in heap
PREPARE_EXTERNAL_VAR_INDIRECT g_card_bundle_table, x12
add x12, x12, x0, lsr #30
add x12, x10, x0, lsr #30
ldrb w12, [x12]
cbz x12, LOCAL_LABEL(JustAssign_Xchg)
// check for escaping assignment
// 1) check if we own the source region
#ifdef FEATURE_SATORI_EXTERNAL_OBJECTS
PREPARE_EXTERNAL_VAR_INDIRECT g_card_bundle_table, x12
add x12, x12, x1, lsr #30
add x12, x10, x1, lsr #30
ldrb w12, [x12]
cbz x12, LOCAL_LABEL(JustAssign_Xchg)
#else
cbz x1, LOCAL_LABEL(JustAssign_Xchg) // assigning null
#endif
and x12, x1, #0xFFFFFFFFFFE00000 // source region
ldr x12, [x12] // region tag
and x16, x1, #0xFFFFFFFFFFE00000 // source region
ldr x12, [x16] // region tag
#ifdef TARGET_OSX
mrs x17, TPIDRRO_EL0
and x17, x17, #-8 // thread tag on OSX
@ -887,19 +913,19 @@ LEAF_ENTRY RhpCheckedXchg, _TEXT
bne LOCAL_LABEL(AssignAndMarkCards_Xchg) // not local to this thread
// 2) check if the src and dst are from the same region
eor x12, x0, x1
lsr x12, x12, #21
cbnz x12, LOCAL_LABEL(RecordEscape_Xchg) // cross region assignment. definitely escaping
and x12, x0, #0xFFFFFFFFFFE00000 // target aligned to region
cmp x12, x16
bne LOCAL_LABEL(RecordEscape_Xchg) // cross region assignment. definitely escaping
// 3) check if the target is exposed
ubfx x17, x0,#9,#12 // word index = (dst >> 9) & 0x1FFFFF
and x12, x1, #0xFFFFFFFFFFE00000 // source region
ldr x17, [x12, x17, lsl #3] // mark word = [region + index * 8]
ldr x17, [x16, x17, lsl #3] // mark word = [region + index * 8]
lsr x12, x0, #3 // bit = (dst >> 3) [& 63]
lsr x17, x17, x12
tbnz x17, #0, LOCAL_LABEL(RecordEscape_Xchg) // target is exposed. record an escape.
LOCAL_LABEL(JustAssign_Xchg):
// TODO: VS use LSE_INSTRUCTIONS_ENABLED_BY_DEFAULT instead
#ifdef TARGET_OSX
ALTERNATE_ENTRY RhpCheckedXchgAVLocationNotHeap
swpal x1, x0, [x0] // exchange
@ -930,85 +956,96 @@ ALTERNATE_ENTRY RhpCheckedXchgAVLocation2
dmb ish
#endif
eor x12, x14, x1
lsr x12, x12, #21
cbz x12, LOCAL_LABEL(CheckConcurrent_Xchg) // same region, just check if barrier is not concurrent
// TUNING: barriers in different modes could be separate pieces of code, but barrier switch
// needs to suspend EE, not sure if skipping the mode check would be worth that much.
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x17
// check the barrier state. this must be done after the assignment (in program order)
// if state == 2 we do not set or dirty cards.
tbz x17, #1, LOCAL_LABEL(DoCardsXchg)
LOCAL_LABEL(ExitNoCardsXchg):
ret lr
LOCAL_LABEL(DoCardsXchg):
// if same region, just check if barrier is not concurrent
and x12, x14, #0xFFFFFFFFFFE00000 // target aligned to region
cmp x12, x16
beq LOCAL_LABEL(CheckConcurrentXchg) // same region, just check if barrier is not concurrent
// we will trash x2 and x3, this is a regular call, so it is ok
// if src is in gen2/3 and the barrier is not concurrent we do not need to mark cards
and x2, x1, #0xFFFFFFFFFFE00000 // source region
ldr w12, [x2, 16]
tbz x12, #1, LOCAL_LABEL(MarkCards_Xchg)
ldr w12, [x16, 16] // source region + 16 -> generation
tbz x12, #1, LOCAL_LABEL(MarkCardsXchg)
LOCAL_LABEL(CheckConcurrent_Xchg):
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12 // !g_write_watch_table -> !concurrent
cbnz x12, LOCAL_LABEL(MarkCards_Xchg)
LOCAL_LABEL(Exit_Xchg):
ret lr
LOCAL_LABEL(CheckConcurrentXchg):
// if not concurrent, exit
cbz x17, LOCAL_LABEL(ExitNoCardsXchg)
LOCAL_LABEL(MarkCardsXchg):
// need couple temps. Save before using.
stp x2, x3, [sp, -16]!
LOCAL_LABEL(MarkCards_Xchg):
// fetch card location for x14
PREPARE_EXTERNAL_VAR_INDIRECT g_card_table, x12 // fetch the page map
lsr x17, x14, #30
ldr x17, [x12, x17, lsl #3] // page
sub x2, x14, x17 // offset in page
lsr x1, x2, #21 // group index
lsl x1, x1, #1 // group offset (index * 2)
lsr x16, x14, #30
ldr x16, [x12, x16, lsl #3] // page
sub x2, x14, x16 // offset in page
lsr x15, x2, #20 // group index
lsr x2, x2, #9 // card offset
lsl x15, x15, #1 // group offset (index * 2)
// check if concurrent marking is in progress
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12 // !g_write_watch_table -> !concurrent
cbnz x12, LOCAL_LABEL(DirtyCard_Xchg)
cbnz x17, LOCAL_LABEL(DirtyCardXchg)
// SETTING CARD FOR X14
LOCAL_LABEL(SetCard_Xchg):
ldrb w3, [x17, x2]
cbnz w3, LOCAL_LABEL(CardSet_Xchg)
mov w16, #1
strb w16, [x17, x2]
LOCAL_LABEL(SetGroup_Xchg):
add x12, x17, #0x80
ldrb w3, [x12, x1]
cbnz w3, LOCAL_LABEL(CardSet_Xchg)
strb w16, [x12, x1]
LOCAL_LABEL(SetPage_Xchg):
ldrb w3, [x17]
cbnz w3, LOCAL_LABEL(CardSet_Xchg)
strb w16, [x17]
LOCAL_LABEL(SetCardXchg):
ldrb w3, [x16, x2]
cbnz w3, LOCAL_LABEL(ExitXchg)
mov w17, #1
strb w17, [x16, x2]
LOCAL_LABEL(SetGroupXchg):
add x12, x16, #0x80
ldrb w3, [x12, x15]
cbnz w3, LOCAL_LABEL(CardSetXchg)
strb w17, [x12, x15]
LOCAL_LABEL(SetPageXchg):
ldrb w3, [x16]
cbnz w3, LOCAL_LABEL(CardSetXchg)
strb w17, [x16]
LOCAL_LABEL(CardSet_Xchg):
LOCAL_LABEL(CardSetXchg):
// check if concurrent marking is still not in progress
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12 // !g_write_watch_table -> !concurrent
cbnz x12, LOCAL_LABEL(DirtyCard_Xchg)
b LOCAL_LABEL(Exit_Xchg)
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12
cbnz x12, LOCAL_LABEL(DirtyCardXchg)
LOCAL_LABEL(ExitXchg):
ldp x2, x3, [sp], 16
ret lr
// DIRTYING CARD FOR X14
LOCAL_LABEL(DirtyCard_Xchg):
mov w16, #4
add x2, x2, x17
LOCAL_LABEL(DirtyCardXchg):
mov w17, #4
add x2, x2, x16
// must be after the field write to allow concurrent clean
stlrb w16, [x2]
LOCAL_LABEL(DirtyGroup_Xchg):
add x12, x17, #0x80
ldrb w3, [x12, x1]
tbnz w3, #2, LOCAL_LABEL(Exit_Xchg)
strb w16, [x12, x1]
LOCAL_LABEL(DirtyPage_Xchg):
ldrb w3, [x17]
tbnz w3, #2, LOCAL_LABEL(Exit_Xchg)
strb w16, [x17]
b LOCAL_LABEL(Exit_Xchg)
stlrb w17, [x2]
LOCAL_LABEL(DirtyGroupXchg):
add x12, x16, #0x80
ldrb w3, [x12, x15]
tbnz w3, #2, LOCAL_LABEL(ExitXchg)
strb w17, [x12, x15]
LOCAL_LABEL(DirtyPageXchg):
ldrb w3, [x16]
tbnz w3, #2, LOCAL_LABEL(ExitXchg)
strb w17, [x16]
b LOCAL_LABEL(ExitXchg)
// this is expected to be rare.
LOCAL_LABEL(RecordEscape_Xchg):
// 4) check if the source is escaped
and x12, x1, #0xFFFFFFFFFFE00000 // source region
add x16, x1, #8 // escape bit is MT + 1
ubfx x17, x16, #9,#12 // word index = (dst >> 9) & 0x1FFFFF
ldr x17, [x12, x17, lsl #3] // mark word = [region + index * 8]
lsr x12, x16, #3 // bit = (dst >> 3) [& 63]
add x12, x1, #8 // escape bit is MT + 1
ubfx x17, x12, #9,#12 // word index = (dst >> 9) & 0x1FFFFF
ldr x17, [x16, x17, lsl #3] // mark word = [region + index * 8]
lsr x12, x12, #3 // bit = (dst >> 3) [& 63]
lsr x17, x17, x12
tbnz x17, #0, LOCAL_LABEL(AssignAndMarkCards_Xchg) // source is already escaped.
@ -1017,13 +1054,14 @@ ALTERNATE_ENTRY RhpCheckedXchgAVLocation2
stp x0, x1, [sp, 16 * 1]
// void SatoriRegion::EscapeFn(SatoriObject** dst, SatoriObject* src, SatoriRegion* region)
and x2, x1, #0xFFFFFFFFFFE00000 // source region
ldr x12, [x2, #8] // EscapeFn address
mov x2, x16 // source region
ldr x12, [x16, #8] // EscapeFn address
blr x12
ldp x0, x1, [sp, 16 * 1]
ldp x29,x30, [sp], 16 * 2
and x16, x1, #0xFFFFFFFFFFE00000 // source region
b LOCAL_LABEL(AssignAndMarkCards_Xchg)
LEAF_END RhpCheckedXchg, _TEXT


@ -431,13 +431,17 @@ NoBarrierXchg
;; x15 : the object reference (RHS of the assignment).
;;
;; On exit:
;; x12, x17 : trashed
;; x14 : incremented by 8
;; x12 : trashed
;; x14 : trashed (incremented by 8 to implement JIT_ByRefWriteBarrier contract)
;; x15 : trashed
;; x16 : trashed (ip0)
;; x17 : trashed (ip1)
LEAF_ENTRY RhpCheckedAssignRefArm64, _TEXT
PREPARE_EXTERNAL_VAR_INDIRECT g_card_bundle_table, x12
add x12, x12, x14, lsr #30
ldrb w12, [x12]
cbnz x12, RhpAssignRefArm64
;; See if dst is in GCHeap
PREPARE_EXTERNAL_VAR_INDIRECT g_card_bundle_table, x16
lsr x17, x14, #30 ;; dst page index
ldrb w12, [x16, x17]
cbnz x12, CheckedEntry
NotInHeap
ALTERNATE_ENTRY RhpCheckedAssignRefAVLocation
@ -451,142 +455,153 @@ NotInHeap
;; reside on the managed heap.
;;
;; On entry:
;; x14 : the destination address (LHS of the assignment).
;; x15 : the object reference (RHS of the assignment).
;; x14 : the destination address (LHS of the assignment)
;; x15 : the object reference (RHS of the assignment)
;;
;; On exit:
;; x12, x17 : trashed
;; x14 : incremented by 8
;; x12 : trashed
;; x14 : trashed (incremented by 8 to implement JIT_ByRefWriteBarrier contract)
;; x15 : trashed
;; x16 : trashed (ip0)
;; x17 : trashed (ip1)
LEAF_ENTRY RhpAssignRefArm64, _TEXT
;; check for escaping assignment
;; 1) check if we own the source region
#ifdef FEATURE_SATORI_EXTERNAL_OBJECTS
PREPARE_EXTERNAL_VAR_INDIRECT g_card_bundle_table, x12
add x12, x12, x15, lsr #30
ldrb w12, [x12]
cbz x12, JustAssign
PREPARE_EXTERNAL_VAR_INDIRECT g_card_bundle_table, x16
ALTERNATE_ENTRY CheckedEntry
lsr x17, x15, #30 ;; source page index
ldrb w12, [x16, x17]
cbz x12, JustAssign ;; null or external (immutable) object
#else
cbz x15, JustAssign ;; assigning null
ALTERNATE_ENTRY CheckedEntry
cbz x15, JustAssign ;; assigning null
#endif
and x12, x15, #0xFFFFFFFFFFE00000 ;; source region
ldr x12, [x12] ; region tag
cmp x12, x18 ; x18 - TEB
bne AssignAndMarkCards ; not local to this thread
and x16, x15, #0xFFFFFFFFFFE00000 ;; source region
ldr x12, [x16] ;; region tag
cmp x12, x18 ;; x18 - TEB
bne AssignAndMarkCards ;; not local to this thread
;; 2) check if the src and dst are from the same region
eor x12, x14, x15
lsr x12, x12, #21
cbnz x12, RecordEscape ;; cross region assignment. definitely escaping
and x12, x14, #0xFFFFFFFFFFE00000 ;; target aligned to region
cmp x12, x16
bne RecordEscape ;; cross region assignment. definitely escaping
;; 3) check if the target is exposed
ubfx x17, x14,#9,#12 ;; word index = (dst >> 9) & 0x1FFFFF
and x12, x15, #0xFFFFFFFFFFE00000 ;; source region
ldr x17, [x12, x17, lsl #3] ;; mark word = [region + index * 8]
ldr x17, [x16, x17, lsl #3] ;; mark word = [region + index * 8]
lsr x12, x14, #3 ;; bit = (dst >> 3) [& 63]
lsr x17, x17, x12
tbnz x17, #0, RecordEscape ;; target is exposed. record an escape.
str x15, [x14], #8 ;; UNORDERED assignment of unescaped object
ret lr
tbnz x17, #0, RecordEscape ;; target is exposed. record an escape.
;; UNORDERED! assignment of unescaped, null or external (immutable) object
JustAssign
ALTERNATE_ENTRY RhpAssignRefAVLocationNotHeap
stlr x15, [x14] ;; no card marking, src is not a heap object
add x14, x14, 8
ret lr
str x15, [x14], #8
ret lr
AssignAndMarkCards
ALTERNATE_ENTRY RhpAssignRefAVLocation
stlr x15, [x14]
;; need couple temps. Save before using.
stp x2, x3, [sp, -16]!
; TUNING: barriers in different modes could be separate pieces of code, but barrier switch
; needs to suspend EE, not sure if skipping the mode check would be worth that much.
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x17
; check the barrier state. this must be done after the assignment (in program order)
; if state == 2 we do not set or dirty cards.
tbz x17, #1, DoCards
eor x12, x14, x15
lsr x12, x12, #21
cbz x12, CheckConcurrent ;; same region, just check if barrier is not concurrent
ExitNoCards
add x14, x14, 8
ret lr
;; if src is in gen2/3 and the barrier is not concurrent we do not need to mark cards
and x2, x15, #0xFFFFFFFFFFE00000 ;; source region
ldr w12, [x2, 16]
DoCards
; if same region, just check if barrier is not concurrent
and x12, x14, #0xFFFFFFFFFFE00000 ; target aligned to region
cmp x12, x16
beq CheckConcurrent ; same region, just check if barrier is not concurrent
; if src is in gen2/3 and the barrier is not concurrent we do not need to mark cards
ldr w12, [x16, 16] ; source region + 16 -> generation
tbz x12, #1, MarkCards
CheckConcurrent
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12 ;; !g_write_watch_table -> !concurrent
cbnz x12, MarkCards
; if not concurrent, exit
cbz x17, ExitNoCards
MarkCards
; need couple temps. Save before using.
stp x2, x3, [sp, -16]!
; fetch card location for x14
PREPARE_EXTERNAL_VAR_INDIRECT g_card_table, x12 ; fetch the page map
lsr x16, x14, #30
ldr x16, [x12, x16, lsl #3] ; page
sub x2, x14, x16 ; offset in page
lsr x15, x2, #20 ; group index
lsr x2, x2, #9 ; card offset
lsl x15, x15, #1 ; group offset (index * 2)
; check if concurrent marking is in progress
cbnz x17, DirtyCard
; SETTING CARD FOR X14
SetCard
ldrb w3, [x16, x2]
cbnz w3, Exit
mov w17, #1
strb w17, [x16, x2]
SetGroup
add x12, x16, #0x80
ldrb w3, [x12, x15]
cbnz w3, CardSet
strb w17, [x12, x15]
SetPage
ldrb w3, [x16]
cbnz w3, CardSet
strb w17, [x16]
CardSet
; check if concurrent marking is still not in progress
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12
cbnz x12, DirtyCard
Exit
ldp x2, x3, [sp], 16
add x14, x14, 8
ret lr
MarkCards
;; fetch card location for x14
PREPARE_EXTERNAL_VAR_INDIRECT g_card_table, x12 ;; fetch the page map
lsr x17, x14, #30
ldr x17, [x12, x17, lsl #3] ;; page
sub x2, x14, x17 ;; offset in page
lsr x15, x2, #21 ;; group index
lsl x15, x15, #1 ;; group offset (index * 2)
lsr x2, x2, #9 ;; card offset
;; check if concurrent marking is in progress
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12 ;; !g_write_watch_table -> !concurrent
cbnz x12, DirtyCard
;; SETTING CARD FOR X14
SetCard
ldrb w3, [x17, x2]
cbnz w3, CardSet
mov w16, #1
strb w16, [x17, x2]
SetGroup
add x12, x17, #0x80
ldrb w3, [x12, x15]
cbnz w3, CardSet
strb w16, [x12, x15]
SetPage
ldrb w3, [x17]
cbnz w3, CardSet
strb w16, [x17]
CardSet
;; check if concurrent marking is still not in progress
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12 ;; !g_write_watch_table -> !concurrent
cbnz x12, DirtyCard
b Exit
;; DIRTYING CARD FOR X14
; DIRTYING CARD FOR X14
DirtyCard
mov w16, #4
add x2, x2, x17
;; must be after the field write to allow concurrent clean
stlrb w16, [x2]
mov w17, #4
add x2, x2, x16
; must be after the field write to allow concurrent clean
stlrb w17, [x2]
DirtyGroup
add x12, x17, #0x80
add x12, x16, #0x80
ldrb w3, [x12, x15]
tbnz w3, #2, Exit
strb w16, [x12, x15]
strb w17, [x12, x15]
DirtyPage
ldrb w3, [x17]
ldrb w3, [x16]
tbnz w3, #2, Exit
strb w16, [x17]
strb w17, [x16]
b Exit
;; this is expected to be rare.
RecordEscape
;; 4) check if the source is escaped
and x12, x15, #0xFFFFFFFFFFE00000 ;; source region
add x16, x15, #8 ;; escape bit is MT + 1
ubfx x17, x16, #9,#12 ;; word index = (dst >> 9) & 0x1FFFFF
ldr x17, [x12, x17, lsl #3] ;; mark word = [region + index * 8]
lsr x12, x16, #3 ;; bit = (dst >> 3) [& 63]
;; 4) check if the source is escaped (x16 has source region)
add x12, x15, #8 ;; escape bit is MT + 1
ubfx x17, x12, #9,#12 ;; word index = (dst >> 9) & 0x1FFFFF
ldr x17, [x16, x17, lsl #3] ;; mark word = [region + index * 8]
lsr x12, x12, #3 ;; bit = (dst >> 3) [& 63]
lsr x17, x17, x12
tbnz x17, #0, AssignAndMarkCards ;; source is already escaped.
;; because of the barrier call convention
;; we need to preserve caller-saved x0 through x18 and x29/x30
;; we need to preserve caller-saved x0 through x15 and x29/x30
stp x29,x30, [sp, -16 * 9]!
stp x0, x1, [sp, 16 * 1]
@ -601,8 +616,8 @@ RecordEscape
;; void SatoriRegion::EscapeFn(SatoriObject** dst, SatoriObject* src, SatoriRegion* region)
;; mov x0, x14 EscapeFn does not use dst, it is just to avoid arg shuffle on x64
mov x1, x15
and x2, x15, #0xFFFFFFFFFFE00000 ;; source region
ldr x12, [x2, #8] ;; EscapeFn address
mov x2, x16 ;; source region
ldr x12, [x16, #8] ;; EscapeFn address
blr x12
ldp x0, x1, [sp, 16 * 1]
@ -615,6 +630,7 @@ RecordEscape
ldp x14,x15, [sp, 16 * 8]
ldp x29,x30, [sp], 16 * 9
and x16, x15, #0xFFFFFFFFFFE00000 ;; source region
b AssignAndMarkCards
LEAF_END RhpAssignRefArm64
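A recurring change in these barriers is how "src and dst are in the same region" is decided. The two forms below are equivalent; the new one compares 2 MB-aligned bases, which lets the code reuse the source region base it already holds in x16 instead of recomputing it later. A tiny sketch, with an illustrative name:

    #include <cstdint>

    static bool SameRegionSketch(uintptr_t dst, uintptr_t src)
    {
        // old form: ((dst ^ src) >> 21) == 0
        // new form: compare the 2 MB-aligned region bases
        return (dst & ~(uintptr_t)0x1FFFFF) == (src & ~(uintptr_t)0x1FFFFF);
    }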
@ -655,20 +671,20 @@ RecordEscape
#else
cbz x1, JustAssign_Cmp_Xchg ;; assigning null
#endif
and x12, x1, #0xFFFFFFFFFFE00000 ; source region
ldr x12, [x12] ; region tag
cmp x12, x18 ; x18 - TEB
bne AssignAndMarkCards_Cmp_Xchg ; not local to this thread
and x16, x1, #0xFFFFFFFFFFE00000 ;; source region
ldr x12, [x16] ;; region tag
cmp x12, x18 ;; x18 - TEB
bne AssignAndMarkCards_Cmp_Xchg ;; not local to this thread
;; 2) check if the src and dst are from the same region
eor x12, x0, x1
lsr x12, x12, #21
cbnz x12, RecordEscape_Cmp_Xchg ;; cross region assignment. definitely escaping
and x12, x0, #0xFFFFFFFFFFE00000 ;; target aligned to region
cmp x12, x16
bne RecordEscape_Cmp_Xchg ;; cross region assignment. definitely escaping
;; 3) check if the target is exposed
ubfx x17, x0,#9,#12 ;; word index = (dst >> 9) & 0x1FFFFF
and x12, x1, #0xFFFFFFFFFFE00000 ;; source region
ldr x17, [x12, x17, lsl #3] ;; mark word = [region + index * 8]
ldr x17, [x16, x17, lsl #3] ;; mark word = [region + index * 8]
lsr x12, x0, #3 ;; bit = (dst >> 3) [& 63]
lsr x17, x17, x12
tbnz x17, #0, RecordEscape_Cmp_Xchg ;; target is exposed. record an escape.
@ -683,8 +699,8 @@ AssignAndMarkCards_Cmp_Xchg
mov x15, x1 ;; x15 = val
#ifndef LSE_INSTRUCTIONS_ENABLED_BY_DEFAULT
PREPARE_EXTERNAL_VAR_INDIRECT_W g_cpuFeatures, 16
tbz w16, #ARM64_ATOMICS_FEATURE_FLAG_BIT, TryAgain1_Cmp_Xchg
PREPARE_EXTERNAL_VAR_INDIRECT_W g_cpuFeatures, 17
tbz w17, #ARM64_ATOMICS_FEATURE_FLAG_BIT, TryAgain1_Cmp_Xchg
#endif
mov x17, x2
@ -692,7 +708,7 @@ AssignAndMarkCards_Cmp_Xchg
casal x2, x1, [x0] ;; exchange
mov x0, x2 ;; x0 = result
cmp x2, x17
bne Exit_Cmp_Xchg
bne Exit_Cmp_XchgNoCards
#ifndef LSE_INSTRUCTIONS_ENABLED_BY_DEFAULT
b SkipLLScCmpXchg
@ -715,87 +731,98 @@ SkipLLScCmpXchg
#endif
cbnz x10, DoCardsCmpXchg
Exit_Cmp_Xchg
Exit_Cmp_XchgNoCards
ret lr
DoCardsCmpXchg
eor x12, x14, x15
lsr x12, x12, #21
cbz x12, CheckConcurrent_Cmp_Xchg ;; same region, just check if barrier is not concurrent
; TUNING: barriers in different modes could be separate pieces of code, but barrier switch
; needs to suspend EE, not sure if skipping the mode check would be worth that much.
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x17
;; we will trash x2 and x3, this is a regular call, so it is ok
;; if src is in gen2/3 and the barrier is not concurrent we do not need to mark cards
and x2, x15, #0xFFFFFFFFFFE00000 ;; source region
ldr w12, [x2, 16]
tbz x12, #1, MarkCards_Cmp_Xchg
; check the barrier state. this must be done after the assignment (in program order)
; if state == 2 we do not set or dirty cards.
tbnz x17, #1, Exit_Cmp_XchgNoCards
CheckConcurrent_Cmp_Xchg
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12 ;; !g_write_watch_table -> !concurrent
cbz x12, Exit_Cmp_Xchg
MarkCards_Cmp_Xchg
;; fetch card location for x14
PREPARE_EXTERNAL_VAR_INDIRECT g_card_table, x12 ;; fetch the page map
lsr x17, x14, #30
ldr x17, [x12, x17, lsl #3] ;; page
sub x2, x14, x17 ;; offset in page
lsr x15, x2, #21 ;; group index
lsl x15, x15, #1 ;; group offset (index * 2)
lsr x2, x2, #9 ;; card offset
; if same region, just check if barrier is not concurrent
and x12, x14, #0xFFFFFFFFFFE00000 ; target aligned to region
cmp x12, x16
beq CheckConcurrentCmpXchg ; same region, just check if barrier is not concurrent
;; check if concurrent marking is in progress
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12 ;; !g_write_watch_table -> !concurrent
cbnz x12, DirtyCard_Cmp_Xchg
; if src is in gen2/3 and the barrier is not concurrent we do not need to mark cards
ldr w12, [x16, 16] ; source region + 16 -> generation
tbz x12, #1, MarkCardsCmpXchg
;; SETTING CARD FOR X14
SetCard_Cmp_Xchg
ldrb w3, [x17, x2]
cbnz w3, CardSet_Cmp_Xchg
mov w16, #1
strb w16, [x17, x2]
SetGroup_Cmp_Xchg
add x12, x17, #0x80
CheckConcurrentCmpXchg
; if not concurrent, exit
cbz x17, Exit_Cmp_XchgNoCards
MarkCardsCmpXchg
; need couple temps. Save before using.
stp x2, x3, [sp, -16]!
; fetch card location for x14
PREPARE_EXTERNAL_VAR_INDIRECT g_card_table, x12 ; fetch the page map
lsr x16, x14, #30
ldr x16, [x12, x16, lsl #3] ; page
sub x2, x14, x16 ; offset in page
lsr x15, x2, #20 ; group index
lsr x2, x2, #9 ; card offset
lsl x15, x15, #1 ; group offset (index * 2)
; check if concurrent marking is in progress
cbnz x17, DirtyCardCmpXchg
; SETTING CARD FOR X14
SetCardCmpXchg
ldrb w3, [x16, x2]
cbnz w3, ExitCmpXchg
mov w17, #1
strb w17, [x16, x2]
SetGroupCmpXchg
add x12, x16, #0x80
ldrb w3, [x12, x15]
cbnz w3, CardSet_Cmp_Xchg
strb w16, [x12, x15]
SetPage_Cmp_Xchg
ldrb w3, [x17]
cbnz w3, CardSet_Cmp_Xchg
strb w16, [x17]
cbnz w3, CardSetCmpXchg
strb w17, [x12, x15]
SetPageCmpXchg
ldrb w3, [x16]
cbnz w3, CardSetCmpXchg
strb w17, [x16]
CardSet_Cmp_Xchg
;; check if concurrent marking is still not in progress
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12 ;; !g_write_watch_table -> !concurrent
cbnz x12, DirtyCard_Cmp_Xchg
ret lr
CardSetCmpXchg
; check if concurrent marking is still not in progress
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12
cbnz x12, DirtyCardCmpXchg
;; DIRTYING CARD FOR X14
DirtyCard_Cmp_Xchg
mov w16, #4
add x2, x2, x17
;; must be after the field write to allow concurrent clean
stlrb w16, [x2]
DirtyGroup_Cmp_Xchg
add x12, x17, #0x80
ExitCmpXchg
ldp x2, x3, [sp], 16
ret lr
; DIRTYING CARD FOR X14
DirtyCardCmpXchg
mov w17, #4
add x2, x2, x16
; must be after the field write to allow concurrent clean
stlrb w17, [x2]
DirtyGroupCmpXchg
add x12, x16, #0x80
ldrb w3, [x12, x15]
tbnz w3, #2, Exit_Cmp_Xchg
strb w16, [x12, x15]
DirtyPage_Cmp_Xchg
ldrb w3, [x17]
tbnz w3, #2, Exit_Cmp_Xchg
strb w16, [x17]
ret lr
tbnz w3, #2, ExitCmpXchg
strb w17, [x12, x15]
DirtyPageCmpXchg
ldrb w3, [x16]
tbnz w3, #2, ExitCmpXchg
strb w17, [x16]
b ExitCmpXchg
;; this is expected to be rare.
RecordEscape_Cmp_Xchg
;; 4) check if the source is escaped
and x12, x1, #0xFFFFFFFFFFE00000 ;; source region
add x16, x1, #8 ;; escape bit is MT + 1
ubfx x17, x16, #9,#12 ;; word index = (dst >> 9) & 0x1FFFFF
ldr x17, [x12, x17, lsl #3] ;; mark word = [region + index * 8]
lsr x12, x16, #3 ;; bit = (dst >> 3) [& 63]
add x12, x1, #8 ;; escape bit is MT + 1
ubfx x17, x12, #9,#12 ;; word index = (dst >> 9) & 0x1FFFFF
ldr x17, [x16, x17, lsl #3] ;; mark word = [region + index * 8]
lsr x12, x12, #3 ;; bit = (dst >> 3) [& 63]
lsr x17, x17, x12
tbnz x17, #0, AssignAndMarkCards_Cmp_Xchg ;; source is already escaped.
@ -806,8 +833,8 @@ RecordEscape_Cmp_Xchg
str x2, [sp, 16 * 2]
;; void SatoriRegion::EscapeFn(SatoriObject** dst, SatoriObject* src, SatoriRegion* region)
and x2, x1, #0xFFFFFFFFFFE00000 ;; source region
ldr x12, [x2, #8] ;; EscapeFn address
mov x2, x16 ;; source region
ldr x12, [x16, #8] ;; EscapeFn address
blr x12
ldp x0, x1, [sp, 16 * 1]
@ -816,6 +843,7 @@ RecordEscape_Cmp_Xchg
;; x10 should be not 0 to indicate that can`t skip cards.
mov x10,#1
and x16, x1, #0xFFFFFFFFFFE00000 ;; source region
b AssignAndMarkCards_Cmp_Xchg
LEAF_END RhpCheckedLockCmpXchg
@ -838,43 +866,43 @@ RecordEscape_Cmp_Xchg
;;
LEAF_ENTRY RhpCheckedXchg, _TEXT
PREPARE_EXTERNAL_VAR_INDIRECT g_card_bundle_table, x10
;; check if dst is in heap
PREPARE_EXTERNAL_VAR_INDIRECT g_card_bundle_table, x12
add x12, x12, x0, lsr #30
add x12, x10, x0, lsr #30
ldrb w12, [x12]
cbz x12, JustAssign_Xchg
;; check for escaping assignment
;; 1) check if we own the source region
#ifdef FEATURE_SATORI_EXTERNAL_OBJECTS
PREPARE_EXTERNAL_VAR_INDIRECT g_card_bundle_table, x12
add x12, x12, x1, lsr #30
add x12, x10, x1, lsr #30
ldrb w12, [x12]
cbz x12, JustAssign_Xchg
#else
cbz x1, JustAssign_Xchg ;; assigning null
#endif
and x12, x1, #0xFFFFFFFFFFE00000 ; source region
ldr x12, [x12] ; region tag
cmp x12, x18 ; x18 - TEB
bne AssignAndMarkCards_Xchg ; not local to this thread
and x16, x1, #0xFFFFFFFFFFE00000 ;; source region
ldr x12, [x16] ;; region tag
cmp x12, x18 ;; x18 - TEB
bne AssignAndMarkCards_Xchg ;; not local to this thread
;; 2) check if the src and dst are from the same region
eor x12, x0, x1
lsr x12, x12, #21
cbnz x12, RecordEscape_Xchg ;; cross region assignment. definitely escaping
and x12, x0, #0xFFFFFFFFFFE00000 ;; target aligned to region
cmp x12, x16
bne RecordEscape_Xchg ;; cross region assignment. definitely escaping
;; 3) check if the target is exposed
ubfx x17, x0,#9,#12 ;; word index = (dst >> 9) & 0x1FFFFF
and x12, x1, #0xFFFFFFFFFFE00000 ;; source region
ldr x17, [x12, x17, lsl #3] ;; mark word = [region + index * 8]
lsr x12, x0, #3 ;; bit = (dst >> 3) [& 63]
ubfx x17, x0,#9,#12 ;; word index = (dst >> 9) & 0x1FFFFF
ldr x17, [x16, x17, lsl #3] ;; mark word = [region + index * 8]
lsr x12, x0, #3 ;; bit = (dst >> 3) [& 63]
lsr x17, x17, x12
tbnz x17, #0, RecordEscape_Xchg ;; target is exposed. record an escape.
JustAssign_Xchg
TryAgain_Xchg
ALTERNATE_ENTRY RhpCheckedXchgAVLocationNotHeap
;; TODO: VS use LSE_INSTRUCTIONS_ENABLED_BY_DEFAULT instead
ldaxr x17, [x0]
stlxr w12, x1, [x0]
cbnz w12, TryAgain_Xchg
@ -893,85 +921,97 @@ TryAgain1_Xchg
mov x0, x17
dmb ish
eor x12, x14, x1
lsr x12, x12, #21
cbz x12, CheckConcurrent_Xchg ;; same region, just check if barrier is not concurrent
; TUNING: barriers in different modes could be separate pieces of code, but barrier switch
; needs to suspend EE, not sure if skipping the mode check would be worth that much.
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x17
;; we will trash x2 and x3, this is a regular call, so it is ok
;; if src is in gen2/3 and the barrier is not concurrent we do not need to mark cards
and x2, x1, #0xFFFFFFFFFFE00000 ;; source region
ldr w12, [x2, 16]
tbz x12, #1, MarkCards_Xchg
; check the barrier state. this must be done after the assignment (in program order)
; if state == 2 we do not set or dirty cards.
tbz x17, #1, DoCardsXchg
CheckConcurrent_Xchg
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12 ;; !g_write_watch_table -> !concurrent
cbnz x12, MarkCards_Xchg
Exit_Xchg
ExitNoCardsXchg
ret lr
DoCardsXchg
; if same region, just check if barrier is not concurrent
and x12, x14, #0xFFFFFFFFFFE00000 ; target aligned to region
cmp x12, x16
beq CheckConcurrentXchg ; same region, just check if barrier is not concurrent
; if src is in gen2/3 and the barrier is not concurrent we do not need to mark cards
ldr w12, [x16, 16] ; source region + 16 -> generation
tbz x12, #1, MarkCardsXchg
CheckConcurrentXchg
; if not concurrent, exit
cbz x17, ExitNoCardsXchg
MarkCardsXchg
; need couple temps. Save before using.
stp x2, x3, [sp, -16]!
; fetch card location for x14
PREPARE_EXTERNAL_VAR_INDIRECT g_card_table, x12 ; fetch the page map
lsr x16, x14, #30
ldr x16, [x12, x16, lsl #3] ; page
sub x2, x14, x16 ; offset in page
lsr x15, x2, #20 ; group index
lsr x2, x2, #9 ; card offset
lsl x15, x15, #1 ; group offset (index * 2)
; check if concurrent marking is in progress
cbnz x17, DirtyCardXchg
; SETTING CARD FOR X14
SetCardXchg
ldrb w3, [x16, x2]
cbnz w3, ExitXchg
mov w17, #1
strb w17, [x16, x2]
SetGroupXchg
add x12, x16, #0x80
ldrb w3, [x12, x15]
cbnz w3, CardSetXchg
strb w17, [x12, x15]
SetPageXchg
ldrb w3, [x16]
cbnz w3, CardSetXchg
strb w17, [x16]
CardSetXchg
; check if concurrent marking is still not in progress
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12
cbnz x12, DirtyCardXchg
ExitXchg
ldp x2, x3, [sp], 16
ret lr
MarkCards_Xchg
;; fetch card location for x14
PREPARE_EXTERNAL_VAR_INDIRECT g_card_table, x12 ;; fetch the page map
lsr x17, x14, #30
ldr x17, [x12, x17, lsl #3] ;; page
sub x2, x14, x17 ;; offset in page
lsr x1, x2, #21 ;; group index
lsl x1, x1, #1 ;; group offset (index * 2)
lsr x2, x2, #9 ;; card offset
;; check if concurrent marking is in progress
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12 ;; !g_write_watch_table -> !concurrent
cbnz x12, DirtyCard_Xchg
;; SETTING CARD FOR X14
SetCard_Xchg
ldrb w3, [x17, x2]
cbnz w3, CardSet_Xchg
mov w16, #1
strb w16, [x17, x2]
SetGroup_Xchg
add x12, x17, #0x80
ldrb w3, [x12, x1]
cbnz w3, CardSet_Xchg
strb w16, [x12, x1]
SetPage_Xchg
ldrb w3, [x17]
cbnz w3, CardSet_Xchg
strb w16, [x17]
CardSet_Xchg
;; check if concurrent marking is still not in progress
PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x12 ;; !g_write_watch_table -> !concurrent
cbnz x12, DirtyCard_Xchg
b Exit_Xchg
;; DIRTYING CARD FOR X14
DirtyCard_Xchg
mov w16, #4
add x2, x2, x17
;; must be after the field write to allow concurrent clean
stlrb w16, [x2]
DirtyGroup_Xchg
add x12, x17, #0x80
ldrb w3, [x12, x1]
tbnz w3, #2, Exit_Xchg
strb w16, [x12, x1]
DirtyPage_Xchg
ldrb w3, [x17]
tbnz w3, #2, Exit_Xchg
strb w16, [x17]
b Exit_Xchg
; DIRTYING CARD FOR X14
DirtyCardXchg
mov w17, #4
add x2, x2, x16
; must be after the field write to allow concurrent clean
stlrb w17, [x2]
DirtyGroupXchg
add x12, x16, #0x80
ldrb w3, [x12, x15]
tbnz w3, #2, ExitXchg
strb w17, [x12, x15]
DirtyPageXchg
ldrb w3, [x16]
tbnz w3, #2, ExitXchg
strb w17, [x16]
b ExitXchg
;; this is expected to be rare.
RecordEscape_Xchg
;; 4) check if the source is escaped
and x12, x1, #0xFFFFFFFFFFE00000 ;; source region
add x16, x1, #8 ;; escape bit is MT + 1
ubfx x17, x16, #9,#12 ;; word index = (dst >> 9) & 0x1FFFFF
ldr x17, [x12, x17, lsl #3] ;; mark word = [region + index * 8]
lsr x12, x16, #3 ;; bit = (dst >> 3) [& 63]
add x12, x1, #8 ;; escape bit is MT + 1
ubfx x17, x12, #9,#12 ;; word index = (dst >> 9) & 0x1FFFFF
ldr x17, [x16, x17, lsl #3] ;; mark word = [region + index * 8]
lsr x12, x12, #3 ;; bit = (dst >> 3) [& 63]
lsr x17, x17, x12
tbnz x17, #0, AssignAndMarkCards_Xchg ;; source is already escaped.
@ -980,13 +1020,14 @@ RecordEscape_Xchg
stp x0, x1, [sp, 16 * 1]
;; void SatoriRegion::EscapeFn(SatoriObject** dst, SatoriObject* src, SatoriRegion* region)
and x2, x1, #0xFFFFFFFFFFE00000 ;; source region
ldr x12, [x2, #8] ;; EscapeFn address
mov x2, x16 ;; source region
ldr x12, [x16, #8] ;; EscapeFn address
blr x12
ldp x0, x1, [sp, 16 * 1]
ldp x29,x30, [sp], 16 * 2
;; and x16, x1, #0xFFFFFFFFFFE00000 ;; source region
b AssignAndMarkCards_Xchg
LEAF_END RhpCheckedXchg


@ -468,7 +468,7 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
#if FEATURE_SATORI_GC
case WriteBarrierOp::StartConcurrentMarkingSatori:
g_write_watch_table = (uint8_t*)1;
g_write_watch_table = args->write_watch_table;
g_sw_ww_enabled_for_gc_heap = true;
if (!is_runtime_suspended)
{
@ -481,7 +481,7 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
case WriteBarrierOp::StopConcurrentMarkingSatori:
assert(args->is_runtime_suspended && "the runtime must be suspended here!");
g_write_watch_table = (uint8_t*)0;
g_write_watch_table = args->write_watch_table;
g_sw_ww_enabled_for_gc_heap = false;
return;
#endif
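The change above stores the real write-watch table pointer instead of a 0/1 flag. A minimal sketch of how the barrier reads that state, based only on the zero/non-zero test (cbnz) visible in the asm; the exact set of legal values is an assumption:

#include <cstdint>

static bool ConcurrentMarkingActiveSketch(const uint8_t* writeWatchTable)
{
    // any non-null value reads as "concurrent marking in progress -
    // dirty cards instead of merely setting them"
    return writeWatchTable != nullptr;
}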

View file

@ -501,6 +501,8 @@ set(GC_SOURCES_WKS
../gc/satori/SatoriAllocationContext.cpp
../gc/satori/SatoriUtil.cpp
../gc/satori/SatoriLock.cpp
../gc/satori/SatoriWorkList.cpp
../gc/satori/SatoriGate.cpp
)
set(GC_HEADERS_WKS
@ -528,6 +530,7 @@ set(GC_HEADERS_WKS
../gc/satori/SatoriAllocationContext.h
../gc/satori/SatoriUtil.h
../gc/satori/SatoriLock.h
../gc/satori/SatoriGate.h
)
if(FEATURE_EVENT_TRACE)

View file

@ -334,13 +334,12 @@ LEAF_END JIT_PatchedCodeStart, _TEXT
; void JIT_CheckedWriteBarrier(Object** dst, Object* src)
LEAF_ENTRY JIT_CheckedWriteBarrier, _TEXT
; See if this is in GCHeap
mov rax, rcx
shr rax, 30 ; round to page size ( >> PAGE_BITS )
add rax, [g_card_bundle_table] ; fetch the page byte map
cmp byte ptr [rax], 0
jne JIT_WriteBarrier
; See if dst is in GCHeap
mov rax, [g_card_bundle_table] ; fetch the page byte map
mov r8, rcx
shr r8, 30 ; dst page index
cmp byte ptr [rax + r8], 0
jne CheckedEntry
NotInHeap:
; See comment above about possible AV
@ -348,23 +347,32 @@ LEAF_ENTRY JIT_CheckedWriteBarrier, _TEXT
ret
LEAF_END_MARKED JIT_CheckedWriteBarrier, _TEXT
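A small C++ sketch of the heap check the rewritten prologue performs, assuming one byte per 1GB page in the page byte map (PAGE_BITS == 30 per the asm comments); the parameter names are stand-ins for the globals used above.

#include <cstddef>
#include <cstdint>

static bool IsInGCHeapSketch(const uint8_t* pageByteMap, const void* dst)
{
    size_t pageIndex = (uintptr_t)dst >> 30;   // dst page index, pages are 2^30 bytes
    return pageByteMap[pageIndex] != 0;        // non-zero byte: the page belongs to the GC heap
}
// when this is false the checked barrier just stores and returns;
// otherwise it falls into CheckedEntry of JIT_WriteBarrier.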
ALTERNATE_ENTRY macro Name
Name label proc
PUBLIC Name
endm
;
; rcx - dest address
; rdx - object
;
LEAF_ENTRY JIT_WriteBarrier, _TEXT
align 16
; check for escaping assignment
; 1) check if we own the source region
ifdef FEATURE_SATORI_EXTERNAL_OBJECTS
mov rax, rdx
shr rax, 30 ; round to page size ( >> PAGE_BITS )
add rax, [g_card_bundle_table] ; fetch the page byte map
cmp byte ptr [rax], 0
je JustAssign ; src not in heap
; check if src is in heap
mov rax, [g_card_bundle_table] ; fetch the page byte map
ALTERNATE_ENTRY CheckedEntry
mov r8, rdx
shr r8, 30 ; src page index
cmp byte ptr [rax + r8], 0
je JustAssign ; src not in heap
else
ALTERNATE_ENTRY CheckedEntry
endif
; check for escaping assignment
; 1) check if we own the source region
mov r8, rdx
and r8, 0FFFFFFFFFFE00000h ; source region
@ -396,21 +404,30 @@ endif
AssignAndMarkCards:
mov [rcx], rdx
; TUNING: barriers in different modes could be separate pieces of code, but a barrier switch
; needs to suspend the EE; not sure if skipping the mode check would be worth that much.
mov r11, qword ptr [g_sw_ww_table]
; check the barrier state. this must be done after the assignment (in program order)
; if state == 2 we do not set or dirty cards.
cmp r11, 2h
jne DoCards
Exit:
ret
DoCards:
; if same region, just check if barrier is not concurrent
xor rdx, rcx
shr rdx, 21
jz CheckConcurrent ; same region, just check if barrier is not concurrent
; TUNING: nonconcurrent and concurrent barriers could be separate pieces of code, but switching
; needs to suspend the EE; not sure if skipping the concurrent check would be worth that much.
jz CheckConcurrent
; if src is in gen2/3 and the barrier is not concurrent we do not need to mark cards
cmp dword ptr [r8 + 16], 2
jl MarkCards
CheckConcurrent:
cmp byte ptr [g_sw_ww_enabled_for_gc_heap], 0h
jne MarkCards
ret
cmp r11, 0h
je Exit
MarkCards:
; fetch card location for rcx
@ -421,21 +438,22 @@ endif
sub r8, rax ; offset in page
mov rdx,r8
shr r8, 9 ; card offset
shr rdx, 21 ; group offset
shr rdx, 20 ; group index
lea rdx, [rax + rdx * 2 + 80h] ; group offset
; check if concurrent marking is in progress
cmp byte ptr [g_sw_ww_enabled_for_gc_heap], 0h
cmp r11, 0h
jne DirtyCard
; SETTING CARD FOR RCX
SetCard:
cmp byte ptr [rax + r8], 0
jne CardSet
jne Exit
mov byte ptr [rax + r8], 1
SetGroup:
cmp byte ptr [rax + rdx * 2 + 80h], 0
cmp byte ptr [rdx], 0
jne CardSet
mov byte ptr [rax + rdx * 2 + 80h], 1
mov byte ptr [rdx], 1
SetPage:
cmp byte ptr [rax], 0
jne CardSet
@ -443,7 +461,7 @@ endif
CardSet:
; check if concurrent marking is still not in progress
cmp byte ptr [g_sw_ww_enabled_for_gc_heap], 0h
cmp qword ptr [g_sw_ww_table], 0h
jne DirtyCard
ret
@ -451,15 +469,13 @@ endif
DirtyCard:
mov byte ptr [rax + r8], 4
DirtyGroup:
cmp byte ptr [rax + rdx * 2 + 80h], 4
cmp byte ptr [rdx], 4
je Exit
mov byte ptr [rax + rdx * 2 + 80h], 4
mov byte ptr [rdx], 4
DirtyPage:
cmp byte ptr [rax], 4
je Exit
mov byte ptr [rax], 4
Exit:
ret
; this is expected to be rare.
@ -467,12 +483,18 @@ endif
; 4) check if the source is escaped
mov rax, rdx
add rax, 8 ; escape bit is MT + 1
and rax, 01FFFFFh
shr rax, 3
bt qword ptr [r8], rax
jb AssignAndMarkCards ; source is already escaped.
; save rcx, rdx, r8 and have enough stack for the callee
; Align rsp
mov r9, rsp
and rsp, -16
; save rsp, rcx, rdx, r8 and have enough stack for the callee
push r9
push rcx
push rdx
push r8
@ -485,6 +507,7 @@ endif
pop r8
pop rdx
pop rcx
pop rsp
jmp AssignAndMarkCards
LEAF_END_MARKED JIT_WriteBarrier, _TEXT
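Putting the pieces of the rewritten barrier together, here is a C++ sketch of its decision flow. The constants (state 2, the >>21 region size, >>9 card granularity, card groups at page + 0x80) come from the asm; the parameter names, the generation argument, and the omission of the CardSet re-read of g_sw_ww_table are simplifications for illustration.

#include <cstdint>

static void WriteBarrierSketch(void** dst, void* src,
                               uintptr_t barrierState,  // value of g_sw_ww_table
                               uint8_t* page,           // card page that contains dst
                               int srcRegionGen)        // generation of the source region
{
    *dst = src;                                          // the assignment always happens first

    if (barrierState == 2)                               // state 2: no card work at all
        return;

    bool sameRegion = ((((uintptr_t)dst) ^ (uintptr_t)src) >> 21) == 0;
    bool concurrent = barrierState != 0;
    if ((sameRegion || srcRegionGen >= 2) && !concurrent)
        return;                                          // nothing to record

    uintptr_t offset = (uintptr_t)dst - (uintptr_t)page;
    uint8_t* card    = page + (offset >> 9);             // one card byte per 512 bytes
    uint8_t* group   = page + 0x80 + (offset >> 20) * 2; // card-group bytes start at page + 0x80

    if (!concurrent)
    {
        // SET path: 1 means "remembered"; each level stops early if already marked
        if (*card != 0) return;
        *card = 1;
        if (*group == 0)
        {
            *group = 1;
            if (*page == 0)
                *page = 1;
        }
        return;
    }

    // DIRTY path: 4 means "dirty"; the card byte is written unconditionally
    *card = 4;                                           // must follow the field write (arm64 uses stlrb)
    if ((*group & 4) == 0)
    {
        *group = 4;
        if ((*page & 4) == 0)
            *page = 4;
    }
}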
@ -505,14 +528,13 @@ LEAF_ENTRY JIT_ByRefWriteBarrier, _TEXT
add rdi, 8h
add rsi, 8h
; See if assignment is into heap
mov rax, rcx
shr rax, 30 ; round to page size ( >> PAGE_BITS )
add rax, [g_card_bundle_table] ; fetch the page byte map
cmp byte ptr [rax], 0
jne JIT_WriteBarrier
; See if dst is in GCHeap
mov rax, [g_card_bundle_table] ; fetch the page byte map
mov r8, rcx
shr r8, 30 ; dst page index
cmp byte ptr [rax + r8], 0
jne CheckedEntry
align 16
NotInHeap:
mov [rcx], rdx
ret
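The byref flavor exists so the JIT can funnel sequential reference copies through one entry point that also advances both cursors by 8 on the way out. A rough C++ picture of the overall effect, with the barrier call reduced to a plain store (illustrative only; the register contract in the asm is what actually matters):

#include <cstddef>

static void CopyObjRefRangeSketch(void** dst, void** src, size_t count)
{
    for (size_t i = 0; i < count; i++)
    {
        // in the runtime each element goes through the checked write barrier,
        // which stores *src into *dst and leaves both cursors advanced
        *dst++ = *src++;
    }
}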

View file

@ -215,6 +215,11 @@ LEAF_END_MARKED JIT_ByRefWriteBarrier, _TEXT
#else //FEATURE_SATORI_GC ##############################################################################
.macro ALTERNATE_ENTRY Name
.global C_FUNC(\Name)
C_FUNC(\Name):
.endm
// Mark start of the code region that we patch at runtime
LEAF_ENTRY JIT_PatchedCodeStart, _TEXT
ret
@ -224,13 +229,13 @@ LEAF_END JIT_PatchedCodeStart, _TEXT
// void JIT_CheckedWriteBarrier(Object** dst, Object* src)
LEAF_ENTRY JIT_CheckedWriteBarrier, _TEXT
// See if this is in GCHeap
mov rax, rdi
shr rax, 30 // round to page size ( >> PAGE_BITS )
PREPARE_EXTERNAL_VAR g_card_bundle_table, r8
add rax, [r8] // fetch the page byte map
cmp byte ptr [rax], 0
jne C_FUNC(JIT_WriteBarrier)
// See if dst is in GCHeap
PREPARE_EXTERNAL_VAR g_card_bundle_table, rax // fetch the page byte map
mov rax, [rax]
mov r8, rdi
shr r8, 30 // dst page index
cmp byte ptr [rax + r8], 0
jne C_FUNC(CheckedEntry)
NotInHeap:
// See comment above about possible AV
@ -246,13 +251,18 @@ LEAF_END_MARKED JIT_CheckedWriteBarrier, _TEXT
.balign 16
LEAF_ENTRY JIT_WriteBarrier, _TEXT
#ifdef FEATURE_SATORI_EXTERNAL_OBJECTS
mov rax, rsi
shr rax, 30 // round to page size ( >> PAGE_BITS )
PREPARE_EXTERNAL_VAR g_card_bundle_table, r8
add rax, [r8] // fetch the page byte map
cmp byte ptr [rax], 0
je JustAssign // src not in heap
// check if src is in heap
PREPARE_EXTERNAL_VAR g_card_bundle_table, rax // fetch the page byte map
mov rax, [rax]
ALTERNATE_ENTRY CheckedEntry
mov r8, rsi
shr r8, 30 // src page index
cmp byte ptr [rax + r8], 0
je JustAssign // src not in heap
#else
ALTERNATE_ENTRY CheckedEntry
#endif
// check for escaping assignment
// 1) check if we own the source region
mov rdx, rsi
@ -292,18 +302,30 @@ LEAF_ENTRY JIT_WriteBarrier, _TEXT
AssignAndMarkCards:
mov [rdi], rsi
PREPARE_EXTERNAL_VAR g_sw_ww_enabled_for_gc_heap, r11
// TUNING: barriers in different modes could be separate pieces of code, but a barrier switch
// needs to suspend the EE; not sure if skipping the mode check would be worth that much.
PREPARE_EXTERNAL_VAR g_sw_ww_table, rcx
mov r11, [rcx]
// set rdi per contract with JIT_ByRefWriteBarrier
mov rax, rdi
add rdi, 8
xor rsi, rax
shr rsi, 21 // check if assigning within the same region (sets flags)
// check the barrier state. this must be done after the assignment (in program order)
// if state == 2 we do not set or dirty cards.
cmp r11, 2
jne DoCards
// set rsi per contract with JIT_ByRefWriteBarrier
mov rsi, r10
Exit:
ret
DoCards:
// if same region, just check if barrier is not concurrent
xor rsi, rax
shr rsi, 21
// set rsi per contract with JIT_ByRefWriteBarrier
mov rsi, r10
jz CheckConcurrent // same region, just check if barrier is not concurrent
// if src is in gen2/3 and the barrier is not concurrent we do not need to mark cards
@ -311,9 +333,9 @@ LEAF_ENTRY JIT_WriteBarrier, _TEXT
jl MarkCards
CheckConcurrent:
cmp byte ptr [r11], 0
jne MarkCards
ret
// if concurrent, load card location
cmp r11, 0
je Exit
MarkCards:
// fetch card location for rax (saved rdi)
@ -325,21 +347,22 @@ LEAF_ENTRY JIT_WriteBarrier, _TEXT
sub rdx, rax // offset in page
mov r8, rdx
shr rdx, 9 // card offset
shr r8, 21 // group offset
shr r8, 20 // group index
lea r8, [rax + r8 * 2 + 0x80] // group offset
// check if concurrent marking is in progress
cmp byte ptr [r11], 0
cmp r11, 0
jne DirtyCard
// SETTING CARD
SetCard:
cmp byte ptr [rax + rdx], 0
jne CardSet
jne Exit
mov byte ptr [rax + rdx], 1
SetGroup:
cmp byte ptr [rax + r8 * 2 + 0x80], 0
cmp byte ptr [r8], 0
jne CardSet
mov byte ptr [rax + r8 * 2 + 0x80], 1
mov byte ptr [r8], 1
SetPage:
cmp byte ptr [rax], 0
jne CardSet
@ -347,7 +370,7 @@ LEAF_ENTRY JIT_WriteBarrier, _TEXT
CardSet:
// check if concurrent marking is still not in progress
cmp byte ptr [r11], 0
cmp qword ptr [rcx], 0
jne DirtyCard
ret
@ -355,27 +378,33 @@ LEAF_ENTRY JIT_WriteBarrier, _TEXT
DirtyCard:
mov byte ptr [rax + rdx], 4
DirtyGroup:
cmp byte ptr [rax + r8 * 2 + 0x80], 4
cmp byte ptr [r8], 4
je Exit
mov byte ptr [rax + r8 * 2 + 0x80], 4
mov byte ptr [r8], 4
DirtyPage:
cmp byte ptr [rax], 4
je Exit
mov byte ptr [rax], 4
Exit:
ret
// this is expected to be rare.
RecordEscape:
// 4) check if the source is escaped
mov rax, rsi
add rax, 8 // escape bit is MT + 1
and rax, 0x1FFFFF
shr rax, 3
bt qword ptr [rdx], rax
jb AssignAndMarkCards // source is already escaped.
RecordEscape:
// save rdi, rsi, rdx and r10 (possibly preadjusted rsi)
// Align rsp
mov r9, rsp
and rsp, -16
sub rsp, 8
// save rsp, rdi, rsi, rdx and r10 (possibly preadjusted rsi)
push r9
push rdi
push rsi
push rdx
@ -388,6 +417,7 @@ LEAF_ENTRY JIT_WriteBarrier, _TEXT
pop rdx
pop rsi
pop rdi
pop rsp
jmp AssignAndMarkCards
LEAF_END_MARKED JIT_WriteBarrier, _TEXT
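Both barrier flavors carry the comment that dirtying "must be after the field write to allow concurrent clean". A tiny sketch of that ordering requirement using C++ atomics; the release store plays the role of the arm64 stlrb, while on x64 plain program order already gives the same store-store guarantee.

#include <atomic>
#include <cstdint>

static void DirtyAfterStoreSketch(std::atomic<void*>& slot, void* ref,
                                  std::atomic<uint8_t>& card)
{
    slot.store(ref, std::memory_order_relaxed);   // the reference field write
    card.store(4, std::memory_order_release);     // dirty the card no earlier than the write above
}
// if the order could flip, a concurrent cleaner might reset the card between the
// two stores and the marker would never revisit the freshly written reference.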
@ -407,19 +437,15 @@ LEAF_ENTRY JIT_ByRefWriteBarrier, _TEXT
lea r10, [rsi + 8]
mov rsi, [rsi]
// See if this is in GCHeap
PREPARE_EXTERNAL_VAR g_highest_address, rax
cmp rdi, [rax]
ja NotInHeap_ByRefWriteBarrier
// See if dst is in GCHeap
PREPARE_EXTERNAL_VAR g_card_bundle_table, rax // fetch the page byte map
mov rax, [rax]
PREPARE_EXTERNAL_VAR g_card_table, r8
mov r8, [r8] // fetch the page map
mov rax, rdi
shr rax, 30 // round to page size ( >> PAGE_BITS )
cmp qword ptr [r8 + rax * 8], 0
jne C_FUNC(JIT_WriteBarrier)
mov r8, rdi
shr r8, 30 // dst page index
cmp byte ptr [rax + r8], 0
jne C_FUNC(CheckedEntry)
.balign 16
NotInHeap_ByRefWriteBarrier:
mov [rdi], rsi
add rdi, 8

View file

@ -1154,7 +1154,7 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
#if FEATURE_SATORI_GC
case WriteBarrierOp::StartConcurrentMarkingSatori:
g_sw_ww_table = (uint8_t*)1;
g_sw_ww_table = args->write_watch_table;
g_sw_ww_enabled_for_gc_heap = true;
stompWBCompleteActions |= ::SwitchToWriteWatchBarrier(is_runtime_suspended);
if (!is_runtime_suspended)
@ -1163,11 +1163,12 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
// observing future allocations.
FlushProcessWriteBuffers();
}
return;
case WriteBarrierOp::StopConcurrentMarkingSatori:
assert(args->is_runtime_suspended && "the runtime must be suspended here!");
g_sw_ww_table = (uint8_t*)0;
g_sw_ww_table = args->write_watch_table;
g_sw_ww_enabled_for_gc_heap = false;
stompWBCompleteActions |= ::SwitchToNonWriteWatchBarrier(true);
return;
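For context, a sketch of how the GC side might drive this switch. Every field and type used here appears in the hunk above, but the helper itself and the choice of table value are illustrative, not the actual Satori call site, and the snippet only compiles inside the runtime where these declarations exist.

static void StartSatoriConcurrentMarkingSketch(uint8_t* tableForBarriers)
{
    WriteBarrierParameters args = {};
    args.operation            = WriteBarrierOp::StartConcurrentMarkingSatori;
    args.write_watch_table    = tableForBarriers;  // the value the barriers will now read
    args.is_runtime_suspended = false;             // start does not require a suspended EE
    GCToEEInterface::StompWriteBarrier(&args);
    // the callee flushes process write buffers when the runtime keeps running,
    // so every thread observes the new barrier state before marking relies on it
}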

View file

@ -438,6 +438,7 @@ class Object
LIMITED_METHOD_CONTRACT;
SUPPORTS_DAC;
#if !defined(FEATURE_SATORI_GC)
// lose GC marking bit and the reserved bit
// A method table pointer should always be aligned. During GC we set the least
// significant bit for marked objects, and the second to least significant
@ -448,6 +449,11 @@ class Object
#else
return dac_cast<PTR_MethodTable>((dac_cast<TADDR>(m_pMethTab)) & ~((UINT_PTR)3));
#endif //TARGET_64BIT
#else
// Satori does not mess up MT pointers.
_ASSERTE((dac_cast<TADDR>(m_pMethTab) & 7) == 0);
return dac_cast<PTR_MethodTable>((dac_cast<TADDR>(m_pMethTab)));
#endif
}
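A compact illustration of the two branches above, assuming a 64-bit target; MyMT is a stand-in type and the boolean parameter replaces the compile-time FEATURE_SATORI_GC split.

#include <cassert>
#include <cstdint>

struct MyMT;   // stand-in for MethodTable

static MyMT* GCSafeMTSketch(uintptr_t mtWord, bool satoriGC)
{
    if (!satoriGC)
        return (MyMT*)(mtWord & ~(uintptr_t)7);   // strip the mark/reserved low bits
    assert((mtWord & 7) == 0);                    // Satori never sets them, so just assert
    return (MyMT*)mtWord;
}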
// There are some cases where it is unsafe to get the type handle during a GC.

View file

@ -3253,8 +3253,7 @@ COR_PRF_SUSPEND_REASON GCSuspendReasonToProfSuspendReason(ThreadSuspend::SUSPEND
#endif // PROFILING_SUPPORTED
// exponential spinwait with an approximate time limit for waiting in the microsecond range.
// the wait is bounded by usecLimit only (the former iteration parameter is gone)
void SpinWait(int iteration, int usecLimit)
void SpinWait(int usecLimit)
{
LARGE_INTEGER li;
QueryPerformanceCounter(&li);
@ -3264,20 +3263,26 @@ void SpinWait(int iteration, int usecLimit)
int64_t ticksPerSecond = li.QuadPart;
int64_t endTicks = startTicks + (usecLimit * ticksPerSecond) / 1000000;
int l = min((unsigned)iteration, 30);
for (int i = 0; i < l; i++)
#ifdef TARGET_UNIX
if (usecLimit > 10)
{
for (int j = 0; j < (1 << i); j++)
{
System_YieldProcessor();
}
PAL_nanosleep(usecLimit * 1000);
}
#endif // TARGET_UNIX
for (int i = 0; i < 30; i++)
{
QueryPerformanceCounter(&li);
int64_t currentTicks = li.QuadPart;
if (currentTicks > endTicks)
{
break;
}
for (int j = 0; j < (1 << i); j++)
{
System_YieldProcessor();
}
}
}
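A standalone sketch of the reworked wait, with std::chrono in place of QueryPerformanceCounter, std::this_thread::sleep_for in place of PAL_nanosleep, and _mm_pause (x86-only) standing in for System_YieldProcessor; the Unix-only #ifdef around the sleep is dropped here for brevity.

#include <chrono>
#include <thread>
#include <immintrin.h>   // _mm_pause; assumes an x86 target for this sketch

static void SpinWaitSketch(int usecLimit)
{
    using clock = std::chrono::steady_clock;
    const auto deadline = clock::now() + std::chrono::microseconds(usecLimit);

    // for longer waits, give the time slice back once instead of burning it spinning
    if (usecLimit > 10)
        std::this_thread::sleep_for(std::chrono::microseconds(usecLimit));

    // exponentially growing pause batches, capped by the deadline
    for (int i = 0; i < 30 && clock::now() < deadline; i++)
        for (int j = 0; j < (1 << i); j++)
            _mm_pause();
}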