// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Runtime.InteropServices;

internal static partial class Interop
{
    internal static partial class Sys
    {
        // Thin P/Invoke shims over the io_uring syscalls exposed by libSystem.Native.
        // All entry points return Interop.Error (translated errno) rather than raw -1/errno.

        /// Wraps io_uring_setup(2): creates an io_uring instance.
        /// parms points to a struct io_uring_params (see IoUringParams); on success the
        /// ring file descriptor is written through ringFd.
        [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimSetup")]
        internal static unsafe partial Error IoUringShimSetup(
            uint entries, void* parms, int* ringFd);

        /// Wraps io_uring_enter(2): submits SQEs and/or waits for CQEs.
        /// The syscall's non-negative return value (number consumed) is written through result.
        [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimEnter")]
        internal static unsafe partial Error IoUringShimEnter(
            int ringFd, uint toSubmit, uint minComplete, uint flags, int* result);

        /// Wraps io_uring_enter2(2) with IORING_ENTER_EXT_ARG for bounded waits.
        /// arg points to a struct io_uring_getevents_arg (see IoUringGeteventsArg).
        [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimEnterExt")]
        internal static unsafe partial Error IoUringShimEnterExt(
            int ringFd, uint toSubmit, uint minComplete, uint flags, void* arg, int* result);

        /// Wraps io_uring_register(2): registers resources (files, buffers, ring fds).
        /// opcode selects the IORING_REGISTER_* / IORING_UNREGISTER_* operation.
        [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimRegister")]
        internal static unsafe partial Error IoUringShimRegister(
            int ringFd, uint opcode, void* arg, uint nrArgs, int* result);

        /// Wraps mmap(2): maps io_uring SQ/CQ ring memory at the given ring offset
        /// (IORING_OFF_SQ_RING / IORING_OFF_CQ_RING / IORING_OFF_SQES).
        [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimMmap")]
        internal static unsafe partial Error IoUringShimMmap(
            int ringFd, ulong size, ulong offset, void** mappedPtr);

        /// Wraps munmap(2): unmaps io_uring ring memory previously mapped via IoUringShimMmap.
        [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimMunmap")]
        internal static unsafe partial Error IoUringShimMunmap(
            void* addr, ulong size);

        /// Creates an eventfd for io_uring wakeup signaling.
        [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimCreateEventFd")]
        internal static unsafe partial Error IoUringShimCreateEventFd(
            int* eventFd);

        /// Writes to an eventfd to wake the io_uring event loop.
        [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimWriteEventFd")]
        internal static partial Error IoUringShimWriteEventFd(int eventFd);

        /// Reads from an eventfd to consume a wakeup signal; the 8-byte counter value
        /// is written through value.
        [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimReadEventFd")]
        internal static unsafe partial Error IoUringShimReadEventFd(
            int eventFd, ulong* value);

        /// Wraps close(2): closes a file descriptor.
        [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimCloseFd")]
        internal static partial Error IoUringShimCloseFd(int fd);
    }
}
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Net.Sockets;
using System.Runtime.InteropServices;

internal static partial class Interop
{
    internal static partial class Sys
    {
        // Managed mirrors of the io_uring UAPI structures (linux/io_uring.h) plus derived
        // ring-state records. Explicit layouts must match the kernel ABI byte-for-byte.

        /// Derived SQ ring state computed after mmap, used by the managed submission path.
        [StructLayout(LayoutKind.Sequential)]
        internal struct IoUringSqRingInfo
        {
            public IntPtr SqeBase;            // io_uring_sqe* base of SQE array
            public IntPtr SqTailPtr;          // uint32_t* managed advances SQ tail
            public IntPtr SqHeadPtr;          // uint32_t* kernel consumes SQ head
            public uint SqMask;               // SqEntries - 1
            public uint SqEntries;            // number of SQ slots
            public uint SqeSize;              // size of one SQE
            public byte UsesNoSqArray;        // nonzero when IORING_SETUP_NO_SQARRAY is in effect
            public int RingFd;
            public int RegisteredRingFd;      // index from IORING_REGISTER_RING_FDS, or sentinel
            public byte UsesEnterExtArg;      // nonzero when enter2 EXT_ARG is available
            public byte UsesRegisteredFiles;
        }

        /// Mirrors kernel struct io_sqring_offsets (40 bytes). Fields at offset 28+ (resv1, user_addr) are unused.
        [StructLayout(LayoutKind.Explicit, Size = 40)]
        internal struct IoUringSqOffsets
        {
            [FieldOffset(0)] public uint Head;
            [FieldOffset(4)] public uint Tail;
            [FieldOffset(8)] public uint RingMask;
            [FieldOffset(12)] public uint RingEntries;
            [FieldOffset(16)] public uint Flags;
            [FieldOffset(20)] public uint Dropped;
            [FieldOffset(24)] public uint Array;
            // resv1 at 28, user_addr at 32 - not needed by managed code
        }

        /// Mirrors kernel struct io_cqring_offsets (40 bytes). Fields at offset 28+ (resv1, user_addr) are unused.
        [StructLayout(LayoutKind.Explicit, Size = 40)]
        internal struct IoUringCqOffsets
        {
            [FieldOffset(0)] public uint Head;
            [FieldOffset(4)] public uint Tail;
            [FieldOffset(8)] public uint RingMask;
            [FieldOffset(12)] public uint RingEntries;
            [FieldOffset(16)] public uint Overflow;
            [FieldOffset(20)] public uint Cqes;
            [FieldOffset(24)] public uint Flags;
            // resv1 at 28, user_addr at 32 - not needed by managed code
        }

        /// Mirrors kernel struct io_uring_params (120 bytes), passed to io_uring_setup.
        /// Layout: 7 leading u32 fields, resv[3], then the two 40-byte offsets structs.
        [StructLayout(LayoutKind.Explicit, Size = 120)]
        internal struct IoUringParams
        {
            [FieldOffset(0)] public uint SqEntries;
            [FieldOffset(4)] public uint CqEntries;
            [FieldOffset(8)] public uint Flags;
            [FieldOffset(12)] public uint SqThreadCpu;
            [FieldOffset(16)] public uint SqThreadIdle;
            [FieldOffset(20)] public uint Features;
            [FieldOffset(24)] public uint WqFd;
            // resv[3] at 28-39
            [FieldOffset(40)] public IoUringSqOffsets SqOff;
            [FieldOffset(80)] public IoUringCqOffsets CqOff;
        }

        /// Mirrors kernel struct io_uring_cqe (16 bytes), read from the CQ ring.
        [StructLayout(LayoutKind.Explicit, Size = 16)]
        internal struct IoUringCqe
        {
            [FieldOffset(0)] public ulong UserData;   // user_data echoed from the SQE
            [FieldOffset(8)] public int Result;       // res: bytes transferred or -errno
            [FieldOffset(12)] public uint Flags;      // IORING_CQE_F_* flags (buffer id in upper bits)
        }

        /// Mirrors kernel struct io_uring_buf (16 bytes), used by provided-buffer rings.
        [StructLayout(LayoutKind.Explicit, Size = 16)]
        internal struct IoUringBuf
        {
            [FieldOffset(0)] public ulong Address;
            [FieldOffset(8)] public uint Length;
            [FieldOffset(12)] public ushort BufferId;
            [FieldOffset(14)] public ushort Reserved;
        }

        /// Mirrors the header overlay of kernel struct io_uring_buf_ring (16 bytes).
        /// In UAPI this shares offset 0 with the first io_uring_buf entry via a union;
        /// only Tail (offset 14) is meaningful in the header view.
        [StructLayout(LayoutKind.Explicit, Size = 16)]
        internal struct IoUringBufRingHeader
        {
            [FieldOffset(0)] public ulong Reserved1;
            [FieldOffset(8)] public uint Reserved2;
            [FieldOffset(12)] public ushort Reserved3;
            [FieldOffset(14)] public ushort Tail;
        }

        /// Mirrors kernel struct io_uring_buf_reg (40 bytes), used for pbuf ring registration.
        [StructLayout(LayoutKind.Explicit, Size = 40)]
        internal struct IoUringBufReg
        {
            [FieldOffset(0)] public ulong RingAddress;
            [FieldOffset(8)] public uint RingEntries;
            [FieldOffset(12)] public ushort BufferGroupId;
            [FieldOffset(14)] public ushort Padding;
            [FieldOffset(16)] public ulong Reserved0;
            [FieldOffset(24)] public ulong Reserved1;
            [FieldOffset(32)] public ulong Reserved2;
        }

        /// Derived CQ ring state computed after mmap, used by the managed completion drain path.
        [StructLayout(LayoutKind.Sequential)]
        internal struct IoUringCqRingInfo
        {
            public IntPtr CqeBase;        // io_uring_cqe* base of CQE array
            public IntPtr CqTailPtr;      // uint32_t* kernel writes CQ tail
            public IntPtr CqHeadPtr;      // uint32_t* managed advances CQ head
            public uint CqMask;           // CqEntries - 1
            public uint CqEntries;        // number of CQ slots
            public uint CqeSize;          // sizeof(io_uring_cqe) = 16
            public IntPtr CqOverflowPtr;  // uint32_t* kernel CQ overflow counter
        }

        /// Mirrors kernel struct io_uring_getevents_arg, used with IORING_ENTER_EXT_ARG.
        /// Ts holds a pointer-sized address of a __kernel_timespec (or 0 for no timeout).
        [StructLayout(LayoutKind.Sequential)]
        internal struct IoUringGeteventsArg
        {
            public ulong Sigmask;
            public uint SigmaskSize;
            public uint MinWaitUsec;
            public ulong Ts;
        }

        /// Mirrors kernel struct __kernel_timespec, used for io_uring timeout arguments.
        [StructLayout(LayoutKind.Sequential)]
        internal struct IoUringKernelTimespec
        {
            public long TvSec;
            public long TvNsec;
        }

    }
}
        /// <summary>
        /// Initializes a provided-buffer ring and registers it with the kernel when supported.
        /// Failures are non-fatal and leave completion mode enabled without provided buffers.
        /// </summary>
        private void InitializeIoUringProvidedBufferRingIfSupported(int ringFd)
        {
            // Reset all provided-buffer state up front so any failed attempt leaves a clean slate.
            _supportsProvidedBufferRings = false;
            _ioUringBuffersRegistered = false;
            _adaptiveBufferSizingEnabled = false;
            _ioUringProvidedBufferGroupId = 0;
            _ioUringProvidedBufferRing = null;

            if (!IoUringProvidedBufferRing.TryCreate(
                IoUringProvidedBufferGroupIdDefault,
                IoUringProvidedBufferRingEntries,
                s_ioUringProvidedBufferSize,
                s_ioUringAdaptiveBufferSizingEnabled,
                out IoUringProvidedBufferRing? bufferRing) ||
                bufferRing is null)
            {
                return;
            }

            // Pbuf-ring registration can fail on older kernels; treat that as "feature absent".
            Interop.Error registerError = bufferRing.Register(ringFd);
            if (registerError != Interop.Error.SUCCESS)
            {
                bufferRing.Dispose();
                return;
            }

            _ioUringProvidedBufferRing = bufferRing;
            _ioUringProvidedBufferGroupId = bufferRing.BufferGroupId;
            _supportsProvidedBufferRings = true;
            _adaptiveBufferSizingEnabled = s_ioUringAdaptiveBufferSizingEnabled;
            _ioUringBuffersRegistered = TryRegisterProvidedBuffersWithTelemetry(bufferRing, ringFd, isReregistration: false);

            SocketsTelemetry.Log.IoUringProvidedBufferCurrentSize(bufferRing.BufferSize);
        }

        /// <summary>
        /// Evaluates adaptive buffer-sizing recommendations and hot-swaps the provided-buffer ring when safe.
        /// Must run on the event-loop thread.
        /// </summary>
        private void EvaluateProvidedBufferRingResize()
        {
            if (!_adaptiveBufferSizingEnabled || _managedRingFd < 0)
            {
                return;
            }

            IoUringProvidedBufferRing? currentRing = _ioUringProvidedBufferRing;
            if (currentRing is null)
            {
                return;
            }

            int currentBufferSize = currentRing.BufferSize;
            int recommendedBufferSize = currentRing.RecommendedBufferSize;
            if (recommendedBufferSize == 0 || recommendedBufferSize == currentBufferSize)
            {
                return;
            }

            // A swap is only safe when no buffers are checked out to in-flight operations.
            if (currentRing.InUseCount > 0)
            {
                return;
            }

            // Alternate between group ids 1 and 2 so the new ring can be registered before
            // the old one is unregistered (the kernel keys pbuf rings by group id).
            ushort newGroupId = _ioUringProvidedBufferGroupId == 1 ? (ushort)2 : (ushort)1;
            if (!IoUringProvidedBufferRing.TryCreate(
                newGroupId,
                IoUringProvidedBufferRingEntries,
                recommendedBufferSize,
                adaptiveSizingEnabled: true,
                out IoUringProvidedBufferRing? replacementRing) ||
                replacementRing is null)
            {
                return;
            }

            bool restorePreviousBufferRegistration = _ioUringBuffersRegistered;
            TryUnregisterProvidedBuffersIfRegistered(currentRing, _managedRingFd);

            if (replacementRing.Register(_managedRingFd) != Interop.Error.SUCCESS)
            {
                // Swap failed: keep the current ring and restore its REGISTER_BUFFERS state.
                replacementRing.Dispose();
                if (restorePreviousBufferRegistration)
                {
                    _ioUringBuffersRegistered = TryRegisterProvidedBuffersWithTelemetry(
                        currentRing,
                        _managedRingFd,
                        isReregistration: true);
                }

                return;
            }

            currentRing.Unregister(_managedRingFd);
            currentRing.Dispose();

            _ioUringProvidedBufferRing = replacementRing;
            _ioUringProvidedBufferGroupId = replacementRing.BufferGroupId;
            _supportsProvidedBufferRings = true;
            RefreshIoUringMultishotRecvSupport();
            _ioUringBuffersRegistered = TryRegisterProvidedBuffersWithTelemetry(
                replacementRing,
                _managedRingFd,
                isReregistration: true);

            SocketsTelemetry.Log.IoUringProvidedBufferResize();
            SocketsTelemetry.Log.IoUringProvidedBufferCurrentSize(replacementRing.BufferSize);
        }

        /// <summary>
        /// Returns the provided-buffer size: the default, or (DEBUG only) a test override from
        /// DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PROVIDED_BUFFER_SIZE.
        /// </summary>
        private static int GetConfiguredIoUringProvidedBufferSize()
        {
#if DEBUG
            string? configuredValue = Environment.GetEnvironmentVariable(
                "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PROVIDED_BUFFER_SIZE");

            if (!string.IsNullOrWhiteSpace(configuredValue))
            {
                return int.TryParse(configuredValue, out int parsedSize) && parsedSize > 0
                    ? parsedSize
                    : IoUringProvidedBufferSizeDefault;
            }
#endif

            return IoUringProvidedBufferSizeDefault;
        }

        /// <summary>
        /// Adaptive sizing is a DEBUG-only opt-in via environment variable; always off in release.
        /// </summary>
        private static bool IsAdaptiveIoUringProvidedBufferSizingEnabled()
        {
#if DEBUG
            string? configuredValue = Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ADAPTIVE_BUFFER_SIZING");
            return string.Equals(configuredValue, "1", StringComparison.Ordinal);
#else
            return false;
#endif
        }

        /// <summary>
        /// IORING_REGISTER_BUFFERS is enabled by default; DEBUG builds honor a "1"/"0" test override.
        /// </summary>
        private static bool IsIoUringRegisterBuffersEnabled()
        {
#if DEBUG
            // Test-only override for deterministic tests.
            string? configuredValue = Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_REGISTER_BUFFERS");
            if (string.Equals(configuredValue, "1", StringComparison.Ordinal))
            {
                return true;
            }

            if (string.Equals(configuredValue, "0", StringComparison.Ordinal))
            {
                return false;
            }
#endif

            // Default: enabled.
            return true;
        }

        /// <summary>
        /// Attempts IORING_REGISTER_BUFFERS for the ring's pinned payload pages and emits telemetry.
        /// Returns whether registration succeeded; false when the feature is disabled or ringFd is invalid.
        /// </summary>
        private static bool TryRegisterProvidedBuffersWithTelemetry(
            IoUringProvidedBufferRing bufferRing,
            int ringFd,
            bool isReregistration)
        {
            if (!s_ioUringRegisterBuffersEnabled || ringFd < 0)
            {
                return false;
            }

            // REGISTER_BUFFERS is orthogonal to provided-buffer selection (RECV + IOSQE_BUFFER_SELECT).
            // Any performance benefit for this path is kernel-dependent and must be validated empirically.
            bool registered = bufferRing.TryRegisterBuffersWithKernel(ringFd);
            if (isReregistration)
            {
                SocketsTelemetry.Log.IoUringRegisteredBuffersReregistration(registered);
            }
            else
            {
                SocketsTelemetry.Log.IoUringRegisteredBuffersResult(
                    registered,
                    IoUringProvidedBufferRingEntries,
                    bufferRing.BufferSize);
            }

            return registered;
        }

        /// <summary>
        /// Best-effort IORING_UNREGISTER_BUFFERS; clears the registered flag regardless of syscall outcome.
        /// </summary>
        private void TryUnregisterProvidedBuffersIfRegistered(IoUringProvidedBufferRing bufferRing, int ringFd)
        {
            if (!_ioUringBuffersRegistered || ringFd < 0)
            {
                return;
            }

            bufferRing.TryUnregisterBuffersFromKernel(ringFd);
            _ioUringBuffersRegistered = false;
        }
bufferRing = _ioUringProvidedBufferRing; + _ioUringProvidedBufferRing = null; + _supportsProvidedBufferRings = false; + _adaptiveBufferSizingEnabled = false; + _ioUringProvidedBufferGroupId = 0; + + if (bufferRing is null) + { + return; + } + + int recycledForTeardown = bufferRing.RecycleCheckedOutBuffersForTeardown(); + if (recycledForTeardown > 0) + { + SocketsTelemetry.Log.IoUringProvidedBufferRecycle(recycledForTeardown); + } + + TryUnregisterProvidedBuffersIfRegistered(bufferRing, _managedRingFd); + + if (_managedRingFd >= 0) + { + bufferRing.Unregister(_managedRingFd); + } + + bufferRing.Dispose(); + _ioUringBuffersRegistered = false; + } + + /// + /// Owns a managed provided-buffer ring registration: native ring memory, pinned managed + /// buffers, buffer-id lifecycle, and recycle counters. + /// + private sealed unsafe class IoUringProvidedBufferRing : IDisposable + { + private const int AdaptiveWindowCompletionCount = 256; + private const int AdaptiveMinBufferSize = 128; + private const int AdaptiveMaxBufferSize = 65536; + private const int PreparedReceiveMinimumReserve = 8; + private const int PreparedReceiveMaximumReserve = 64; + private const byte BufferStatePosted = 1; + private const byte BufferStateCheckedOut = 2; + + private readonly ushort _bufferGroupId; + private readonly int _bufferSize; + private readonly uint _ringEntries; + private readonly uint _ringMask; + private readonly bool _adaptiveSizingEnabled; + private readonly GCHandle[] _bufferHandles; + private readonly byte[] _bufferStates; + private readonly Interop.Sys.IoUringBuf* _ringBuffers; + private readonly Interop.Sys.IoUringBufRingHeader* _ringHeader; + private readonly void* _ringMemory; + private bool _registered; + private bool _disposed; + private int _availableCount; + private int _inUseCount; + private long _recycledCount; + private long _allocationFailureCount; + private long _totalCompletionBytes; + private long _totalCompletionCount; + private long 
_completionsAboveHighWatermark; + private long _completionsBelowLowWatermark; + private int _recommendedBufferSize; + private uint _nextPreparedReceiveBufferHint; + private int _debugOwningThreadId; + + internal ushort BufferGroupId => _bufferGroupId; + internal int BufferSize => _bufferSize; + internal int AvailableCount => Volatile.Read(ref _availableCount); + internal int InUseCount => Volatile.Read(ref _inUseCount); + internal long RecycledCount => Interlocked.Read(ref _recycledCount); + internal long AllocationFailureCount => Interlocked.Read(ref _allocationFailureCount); + internal int RecommendedBufferSize => Volatile.Read(ref _recommendedBufferSize); + + private IoUringProvidedBufferRing(ushort bufferGroupId, int ringEntries, int bufferSize, bool adaptiveSizingEnabled) + { + ArgumentOutOfRangeException.ThrowIfNegativeOrZero(ringEntries); + if (!BitOperations.IsPow2((uint)ringEntries) || ringEntries > ushort.MaxValue) + { + throw new ArgumentOutOfRangeException(nameof(ringEntries)); + } + + ArgumentOutOfRangeException.ThrowIfNegativeOrZero(bufferSize); + + _bufferGroupId = bufferGroupId; + _bufferSize = bufferSize; + _adaptiveSizingEnabled = adaptiveSizingEnabled; + _ringEntries = (uint)ringEntries; + _ringMask = (uint)ringEntries - 1; + _availableCount = ringEntries; + _recommendedBufferSize = bufferSize; + _bufferHandles = new GCHandle[ringEntries]; + _bufferStates = GC.AllocateUninitializedArray(ringEntries); + + nuint ringByteCount = checked((nuint)ringEntries * (nuint)sizeof(Interop.Sys.IoUringBuf)); + _ringMemory = NativeMemory.AlignedAlloc(ringByteCount, (nuint)Environment.SystemPageSize); + if (_ringMemory is null) + { + throw new OutOfMemoryException(); + } + + NativeMemory.Clear(_ringMemory, ringByteCount); + _ringBuffers = (Interop.Sys.IoUringBuf*)_ringMemory; + _ringHeader = (Interop.Sys.IoUringBufRingHeader*)_ringMemory; + + int initializedCount = 0; + try + { + for (int i = 0; i < ringEntries; i++) + { + byte[] buffer = 
GC.AllocateUninitializedArray(bufferSize); + GCHandle handle = GCHandle.Alloc(buffer, GCHandleType.Pinned); + + _bufferHandles[i] = handle; + _bufferStates[i] = BufferStatePosted; + + WriteBufferDescriptor((uint)i, (ushort)i); + initializedCount++; + } + + PublishTail((ushort)initializedCount); + } + catch + { + Interlocked.Increment(ref _allocationFailureCount); + ReleasePinnedBuffers(initializedCount); + NativeMemory.AlignedFree(_ringMemory); + throw; + } + } + + internal static bool TryCreate( + ushort bufferGroupId, + int ringEntries, + int bufferSize, + bool adaptiveSizingEnabled, + out IoUringProvidedBufferRing? bufferRing) + { + try + { + bufferRing = new IoUringProvidedBufferRing(bufferGroupId, ringEntries, bufferSize, adaptiveSizingEnabled); + return true; + } + catch (ArgumentOutOfRangeException) + { + } + catch (OutOfMemoryException) + { + } + + bufferRing = null; + return false; + } + + /// Records a completion's bytes-transferred for adaptive sizing decisions. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal void RecordCompletionUtilization(int bytesTransferred) + { + AssertSingleThreadAccess(); + if (!_adaptiveSizingEnabled || bytesTransferred <= 0) + { + return; + } + + int clampedBytes = Math.Min(bytesTransferred, _bufferSize); + _totalCompletionBytes += clampedBytes; + long count = ++_totalCompletionCount; + + int highWatermark = (_bufferSize * 3) / 4; + int lowWatermark = _bufferSize / 4; + if (clampedBytes > highWatermark) + { + _completionsAboveHighWatermark++; + } + else if (clampedBytes < lowWatermark) + { + _completionsBelowLowWatermark++; + } + + if ((count & (AdaptiveWindowCompletionCount - 1)) == 0) + { + EvaluateAdaptiveResize(); + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private void EvaluateAdaptiveResize() + { + AssertSingleThreadAccess(); + if (!_adaptiveSizingEnabled) + { + return; + } + + long windowBytes = _totalCompletionBytes; + long aboveHigh = _completionsAboveHighWatermark; + long belowLow = 
_completionsBelowLowWatermark; + _totalCompletionBytes = 0; + _completionsAboveHighWatermark = 0; + _completionsBelowLowWatermark = 0; + + int currentSize = _bufferSize; + int recommendedSize = currentSize; + if (aboveHigh > AdaptiveWindowCompletionCount / 2 || + windowBytes > (long)AdaptiveWindowCompletionCount * ((long)currentSize * 3 / 4)) + { + recommendedSize = Math.Min(currentSize * 2, AdaptiveMaxBufferSize); + } + else if (belowLow > AdaptiveWindowCompletionCount / 2 || + windowBytes < (long)AdaptiveWindowCompletionCount * ((long)currentSize / 4)) + { + recommendedSize = Math.Max(currentSize / 2, AdaptiveMinBufferSize); + } + + Volatile.Write(ref _recommendedBufferSize, recommendedSize); + } + + internal Interop.Error Register(int ringFd) + { + Debug.Assert(!_disposed); + + if (_registered) + { + return Interop.Error.SUCCESS; + } + + Interop.Sys.IoUringBufReg registration = default; + registration.RingAddress = (ulong)(nuint)_ringMemory; + registration.RingEntries = _ringEntries; + registration.BufferGroupId = _bufferGroupId; + + int result; + Interop.Error registerError = Interop.Sys.IoUringShimRegister( + ringFd, + IoUringConstants.RegisterPbufRing, + ®istration, + 1u, + &result); + if (registerError == Interop.Error.SUCCESS) + { + _registered = true; + } + + return registerError; + } + + internal Interop.Error Unregister(int ringFd) + { + if (!_registered) + { + return Interop.Error.SUCCESS; + } + + Interop.Sys.IoUringBufReg registration = default; + registration.BufferGroupId = _bufferGroupId; + int result; + Interop.Error unregisterError = Interop.Sys.IoUringShimRegister( + ringFd, + IoUringConstants.UnregisterPbufRing, + ®istration, + 1u, + &result); + if (unregisterError == Interop.Error.SUCCESS) + { + _registered = false; + } + + return unregisterError; + } + + /// + /// Attempts to register pinned buffer payload pages with the kernel via IORING_REGISTER_BUFFERS. 
            /// <summary>
            /// Attempts to register pinned buffer payload pages with the kernel via IORING_REGISTER_BUFFERS.
            /// Failure is non-fatal and callers should gracefully continue with unregistered buffers.
            /// This does not switch recv SQEs to fixed-buffer opcodes; provided-buffer recv stays on
            /// IORING_OP_RECV + IOSQE_BUFFER_SELECT.
            /// </summary>
            internal bool TryRegisterBuffersWithKernel(int ringFd)
            {
                if (_disposed || ringFd < 0 || _bufferHandles.Length == 0)
                {
                    return false;
                }

                // Build a temporary iovec array describing every pinned payload buffer.
                nuint allocationSize = checked((nuint)_bufferHandles.Length * (nuint)sizeof(Interop.Sys.IOVector));
                Interop.Sys.IOVector* iovecArray;
                try
                {
                    iovecArray = (Interop.Sys.IOVector*)NativeMemory.Alloc(allocationSize);
                }
                catch (OutOfMemoryException)
                {
                    return false;
                }

                try
                {
                    for (int i = 0; i < _bufferHandles.Length; i++)
                    {
                        GCHandle handle = _bufferHandles[i];
                        if (!handle.IsAllocated)
                        {
                            return false;
                        }

                        iovecArray[i].Base = (byte*)handle.AddrOfPinnedObject();
                        iovecArray[i].Count = (UIntPtr)_bufferSize;
                    }

                    int result;
                    Interop.Error registerError = Interop.Sys.IoUringShimRegister(
                        ringFd,
                        IoUringConstants.RegisterBuffers,
                        iovecArray,
                        (uint)_bufferHandles.Length,
                        &result);
                    return registerError == Interop.Error.SUCCESS;
                }
                finally
                {
                    // The kernel copies the iovec table during the syscall; safe to free immediately.
                    NativeMemory.Free(iovecArray);
                }
            }

            /// <summary>Unregisters previously registered pinned buffers via IORING_UNREGISTER_BUFFERS.</summary>
            internal bool TryUnregisterBuffersFromKernel(int ringFd)
            {
                if (_disposed || ringFd < 0)
                {
                    return false;
                }

                int result;
                Interop.Error unregisterError = Interop.Sys.IoUringShimRegister(
                    ringFd,
                    IoUringConstants.UnregisterBuffers,
                    null,
                    0u,
                    &result);
                return unregisterError == Interop.Error.SUCCESS;
            }

            /// <summary>
            /// Acquires a kernel-selected buffer id for completion processing, transitioning it
            /// Posted -> CheckedOut and returning its pinned address/length.
            /// </summary>
            internal bool TryAcquireBufferForCompletion(ushort bufferId, out byte* buffer, out int bufferLength)
            {
                AssertSingleThreadAccess();
                buffer = null;
                bufferLength = 0;

                if (bufferId >= _ringEntries)
                {
                    Interlocked.Increment(ref _allocationFailureCount);
                    return false;
                }

                byte state = _bufferStates[bufferId];
                if (state != BufferStatePosted)
                {
                    Debug.Assert(
                        state == BufferStateCheckedOut,
                        $"Unexpected provided-buffer state during acquire: id={bufferId}, state={state}");
                    Interlocked.Increment(ref _allocationFailureCount);
                    return false;
                }

                _bufferStates[bufferId] = BufferStateCheckedOut;
                Debug.Assert(_availableCount > 0, "Provided-buffer available count underflow.");
                _availableCount--;
                _inUseCount++;

                GCHandle handle = _bufferHandles[bufferId];
                if (!handle.IsAllocated)
                {
                    // Roll back the state transition; the backing buffer is unexpectedly gone.
                    _bufferStates[bufferId] = BufferStatePosted;
                    _availableCount++;
                    _inUseCount--;
                    Interlocked.Increment(ref _allocationFailureCount);
                    return false;
                }

                buffer = (byte*)handle.AddrOfPinnedObject();
                bufferLength = _bufferSize;
                return true;
            }

            /// <summary>
            /// Acquires any currently posted provided buffer for fixed-recv submission.
            /// The acquired buffer remains checked out until completion recycles it.
            /// </summary>
            internal bool TryAcquireBufferForPreparedReceive(out ushort bufferId, out byte* buffer, out int bufferLength)
            {
                AssertSingleThreadAccess();
                bufferId = 0;
                buffer = null;
                bufferLength = 0;

                // Keep a reserve for kernel-selected (IOSQE_BUFFER_SELECT) receive completions so
                // fixed-recv one-shots don't deplete the provided-buffer pool under sustained load.
                int reserveCount = GetPreparedReceiveReserveCount();
                if (Volatile.Read(ref _availableCount) <= reserveCount)
                {
                    return false;
                }

                // Round-robin scan starting at the last successful slot to spread wear across ids.
                uint start = _nextPreparedReceiveBufferHint;
                for (uint i = 0; i < _ringEntries; i++)
                {
                    uint candidate = (start + i) & _ringMask;
                    ushort candidateId = (ushort)candidate;
                    if (_bufferStates[candidateId] != BufferStatePosted)
                    {
                        continue;
                    }

                    if (TryAcquireBufferForCompletion(candidateId, out buffer, out bufferLength))
                    {
                        bufferId = candidateId;
                        _nextPreparedReceiveBufferHint = (candidate + 1) & _ringMask;
                        return true;
                    }
                }

                return false;
            }

            /// <summary>Reserve scales with ring size (1/16th), clamped to [8, 64].</summary>
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            private int GetPreparedReceiveReserveCount()
            {
                int ringEntryCount = (int)_ringEntries;
                int dynamicReserve = ringEntryCount / 16;
                return Math.Clamp(dynamicReserve, PreparedReceiveMinimumReserve, PreparedReceiveMaximumReserve);
            }

            /// <summary>Returns the pointer/length for a buffer that is already checked out.</summary>
            internal bool TryGetCheckedOutBuffer(ushort bufferId, out byte* buffer, out int bufferLength)
            {
                buffer = null;
                bufferLength = 0;

                if (bufferId >= _ringEntries || _bufferStates[bufferId] != BufferStateCheckedOut)
                {
                    return false;
                }

                GCHandle handle = _bufferHandles[bufferId];
                if (!handle.IsAllocated)
                {
                    Interlocked.Increment(ref _allocationFailureCount);
                    return false;
                }

                buffer = (byte*)handle.AddrOfPinnedObject();
                bufferLength = _bufferSize;
                return true;
            }
+ internal bool TryRecycleBufferFromCompletion(ushort bufferId) + { + AssertSingleThreadAccess(); + if (bufferId >= _ringEntries) + { + return false; + } + + byte state = _bufferStates[bufferId]; + if (state != BufferStateCheckedOut) + { + Debug.Assert( + state == BufferStatePosted, + $"Unexpected provided-buffer state during recycle: id={bufferId}, state={state}"); + return false; + } + + RecycleCheckedOutBuffer(bufferId); + return true; + } + + /// + /// Recycles any still-checked-out ids back into the ring during teardown. + /// Returns the number of ids recycled. + /// + internal int RecycleCheckedOutBuffersForTeardown() + { + AssertSingleThreadAccess(); + int recycledCount = 0; + for (ushort bufferId = 0; bufferId < _ringEntries; bufferId++) + { + if (_bufferStates[bufferId] != BufferStateCheckedOut) + { + continue; + } + + RecycleCheckedOutBuffer(bufferId); + recycledCount++; + } + + return recycledCount; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void RecycleCheckedOutBuffer(ushort bufferId) + { + ushort tail = ReadTail(); + uint ringIndex = (uint)tail & _ringMask; + WriteBufferDescriptor(ringIndex, bufferId); + _bufferStates[bufferId] = BufferStatePosted; + _availableCount++; + Debug.Assert(_inUseCount > 0, "Provided-buffer in-use count underflow."); + _inUseCount--; + PublishTail(unchecked((ushort)(tail + 1))); + Interlocked.Increment(ref _recycledCount); + } + + [Conditional("DEBUG")] + private void AssertSingleThreadAccess() + { + int currentThreadId = Environment.CurrentManagedThreadId; + int ownerThreadId = Volatile.Read(ref _debugOwningThreadId); + if (ownerThreadId == 0) + { + int prior = Interlocked.CompareExchange(ref _debugOwningThreadId, currentThreadId, comparand: 0); + ownerThreadId = prior == 0 ? currentThreadId : prior; + } + + Debug.Assert( + ownerThreadId == currentThreadId, + $"IoUringProvidedBufferRing mutable state must be accessed from one thread. 
Owner={ownerThreadId}, current={currentThreadId}"); + } + + public void Dispose() + { + if (_disposed) + { + return; + } + +#if DEBUG + int checkedOutBufferCount = 0; + for (int i = 0; i < _bufferStates.Length; i++) + { + if (_bufferStates[i] == BufferStateCheckedOut) + { + checkedOutBufferCount++; + } + } + + Debug.Assert( + checkedOutBufferCount == 0, + $"Disposing provided-buffer ring with outstanding checked-out buffers: {checkedOutBufferCount}"); +#endif + + _registered = false; + ReleasePinnedBuffers(_bufferHandles.Length); + NativeMemory.AlignedFree(_ringMemory); + _disposed = true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private ushort ReadTail() => + Volatile.Read(ref Unsafe.AsRef(&_ringHeader->Tail)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void PublishTail(ushort tail) => + Volatile.Write(ref Unsafe.AsRef(&_ringHeader->Tail), tail); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void WriteBufferDescriptor(uint ringIndex, ushort bufferId) + { + Debug.Assert(ringIndex < _ringEntries); + Debug.Assert(bufferId < _ringEntries); + Debug.Assert(_bufferHandles[bufferId].IsAllocated); + + Interop.Sys.IoUringBuf* bufferSlot = _ringBuffers + ringIndex; + bufferSlot->Address = (ulong)(nuint)_bufferHandles[bufferId].AddrOfPinnedObject(); + bufferSlot->Length = (uint)_bufferSize; + bufferSlot->BufferId = bufferId; + bufferSlot->Reserved = 0; + } + + private void ReleasePinnedBuffers(int count) + { + for (int i = 0; i < count; i++) + { + if (_bufferHandles[i].IsAllocated) + { + _bufferHandles[i].Free(); + } + } + } + } + } +} diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/MpscQueue.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/MpscQueue.cs new file mode 100644 index 00000000000000..e4548a7cbe5294 --- /dev/null +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/MpscQueue.cs @@ -0,0 +1,276 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Runtime.CompilerServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+    /// <summary>
+    /// Lock-free multi-producer, single-consumer queue optimized for the io_uring
+    /// event loop pattern where many threads enqueue work items but exactly one
+    /// thread drains them.
+    ///
+    /// Liveness contract:
+    /// TryDequeue/IsEmpty may observe a producer between index claim and publish
+    /// (Interlocked.Increment followed by Volatile.Write), and can transiently report
+    /// no available item even though an enqueue is in progress. Callers must provide
+    /// their own wakeup/progress mechanism after Enqueue.
+    /// </summary>
+    internal sealed class MpscQueue<T>
+    {
+        private const int DefaultSegmentSize = 256;
+
+        private readonly int _segmentSize;
+        private PaddedSegment _head;
+        private PaddedSegment _tail;
+        // Safe to recycle only segments that lost the tail->next link race and were never published.
+        // Reusing drained, previously-linked segments would require producer quiescence tracking to
+        // avoid stale producer references writing into a reset segment.
+        private Segment? _cachedUnlinkedSegment;
+
+        internal MpscQueue(int segmentSize = DefaultSegmentSize)
+        {
+            ArgumentOutOfRangeException.ThrowIfNegativeOrZero(segmentSize);
+            _segmentSize = segmentSize;
+            Segment initial = new Segment(segmentSize);
+            _head.Value = initial;
+            _tail.Value = initial;
+        }
+
+        /// <summary>
+        /// Enqueues an item.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal void Enqueue(T item)
+        {
+            if (!TryEnqueueFast(item))
+            {
+                EnqueueSlow(item);
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private bool TryEnqueueFast(T item)
+        {
+            Segment tail = Volatile.Read(ref _tail.Value)!;
+            int index = Interlocked.Increment(ref tail.EnqueueIndex.Value) - 1;
+            if ((uint)index < (uint)tail.States.Length)
+            {
+                // Publish item data before making the slot visible to the consumer.
+                tail.Items[index] = item;
+                Volatile.Write(ref tail.States[index], 1);
+                return true;
+            }
+
+            return false;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        private void EnqueueSlow(T item)
+        {
+            while (true)
+            {
+                Segment tail = Volatile.Read(ref _tail.Value)!;
+                int index = Interlocked.Increment(ref tail.EnqueueIndex.Value) - 1;
+                if ((uint)index < (uint)tail.States.Length)
+                {
+                    tail.Items[index] = item;
+                    Volatile.Write(ref tail.States[index], 1);
+                    return;
+                }
+
+                Segment? next = Volatile.Read(ref tail.Next);
+                if (next is null)
+                {
+                    Segment newSegment = RentUnlinkedSegment();
+                    if (Interlocked.CompareExchange(ref tail.Next, newSegment, null) is null)
+                    {
+                        next = newSegment;
+                    }
+                    else
+                    {
+                        // Another producer linked its own segment first. Reuse ours later.
+                        ReturnUnlinkedSegment(newSegment);
+                        next = Volatile.Read(ref tail.Next);
+                    }
+                }
+
+                if (next is not null)
+                {
+                    Interlocked.CompareExchange(ref _tail.Value, next, tail);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Attempts to dequeue an item. Must only be called by the single consumer thread.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal bool TryDequeue(out T item)
+        {
+            if (TryDequeueFast(out item))
+            {
+                return true;
+            }
+
+            return TryDequeueSlow(out item);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool TryDequeueFromSegment(Segment head, out T item)
+        {
+            int index = head.DequeueIndex;
+            if ((uint)index >= (uint)head.States.Length)
+            {
+                item = default!;
+                return false;
+            }
+
+            // Acquire published slot before reading the item value.
+            if (Volatile.Read(ref head.States[index]) != 1)
+            {
+                item = default!;
+                return false;
+            }
+
+            item = head.Items[index];
+            if (RuntimeHelpers.IsReferenceOrContainsReferences<T>())
+            {
+                head.Items[index] = default!;
+            }
+
+            head.DequeueIndex = index + 1;
+            return true;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private bool TryDequeueFast(out T item)
+        {
+            Segment head = Volatile.Read(ref _head.Value)!;
+            return TryDequeueFromSegment(head, out item);
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        private bool TryDequeueSlow(out T item)
+        {
+            Segment head = Volatile.Read(ref _head.Value)!;
+            while ((uint)head.DequeueIndex >= (uint)head.States.Length)
+            {
+                Segment? next = Volatile.Read(ref head.Next);
+                if (next is null)
+                {
+                    item = default!;
+                    return false;
+                }
+
+                _head.Value = next;
+                head = next;
+            }
+
+            return TryDequeueFromSegment(head, out item);
+        }
+
+        /// <summary>
+        /// Returns whether the queue currently appears empty (snapshot, not linearizable).
+        /// A return value of <see langword="true"/> can also mean an enqueue is mid-flight.
+        /// </summary>
+        internal bool IsEmpty
+        {
+            get
+            {
+                Segment head = Volatile.Read(ref _head.Value)!;
+                while (true)
+                {
+                    int index = head.DequeueIndex;
+                    if ((uint)index >= (uint)head.States.Length)
+                    {
+                        Segment? next = Volatile.Read(ref head.Next);
+                        if (next is null)
+                        {
+                            return true;
+                        }
+
+                        head = next;
+                        continue;
+                    }
+
+                    return Volatile.Read(ref head.States[index]) != 1;
+                }
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private Segment RentUnlinkedSegment()
+        {
+            Segment? segment = Interlocked.Exchange(ref _cachedUnlinkedSegment, null);
+            if (segment is null)
+            {
+                return new Segment(_segmentSize);
+            }
+
+            segment.ResetForReuse();
+            return segment;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void ReturnUnlinkedSegment(Segment segment)
+        {
+            segment.ResetForReuse();
+            Interlocked.CompareExchange(ref _cachedUnlinkedSegment, segment, null);
+        }
+
+        private sealed class Segment
+        {
+            internal readonly T[] Items;
+            internal readonly int[] States;
+            internal PaddedInt32 EnqueueIndex;
+            internal int DequeueIndex;
+            internal Segment? Next;
+
+            internal Segment(int size)
+            {
+                Items = new T[size];
+                States = new int[size];
+                ResetForReuse();
+            }
+
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            internal void ResetForReuse()
+            {
+                EnqueueIndex.Value = 0;
+                DequeueIndex = 0;
+                Next = null;
+                Array.Clear(States);
+                if (RuntimeHelpers.IsReferenceOrContainsReferences<T>())
+                {
+                    Array.Clear(Items);
+                }
+            }
+        }
+
+#if TARGET_ARM64 || TARGET_LOONGARCH64
+        private const int CacheLineWordCount = 16; // 128-byte cache line / sizeof(nint)
+#else
+        private const int CacheLineWordCount = 8; // 64-byte cache line / sizeof(nint)
+#endif
+
+        [InlineArray(CacheLineWordCount - 1)]
+        private struct CacheLinePadding
+        {
+            internal nint _element0;
+        }
+
+        private struct PaddedSegment
+        {
+            internal Segment? Value;
+            internal CacheLinePadding _padding;
+        }
+
+        private struct PaddedInt32
+        {
+            internal int Value;
+            internal CacheLinePadding _padding;
+        }
+    }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.IoUring.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.IoUring.Linux.cs
new file mode 100644
index 00000000000000..80b1dfa07abc78
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.IoUring.Linux.cs
@@ -0,0 +1,2501 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Buffers;
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+    internal sealed partial class SocketAsyncContext
+    {
+        private static long s_ioUringNonPinnablePrepareFallbackCount;
+        private const int MultishotAcceptQueueMaxSize = 256;
+        private const int PersistentMultishotRecvDataQueueMaxSize = 16;
+        private ConcurrentQueue<PreAcceptedConnection>? _multishotAcceptQueue;
+        private int _multishotAcceptArmed; // 0=not armed, 1=armed, 2=arming
+        private ulong _multishotAcceptUserData;
+        private ulong _persistentMultishotRecvUserData; // user_data of armed multishot recv SQE
+        private int _persistentMultishotRecvArmed; // 0=not armed, 1=armed
+        private ConcurrentQueue<BufferedPersistentMultishotRecvData>? _persistentMultishotRecvDataQueue;
+        private BufferedPersistentMultishotRecvData? _persistentMultishotRecvDataHead;
+        private int _persistentMultishotRecvDataHeadOffset;
+        private int _persistentMultishotRecvDataQueueCount;
+        private int _persistentMultishotRecvDataConsumerGate;
+
+        /// <summary>One buffered chunk of early multishot-recv payload awaiting replay to a caller.</summary>
+        private readonly struct BufferedPersistentMultishotRecvData
+        {
+            internal readonly byte[] Data;
+            internal readonly int Length;
+            internal readonly bool UsesPooledBuffer;
+
+            internal BufferedPersistentMultishotRecvData(byte[] data, int length, bool usesPooledBuffer)
+            {
+                Data = data;
+                Length = length;
+                UsesPooledBuffer = usesPooledBuffer;
+            }
+        }
+
+        /// <summary>Holds a pre-accepted connection's fd and socket address from a multishot accept CQE.</summary>
+        private readonly struct PreAcceptedConnection
+        {
+            internal readonly IntPtr FileDescriptor;
+            internal readonly byte[] SocketAddressData;
+            internal readonly int SocketAddressLength;
+            internal readonly bool UsesPooledBuffer;
+
+            internal PreAcceptedConnection(IntPtr fileDescriptor, byte[] socketAddressData, int socketAddressLength, bool usesPooledBuffer)
+            {
+                FileDescriptor = fileDescriptor;
+                SocketAddressData = socketAddressData;
+                SocketAddressLength = socketAddressLength;
+                UsesPooledBuffer = usesPooledBuffer;
+            }
+        }
+
+        /// <summary>Returns whether this context's engine is using io_uring completion mode.</summary>
+        private bool IsIoUringCompletionModeEnabled()
+        {
+            SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
+            return engine is not null && engine.IsIoUringCompletionModeEnabled;
+        }
+
+        /// <summary>Returns the global count of non-pinnable buffer prepare fallbacks for telemetry.</summary>
+        internal static long GetIoUringNonPinnablePrepareFallbackCount() =>
+            Interlocked.Read(ref s_ioUringNonPinnablePrepareFallbackCount);
+
+        /// <summary>Returns whether a multishot accept SQE is currently armed for this context.</summary>
+        internal bool IsMultishotAcceptArmed => Volatile.Read(ref _multishotAcceptArmed) != 0;
+
+        /// <summary>Returns the user_data payload for the armed multishot accept SQE, if any.</summary>
+        internal ulong MultishotAcceptUserData => Volatile.Read(ref _multishotAcceptUserData);
+
+        /// <summary>Clears multishot accept armed-state for this context.</summary>
+        internal void DisarmMultishotAccept()
+        {
+            Volatile.Write(ref _multishotAcceptUserData, 0);
+            Volatile.Write(ref _multishotAcceptArmed, 0);
+        }
+
+        /// <summary>Returns whether a persistent multishot recv SQE is currently armed for this context.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal bool IsPersistentMultishotRecvArmed() =>
+            Volatile.Read(ref _persistentMultishotRecvArmed) != 0;
+
+        /// <summary>Records that a persistent multishot recv SQE has been armed for this context.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal void SetPersistentMultishotRecvArmed(ulong userData)
+        {
+            Volatile.Write(ref _persistentMultishotRecvUserData, userData);
+            Volatile.Write(ref _persistentMultishotRecvArmed, 1);
+        }
+
+        /// <summary>Clears this context's armed persistent multishot recv state.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal void ClearPersistentMultishotRecvArmed()
+        {
+            Volatile.Write(ref _persistentMultishotRecvUserData, 0);
+            Volatile.Write(ref _persistentMultishotRecvArmed, 0);
+        }
+
+        /// <summary>Gets the user_data of the armed persistent multishot recv SQE, or 0 if none is armed.</summary>
+        internal ulong PersistentMultishotRecvUserData =>
+            Volatile.Read(ref _persistentMultishotRecvUserData);
+
+        /// <summary>
+        /// Clears persistent multishot recv armed-state and requests ASYNC_CANCEL for
+        /// the armed user_data when available.
+        /// </summary>
+        internal void RequestPersistentMultishotRecvCancel()
+        {
+            ulong recvUserData = Volatile.Read(ref _persistentMultishotRecvUserData);
+            ClearPersistentMultishotRecvArmed();
+            if (recvUserData != 0)
+            {
+                SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
+                engine?.TryRequestIoUringCancellation(recvUserData);
+            }
+        }
+
+        /// <summary>Copies an early multishot-recv payload into the per-socket replay queue.</summary>
+        internal bool TryBufferEarlyPersistentMultishotRecvData(ReadOnlySpan<byte> payload)
+        {
+            if (payload.Length == 0)
+            {
+                return true;
+            }
+
+            EnsurePersistentMultishotRecvDataQueueInitialized();
+            ConcurrentQueue<BufferedPersistentMultishotRecvData>? queue = _persistentMultishotRecvDataQueue;
+            if (queue is null)
+            {
+                return false;
+            }
+
+            byte[] copy = ArrayPool<byte>.Shared.Rent(payload.Length);
+            payload.CopyTo(copy);
+            // Reserve a slot before enqueuing; back out and return the buffer on overflow.
+            if (Interlocked.Increment(ref _persistentMultishotRecvDataQueueCount) > PersistentMultishotRecvDataQueueMaxSize)
+            {
+                Interlocked.Decrement(ref _persistentMultishotRecvDataQueueCount);
+                ArrayPool<byte>.Shared.Return(copy);
+                return false;
+            }
+
+            queue.Enqueue(new BufferedPersistentMultishotRecvData(copy, payload.Length, usesPooledBuffer: true));
+            return true;
+        }
+
+        /// <summary>Attempts to drain buffered multishot-recv payload into the caller destination.</summary>
+        internal bool TryConsumeBufferedPersistentMultishotRecvData(Memory<byte> destination, out int bytesTransferred)
+        {
+            bytesTransferred = 0;
+            if (destination.Length == 0)
+            {
+                return false;
+            }
+
+            EnterPersistentMultishotRecvDataConsumerGate();
+            try
+            {
+                if (!TryAcquirePersistentMultishotRecvDataHead(out BufferedPersistentMultishotRecvData buffered))
+                {
+                    return false;
+                }
+
+                int headOffset = _persistentMultishotRecvDataHeadOffset;
+                int remaining = buffered.Length - headOffset;
+                if (remaining <= 0)
+                {
+                    ReleasePersistentMultishotRecvDataHead();
+                    return false;
+                }
+
+                int toCopy = Math.Min(destination.Length, remaining);
+                buffered.Data.AsSpan(headOffset, toCopy).CopyTo(destination.Span);
+                _persistentMultishotRecvDataHeadOffset = headOffset + toCopy;
+                bytesTransferred = toCopy;
+
+                if (_persistentMultishotRecvDataHeadOffset >= buffered.Length)
+                {
+                    ReleasePersistentMultishotRecvDataHead();
+                }
+
+                return true;
+            }
+            finally
+            {
+                ExitPersistentMultishotRecvDataConsumerGate();
+            }
+        }
+
+        /// <summary>Ensures the pre-accepted connection queue exists.</summary>
+        private void EnsureMultishotAcceptQueueInitialized()
+        {
+            if (_multishotAcceptQueue is null)
+            {
+                Interlocked.CompareExchange(ref _multishotAcceptQueue, new ConcurrentQueue<PreAcceptedConnection>(), null);
+            }
+        }
+
+        /// <summary>
+        /// Attempts to enqueue a pre-accepted connection from a multishot accept CQE.
+        /// Caller is responsible for closing <paramref name="acceptedFd"/> when this returns false.
+        /// </summary>
+        internal bool TryEnqueuePreAcceptedConnection(IntPtr acceptedFd, ReadOnlySpan<byte> socketAddressData, int socketAddressLen)
+        {
+            EnsureMultishotAcceptQueueInitialized();
+            ConcurrentQueue<PreAcceptedConnection>? queue = _multishotAcceptQueue;
+            if (queue is null || queue.Count >= MultishotAcceptQueueMaxSize)
+            {
+                return false;
+            }
+
+            // Clamp the reported address length to the data actually available.
+            int length = socketAddressLen;
+            if (length < 0)
+            {
+                length = 0;
+            }
+
+            if ((uint)length > (uint)socketAddressData.Length)
+            {
+                length = socketAddressData.Length;
+            }
+
+            byte[] copy;
+            if (length != 0)
+            {
+                copy = ArrayPool<byte>.Shared.Rent(length);
+                socketAddressData.Slice(0, length).CopyTo(copy);
+            }
+            else
+            {
+                copy = Array.Empty<byte>();
+            }
+
+            queue.Enqueue(new PreAcceptedConnection(acceptedFd, copy, length, usesPooledBuffer: length != 0));
+            return true;
+        }
+
+        /// <summary>
+        /// Attempts to dequeue a pre-accepted connection from the multishot accept queue.
+        /// Returns true if a connection was available, populating the operation fields.
+        /// </summary>
+        internal bool TryDequeuePreAcceptedConnection(AcceptOperation operation)
+        {
+            EnsureMultishotAcceptQueueInitialized();
+            ConcurrentQueue<PreAcceptedConnection>? queue = _multishotAcceptQueue;
+            if (queue is null || !queue.TryDequeue(out PreAcceptedConnection accepted))
+            {
+                return false;
+            }
+
+            try
+            {
+                operation.AcceptedFileDescriptor = accepted.FileDescriptor;
+                int socketAddressLen = accepted.SocketAddressLength;
+                if ((uint)socketAddressLen > (uint)operation.SocketAddress.Length)
+                {
+                    socketAddressLen = operation.SocketAddress.Length;
+                }
+
+                if (socketAddressLen != 0)
+                {
+                    accepted.SocketAddressData.AsSpan(0, socketAddressLen).CopyTo(operation.SocketAddress.Span);
+                }
+
+                operation.AcceptSocketAddressLength = socketAddressLen;
+                operation.SocketAddress = operation.SocketAddress.Slice(0, socketAddressLen);
+                operation.ErrorCode = SocketError.Success;
+                return true;
+            }
+            finally
+            {
+                ReturnPooledBufferIfNeeded(accepted.SocketAddressData, accepted.UsesPooledBuffer);
+            }
+        }
+
+        /// <summary>Removes a completed io_uring operation from its queue and signals or dispatches its callback.</summary>
+        internal bool TryCompleteIoUringOperation(AsyncOperation operation)
+        {
+            bool removed =
+                operation is ReadOperation readOperation ? _receiveQueue.TryRemoveCompletedOperation(this, readOperation) :
+                operation is WriteOperation writeOperation ? _sendQueue.TryRemoveCompletedOperation(this, writeOperation) :
+                false;
+            if (!removed)
+            {
+                return false;
+            }
+
+            // A synchronous waiter gets signaled directly; async completions dispatch to the pool.
+            ManualResetEventSlim? e = operation.Event;
+            if (e is not null)
+            {
+                e.Set();
+                return true;
+            }
+
+            operation.CancellationRegistration.Dispose();
+            if (operation.ShouldDispatchCallback)
+            {
+                ThreadPool.UnsafeQueueUserWorkItem(static o => ((AsyncOperation)o!).InvokeCallback(allowPooling: true), operation, preferLocal: false);
+            }
+
+            return true;
+        }
+
+        /// <summary>Enqueues an operation for deferred SQE preparation on the event loop thread.</summary>
+        private bool TryEnqueueIoUringPreparation(AsyncOperation operation, long prepareSequence)
+        {
+            SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
+            return engine is not null && engine.TryEnqueueIoUringPreparation(operation, prepareSequence);
+        }
+
+        /// <summary>Applies cancellation and/or untracking to an operation's io_uring state.</summary>
+        private void HandleIoUringCancellationTransition(
+            AsyncOperation operation,
+            bool requestKernelCancellation,
+            bool untrackAndClear)
+        {
+            SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
+            ulong userData = operation.IoUringUserData;
+            if (userData == 0)
+            {
+                return;
+            }
+
+            if (requestKernelCancellation)
+            {
+                engine?.TryRequestIoUringCancellation(userData);
+            }
+
+            if (untrackAndClear)
+            {
+                bool clearAllowed = engine?.TryUntrackIoUringOperation(userData, operation) ?? true;
+                if (clearAllowed)
+                {
+                    operation.ClearIoUringUserData();
+                }
+            }
+        }
+
+        /// <summary>Requests kernel-level ASYNC_CANCEL for an in-flight operation.</summary>
+        private void TryRequestIoUringCancellation(AsyncOperation operation)
+        {
+            HandleIoUringCancellationTransition(
+                operation,
+                requestKernelCancellation: true,
+                untrackAndClear: false);
+        }
+
+        /// <summary>Removes an operation from the registry and clears its user_data.</summary>
+        internal void TryUntrackIoUringOperation(AsyncOperation operation)
+        {
+            HandleIoUringCancellationTransition(
+                operation,
+                requestKernelCancellation: false,
+                untrackAndClear: true);
+        }
+
+        /// <summary>Stages an operation for io_uring preparation if completion mode is active.</summary>
+        static partial void LinuxTryStageIoUringOperation(AsyncOperation operation)
+        {
+            // Synchronous waiters (Event != null) stay on the readiness path.
+            if (operation.Event is null && operation.AssociatedContext.IsIoUringCompletionModeEnabled())
+            {
+                if (!operation.TryQueueIoUringPreparation())
+                {
+                    operation.EmitReadinessFallbackForQueueOverflow();
+                }
+            }
+        }
+
+        partial void LinuxTryDequeuePreAcceptedConnection(AcceptOperation operation, ref bool dequeued)
+        {
+            dequeued = TryDequeuePreAcceptedConnection(operation);
+        }
+
+        partial void LinuxTryConsumeBufferedPersistentMultishotRecvData(Memory<byte> destination, ref bool consumed, ref int bytesTransferred)
+        {
+            consumed = TryConsumeBufferedPersistentMultishotRecvData(destination, out bytesTransferred);
+        }
+
+        /// <summary>Cleans up multishot-accept state and queued pre-accepted descriptors during abort.</summary>
+        partial void LinuxOnStopAndAbort()
+        {
+            SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
+            if (IsPersistentMultishotRecvArmed())
+            {
+                RequestPersistentMultishotRecvCancel();
+            }
+
+            ulong armedUserData = GetArmedMultishotAcceptUserDataForCancellation();
+            if (engine is not null && armedUserData != 0)
+            {
+                engine.TryRequestIoUringCancellation(armedUserData);
+            }
+
+            DisarmMultishotAccept();
+
+            // Close any accepted fds that were never handed to a caller.
+            if (_multishotAcceptQueue is not null)
+            {
+                while (_multishotAcceptQueue.TryDequeue(out PreAcceptedConnection accepted))
+                {
+                    Interop.Sys.Close(accepted.FileDescriptor);
+                    ReturnPooledBufferIfNeeded(accepted.SocketAddressData, accepted.UsesPooledBuffer);
+                }
+            }
+
+            EnterPersistentMultishotRecvDataConsumerGate();
+            try
+            {
+                ReleasePersistentMultishotRecvDataHead();
+
+                if (_persistentMultishotRecvDataQueue is not null)
+                {
+                    while (_persistentMultishotRecvDataQueue.TryDequeue(out BufferedPersistentMultishotRecvData buffered))
+                    {
+                        Interlocked.Decrement(ref _persistentMultishotRecvDataQueueCount);
+                        ReturnPooledBufferIfNeeded(buffered.Data, buffered.UsesPooledBuffer);
+                    }
+                }
+            }
+            finally
+            {
+                ExitPersistentMultishotRecvDataConsumerGate();
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void EnsurePersistentMultishotRecvDataQueueInitialized()
+        {
+            if (_persistentMultishotRecvDataQueue is null)
+            {
+                Interlocked.CompareExchange(
+                    ref _persistentMultishotRecvDataQueue,
+                    new ConcurrentQueue<BufferedPersistentMultishotRecvData>(),
+                    comparand: null);
+            }
+        }
+
+        /// <summary>Acquires the current replay head, dequeuing a new chunk when none is held.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private bool TryAcquirePersistentMultishotRecvDataHead(out BufferedPersistentMultishotRecvData buffered)
+        {
+            if (_persistentMultishotRecvDataHead is BufferedPersistentMultishotRecvData existingHead)
+            {
+                buffered = existingHead;
+                return true;
+            }
+
+            if (_persistentMultishotRecvDataQueue is null ||
+                !_persistentMultishotRecvDataQueue.TryDequeue(out BufferedPersistentMultishotRecvData dequeued))
+            {
+                buffered = default;
+                return false;
+            }
+
+            _persistentMultishotRecvDataHead = dequeued;
+            _persistentMultishotRecvDataHeadOffset = 0;
+            buffered = dequeued;
+            return true;
+        }
+
+        /// <summary>Releases the replay head chunk, returning its pooled buffer if applicable.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void ReleasePersistentMultishotRecvDataHead()
+        {
+            if (_persistentMultishotRecvDataHead is not BufferedPersistentMultishotRecvData head)
+            {
+                return;
+            }
+
+            _persistentMultishotRecvDataHead = null;
+            _persistentMultishotRecvDataHeadOffset = 0;
+            Interlocked.Decrement(ref _persistentMultishotRecvDataQueueCount);
+            ReturnPooledBufferIfNeeded(head.Data, head.UsesPooledBuffer);
+        }
+
+        /// <summary>Spin-acquires the single-consumer gate guarding replay-head state.</summary>
+        private void EnterPersistentMultishotRecvDataConsumerGate()
+        {
+            SpinWait spinWait = default;
+            while (Interlocked.CompareExchange(
+                ref _persistentMultishotRecvDataConsumerGate,
+                value: 1,
+                comparand: 0) != 0)
+            {
+                spinWait.SpinOnce();
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void ExitPersistentMultishotRecvDataConsumerGate()
+        {
+            Volatile.Write(ref _persistentMultishotRecvDataConsumerGate, 0);
+        }
+
+        /// <summary>Returns a buffer to the shared pool when it was pool-rented.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void ReturnPooledBufferIfNeeded(byte[] buffer, bool usesPooledBuffer)
+        {
+            if (usesPooledBuffer)
+            {
+                ArrayPool<byte>.Shared.Return(buffer);
+            }
+        }
+
+        /// <summary>
+        /// Reads the armed multishot-accept user_data, briefly spinning to let a
+        /// concurrent arming publish it before giving up.
+        /// </summary>
+        private ulong GetArmedMultishotAcceptUserDataForCancellation()
+        {
+            ulong userData = Volatile.Read(ref _multishotAcceptUserData);
+            if (userData != 0 || Volatile.Read(ref _multishotAcceptArmed) == 0)
+            {
+                return userData;
+            }
+
+            SpinWait spinner = default;
+            for (int i = 0; i < 64; i++)
+            {
+                spinner.SpinOnce();
+                userData = Volatile.Read(ref _multishotAcceptUserData);
+                if (userData != 0 || Volatile.Read(ref _multishotAcceptArmed) == 0)
+                {
+                    break;
+                }
+            }
+
+            return userData;
+        }
+
+        internal abstract partial class AsyncOperation
+        {
+            /// <summary>Outcome of processing an io_uring CQE, determining the dispatch action.</summary>
+            internal enum IoUringCompletionResult
+            {
+                Completed = 0,
+                Pending = 1,
+                Canceled = 2,
+                Ignored = 3
+            }
+
+            /// <summary>Tri-state result from direct (managed) SQE preparation.</summary>
+            internal enum IoUringDirectPrepareResult
+            {
+                Unsupported = 0,   // Direct path unavailable for this shape; caller keeps operation pending.
+                Prepared = 1,      // SQE written
+                PrepareFailed = 2  // Direct preparation failed; caller handles retry/fallback without native prepare.
+            }
+
+            /// <summary>Tracks whether a receive operation prepared as one-shot or multishot.</summary>
+            internal enum IoUringReceiveSubmissionMode : byte
+            {
+                None = 0,
+                OneShot = 1,
+                Multishot = 2
+            }
+
+            private long _ioUringPrepareSequence;
+            private int _ioUringPrepareQueued;
+            private int _ioUringPreparationReusable;
+            private MemoryHandle _ioUringPinnedBuffer;
+            private int _ioUringPinnedBufferActive;
+            private int _ioUringCompletionSocketAddressLen;
+            private int _ioUringCompletionControlBufferLen;
+            private int _ioUringReceiveSubmissionMode;
+            private int _ioUringSlotExhaustionRetryCount;
+            internal ulong IoUringUserData;
+
+            /// <summary>Requests kernel cancellation if the flag is set.</summary>
+            partial void LinuxRequestIoUringCancellationIfNeeded(bool requestIoUringCancellation)
+            {
+                if (requestIoUringCancellation)
+                {
+                    AssociatedContext.TryRequestIoUringCancellation(this);
+                }
+            }
+
+            /// <summary>Untracks this operation unless it is in the Canceled state awaiting a terminal CQE.</summary>
+            partial void LinuxUntrackIoUringOperation()
+            {
+                // Canceled operations remain tracked until the terminal CQE arrives so that
+                // pinned/user-owned resources are not released while the kernel may still
+                // reference them. Dispatch will clear resources on that terminal completion.
+                if (_state == State.Canceled)
+                {
+                    return;
+                }
+
+                AssociatedContext.TryUntrackIoUringOperation(this);
+            }
+
+            /// <summary>Resets all io_uring preparation state and advances the prepare sequence.</summary>
+            partial void ResetIoUringState()
+            {
+                ReleaseIoUringPreparationResources();
+                IoUringUserData = 0;
+                Volatile.Write(ref _ioUringPreparationReusable, 0);
+                _ioUringCompletionSocketAddressLen = 0;
+                _ioUringCompletionControlBufferLen = 0;
+                _ioUringReceiveSubmissionMode = (int)IoUringReceiveSubmissionMode.None;
+                _ioUringSlotExhaustionRetryCount = 0;
+                long nextPrepareSequence = unchecked(_ioUringPrepareSequence + 1);
+                // Keep sequence strictly positive so stale queued work from previous resets never matches.
+                if (nextPrepareSequence <= 0)
+                {
+                    nextPrepareSequence = 1;
+                }
+
+                Volatile.Write(ref _ioUringPrepareSequence, nextPrepareSequence);
+                Volatile.Write(ref _ioUringPrepareQueued, 0);
+            }
+
+            /// <summary>Marks this operation as ready for SQE preparation and returns its sequence number.</summary>
+            internal long MarkReadyForIoUringPreparation()
+            {
+                long prepareSequence = Volatile.Read(ref _ioUringPrepareSequence);
+                Debug.Assert(prepareSequence > 0);
+                Volatile.Write(ref _ioUringPrepareQueued, 1);
+                return prepareSequence;
+            }
+
+            /// <summary>Cancels a pending preparation if the sequence number still matches.</summary>
+            internal void CancelPendingIoUringPreparation(long prepareSequence)
+            {
+                if (Volatile.Read(ref _ioUringPrepareSequence) == prepareSequence)
+                {
+                    Volatile.Write(ref _ioUringPrepareQueued, 0);
+                }
+            }
+
+            /// <summary>Attempts to prepare an SQE for this operation via the managed direct path.</summary>
+            internal bool TryPrepareIoUring(SocketAsyncContext context, long prepareSequence)
+            {
+                // Bail out when the staging is stale, already consumed, or the operation left Waiting.
+                if (prepareSequence <= 0 ||
+                    Volatile.Read(ref _ioUringPrepareSequence) != prepareSequence ||
+                    Interlocked.Exchange(ref _ioUringPrepareQueued, 0) == 0 ||
+                    _state != State.Waiting)
+                {
+                    return false;
+                }
+
+                // Release stale pinned resources unless the previous preparation was marked reusable.
+                if (Interlocked.Exchange(ref _ioUringPreparationReusable, 0) == 0)
+                {
+                    ReleaseIoUringPreparationResources();
+                }
+
+                SocketAsyncEngine? engine = Volatile.Read(ref context._asyncEngine);
+                if (engine is null || !engine.IsIoUringDirectSqeEnabled)
+                {
+                    // Managed completion mode assumes direct SQE submission.
+                    // If direct submission is unavailable, keep operation pending for fallback handling.
+                    ErrorCode = SocketError.Success;
+                    IoUringUserData = 0;
+                    return false;
+                }
+
+                IoUringDirectPrepareResult directResult = IoUringPrepareDirect(context, engine, out ulong directUserData);
+                if (directResult == IoUringDirectPrepareResult.Prepared)
+                {
+                    _ioUringSlotExhaustionRetryCount = 0;
+                    IoUringUserData = ErrorCode == SocketError.Success ? directUserData : 0;
+                    return true;
+                }
+
+                if (directResult == IoUringDirectPrepareResult.PrepareFailed)
+                {
+                    IoUringUserData = 0;
+                    return false;
+                }
+
+                // Direct preparation unsupported for this operation shape.
+                // Leave operation pending so caller can use completion-path fallback semantics.
+                ErrorCode = SocketError.Success;
+                IoUringUserData = 0;
+                return false;
+            }
+
+            /// <summary>Queues this operation for deferred preparation on the event loop thread.</summary>
+            internal bool TryQueueIoUringPreparation()
+            {
+                if (!AssociatedContext.IsIoUringCompletionModeEnabled())
+                {
+                    return false;
+                }
+
+                long prepareSequence = MarkReadyForIoUringPreparation();
+                if (AssociatedContext.TryEnqueueIoUringPreparation(this, prepareSequence))
+                {
+                    return true;
+                }
+
+                // Undo the staging mark so a later reset does not observe a phantom queue entry.
+                CancelPendingIoUringPreparation(prepareSequence);
+                return false;
+            }
+
+            /// <summary>Returns whether this operation is currently in the waiting state.</summary>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            internal bool IsInWaitingState() => _state == State.Waiting;
+
+            /// <summary>Increments and returns the slot-exhaustion retry count for this operation.</summary>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            internal int IncrementIoUringSlotExhaustionRetryCount() => ++_ioUringSlotExhaustionRetryCount;
+
+            /// <summary>Resets slot-exhaustion retry tracking for this operation.</summary>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            internal void ResetIoUringSlotExhaustionRetryCount() => _ioUringSlotExhaustionRetryCount = 0;
+
+            /// <summary>
+            /// Emits a readiness fallback event when io_uring prepare-queue staging fails.
+            /// </summary>
+            internal void EmitReadinessFallbackForQueueOverflow()
+            {
+                Interop.Sys.SocketEvents fallbackEvents = GetIoUringFallbackSocketEvents();
+                if (fallbackEvents == Interop.Sys.SocketEvents.None)
+                {
+                    return;
+                }
+
+                SocketAsyncContext context = AssociatedContext;
+                SocketAsyncEngine? engine = Volatile.Read(ref context._asyncEngine);
+                if (engine is null)
+                {
+                    return;
+                }
+
+                engine.EnqueueReadinessFallbackEvent(
+                    context,
+                    fallbackEvents,
+                    countAsPrepareQueueOverflowFallback: true);
+            }
+
+            /// <summary>Processes a CQE result and returns the dispatch action for the completion handler.</summary>
+            internal IoUringCompletionResult ProcessIoUringCompletionResult(int result, uint flags, uint auxiliaryData)
+            {
+                Trace($"Enter, result={result}, flags={flags}, auxiliaryData={auxiliaryData}");
+
+                // Claim ownership of completion processing; if cancellation already won, do not publish completion.
+                State oldState = Interlocked.CompareExchange(ref _state, State.Running, State.Waiting);
+                if (oldState == State.Canceled)
+                {
+                    Trace("Exit, previously canceled");
+                    return IoUringCompletionResult.Canceled;
+                }
+
+                if (oldState != State.Waiting)
+                {
+                    Trace("Exit, ignored");
+                    return IoUringCompletionResult.Ignored;
+                }
+
+                if (ProcessIoUringCompletion(AssociatedContext, result, flags, auxiliaryData))
+                {
+                    _state = State.Complete;
+                    Trace("Exit, completed");
+                    return IoUringCompletionResult.Completed;
+                }
+
+                // Incomplete path (e.g. transient retry): mirror TryComplete state transition handling.
+                State newState;
+                while (true)
+                {
+                    State state = _state;
+                    Debug.Assert(state is State.Running or State.RunningWithPendingCancellation, $"Unexpected operation state: {(State)state}");
+
+                    newState = (state == State.Running ? State.Waiting : State.Canceled);
+                    if (state == Interlocked.CompareExchange(ref _state, newState, state))
+                    {
+                        break;
+                    }
+                }
+
+                if (newState == State.Canceled)
+                {
+                    ProcessCancellation();
+                    Trace("Exit, canceled while pending");
+                    return IoUringCompletionResult.Canceled;
+                }
+
+                Trace("Exit, pending");
+                return IoUringCompletionResult.Pending;
+            }
+
+            /// <summary>Stores recvmsg output lengths from the CQE for post-completion processing.</summary>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            internal void SetIoUringCompletionMessageMetadata(int socketAddressLen, int controlBufferLen)
+            {
+                _ioUringCompletionSocketAddressLen = socketAddressLen;
+                _ioUringCompletionControlBufferLen = controlBufferLen;
+            }
+
+            /// <summary>Releases preparation resources and resets the user_data to zero.</summary>
+            internal void ClearIoUringUserData()
+            {
+                ReleaseIoUringPreparationResources();
+                IoUringUserData = 0;
+                Volatile.Write(ref _ioUringPreparationReusable, 0);
+                _ioUringCompletionSocketAddressLen = 0;
+                _ioUringCompletionControlBufferLen = 0;
+                _ioUringReceiveSubmissionMode = (int)IoUringReceiveSubmissionMode.None;
+                _ioUringSlotExhaustionRetryCount = 0;
+            }
+
+            /// <summary>Clears user_data without releasing preparation resources for pending requeue.</summary>
+            internal void ResetIoUringUserDataForRequeue()
+            {
+                IoUringUserData = 0;
+                _ioUringCompletionSocketAddressLen = 0;
+                _ioUringCompletionControlBufferLen = 0;
+            }
+
+            /// <summary>Records whether the current receive preparation uses one-shot or multishot mode.</summary>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            protected void SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode mode)
+            {
+                Volatile.Write(ref _ioUringReceiveSubmissionMode, (int)mode);
+            }
+
+            /// <summary>Marks preparation resources as reusable so the next prepare skips re-pinning.</summary>
+            internal void MarkIoUringPreparationReusable()
+            {
+                Volatile.Write(ref _ioUringPreparationReusable, 1);
+            }
+
+            /// <summary>Socket address length reported by the kernel in the CQE.</summary>
+            protected int IoUringCompletionSocketAddressLen => _ioUringCompletionSocketAddressLen;
+            /// <summary>Control buffer length reported by the kernel in the CQE.</summary>
+            protected int IoUringCompletionControlBufferLen => _ioUringCompletionControlBufferLen;
+
+            /// <summary>Pins a buffer and returns the raw pointer, recording the handle for later release.</summary>
+            protected unsafe byte* PinIoUringBuffer(Memory<byte> buffer)
+            {
+                ReleasePinnedIoUringBuffer();
+                if (buffer.Length == 0)
+                {
+                    return null;
+                }
+
+                _ioUringPinnedBuffer = buffer.Pin();
+                Volatile.Write(ref _ioUringPinnedBufferActive, 1);
+                return (byte*)_ioUringPinnedBuffer.Pointer;
+            }
+
+            /// <summary>Attempts to pin a buffer, falling back to the readiness path if not pinnable.</summary>
+ protected unsafe bool TryPinIoUringBuffer(Memory<byte> buffer, out byte* pinnedBuffer)
+ {
+ // Fast path: a previous prepare already pinned this operation's buffer; reuse the pin.
+ if (Volatile.Read(ref _ioUringPinnedBufferActive) != 0)
+ {
+ pinnedBuffer = (byte*)_ioUringPinnedBuffer.Pointer;
+ if (buffer.Length > 0 && pinnedBuffer is null)
+ {
+ // A non-empty buffer with a null pin pointer cannot be handed to the kernel;
+ // release the stale pin and signal a non-terminal fallback.
+ ReleasePinnedIoUringBuffer();
+ RecordIoUringNonPinnablePrepareFallback("null-reused-pin-pointer", buffer.Length);
+ ErrorCode = SocketError.Success;
+ return false;
+ }
+
+ return true;
+ }
+
+ try
+ {
+ pinnedBuffer = PinIoUringBuffer(buffer);
+ if (buffer.Length > 0 && pinnedBuffer is null)
+ {
+ ReleasePinnedIoUringBuffer();
+ RecordIoUringNonPinnablePrepareFallback("null-pin-pointer", buffer.Length);
+ ErrorCode = SocketError.Success;
+ return false;
+ }
+
+ return true;
+ }
+ catch (NotSupportedException)
+ {
+ // Memory<byte> backed by a non-pinnable owner; ErrorCode=Success marks this as
+ // a recoverable "use the readiness path instead" outcome, not an operation failure.
+ pinnedBuffer = null;
+ RecordIoUringNonPinnablePrepareFallback("pin-not-supported", buffer.Length);
+ ErrorCode = SocketError.Success;
+ return false;
+ }
+ }
+
+ /// <summary>Transfers ownership of the active pinned buffer to the caller.</summary>
+ internal MemoryHandle TransferPinnedBuffer()
+ {
+ // Atomically claim the active flag so only one party ever disposes the handle.
+ if (Interlocked.Exchange(ref _ioUringPinnedBufferActive, 0) == 0)
+ {
+ return default;
+ }
+
+ MemoryHandle pinnedBuffer = _ioUringPinnedBuffer;
+ _ioUringPinnedBuffer = default;
+ return pinnedBuffer;
+ }
+
+ /// <summary>
+ /// Attempts to pin a socket address buffer, reusing an existing pin when possible.
+ /// Caller is responsible for setting operation ErrorCode on failure if needed.
+ /// </summary>
+ protected static unsafe bool TryPinIoUringSocketAddress(
+ Memory<byte> socketAddress,
+ ref MemoryHandle pinnedSocketAddress,
+ ref int pinnedSocketAddressActive,
+ out byte* rawSocketAddress)
+ {
+ rawSocketAddress = null;
+ if (socketAddress.Length == 0)
+ {
+ // No address to pin (connected-socket case): success with a null pointer.
+ return true;
+ }
+
+ if (Volatile.Read(ref pinnedSocketAddressActive) != 0)
+ {
+ rawSocketAddress = (byte*)pinnedSocketAddress.Pointer;
+ if (rawSocketAddress is null)
+ {
+ // Stale/unusable reused pin: tear it down before reporting failure.
+ pinnedSocketAddress.Dispose();
+ pinnedSocketAddress = default;
+ Volatile.Write(ref pinnedSocketAddressActive, 0);
+ return false;
+ }
+
+ return true;
+ }
+
+ try
+ {
+ pinnedSocketAddress = socketAddress.Pin();
+ Volatile.Write(ref pinnedSocketAddressActive, 1);
+ }
+ catch (NotSupportedException)
+ {
+ // Non-pinnable backing memory; caller decides how to surface this.
+ rawSocketAddress = null;
+ return false;
+ }
+
+ rawSocketAddress = (byte*)pinnedSocketAddress.Pointer;
+ if (rawSocketAddress is null)
+ {
+ pinnedSocketAddress.Dispose();
+ pinnedSocketAddress = default;
+ Volatile.Write(ref pinnedSocketAddressActive, 0);
+ return false;
+ }
+
+ return true;
+ }
+
+ /// <summary>
+ /// Pins a socket address buffer and normalizes pinning failures to a non-terminal fallback signal.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ protected unsafe bool TryPinIoUringSocketAddressForPrepare(
+ Memory<byte> socketAddress,
+ ref MemoryHandle pinnedSocketAddress,
+ ref int pinnedSocketAddressActive,
+ out byte* rawSocketAddress)
+ {
+ if (TryPinIoUringSocketAddress(
+ socketAddress,
+ ref pinnedSocketAddress,
+ ref pinnedSocketAddressActive,
+ out rawSocketAddress))
+ {
+ return true;
+ }
+
+ // ErrorCode=Success means "prepare failed but the operation itself has not errored";
+ // the caller falls back to the readiness-based path.
+ ErrorCode = SocketError.Success;
+ return false;
+ }
+
+ /// <summary>Releases an operation-owned pinned socket-address buffer and message-header allocation.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ protected static unsafe void ReleaseIoUringSocketAddressAndMessageHeader(
+ ref MemoryHandle pinnedSocketAddress,
+ ref int pinnedSocketAddressActive,
+ ref IntPtr messageHeader)
+ {
+ // Exchange makes the release idempotent: only the winner disposes the handle.
+ if (Interlocked.Exchange(ref pinnedSocketAddressActive, 0) != 0)
+ {
+ pinnedSocketAddress.Dispose();
+ pinnedSocketAddress = default;
+ }
+
+ // Same pattern for the natively allocated msghdr: claim the pointer, then free once.
+ IntPtr header = Interlocked.Exchange(ref messageHeader, IntPtr.Zero);
+ if (header != IntPtr.Zero)
+ {
+ NativeMemory.Free((void*)header);
+ }
+ }
+
+ /// <summary>Records a telemetry counter for a non-pinnable buffer fallback.</summary>
+ private void RecordIoUringNonPinnablePrepareFallback(string reason, int bufferLength)
+ {
+ if (!AssociatedContext.IsIoUringCompletionModeEnabled())
+ {
+ return;
+ }
+
+ long count = Interlocked.Increment(ref s_ioUringNonPinnablePrepareFallbackCount);
+ // Sample the log: emit only on the 1st, 65th, 129th, ... occurrence to bound event volume.
+ if (NetEventSource.Log.IsEnabled() && (count & 0x3F) == 1)
+ {
+ LogIoUringNonPinnablePrepareFallback(reason, bufferLength, count);
+ }
+
+ // NoInlining keeps the string-interpolation cold path out of the common fast path.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ void LogIoUringNonPinnablePrepareFallback(string fallbackReason, int fallbackBufferLength, long fallbackCount)
+ {
+ NetEventSource.Info(
+ AssociatedContext,
+ $"io_uring prepare fallback due to non-pinnable buffer: reason={fallbackReason}, length={fallbackBufferLength}, count={fallbackCount}");
+ }
+ }
+
+ /// <summary>Releases the currently pinned buffer handle if active.</summary>
+ private void ReleasePinnedIoUringBuffer()
+ {
+ // Exchange-then-dispose so concurrent releasers cannot double-dispose the handle.
+ if (Interlocked.Exchange(ref _ioUringPinnedBufferActive, 0) != 0)
+ {
+ _ioUringPinnedBuffer.Dispose();
+ _ioUringPinnedBuffer = default;
+ }
+ }
+
+ /// <summary>Releases the pinned buffer when the operation shape (single vs list) changes.</summary>
+ protected void ReleaseIoUringPinnedBufferForShapeTransition() =>
+ ReleasePinnedIoUringBuffer();
+
+ /// <summary>Releases all preparation resources including the pinned buffer and subclass resources.</summary>
+ private void ReleaseIoUringPreparationResources()
+ {
+ ReleasePinnedIoUringBuffer();
+ ReleaseIoUringPreparationResourcesCore();
+ }
+
+ /// <summary>Subclass hook to release operation-specific preparation resources.</summary>
+ protected virtual void ReleaseIoUringPreparationResourcesCore()
+ {
+ }
+
+ /// <summary>Frees a set of GCHandles used for buffer list pinning.</summary>
+ protected static void ReleasePinnedHandles(GCHandle[] pinnedHandles, int count)
+ {
+ if (count <= 0)
+ {
+ return;
+ }
+
+ // Clamp defensively in case the recorded count exceeds the (pooled) array length.
+ int releaseCount = count < pinnedHandles.Length ? count : pinnedHandles.Length;
+ for (int i = 0; i < releaseCount; i++)
+ {
+ if (pinnedHandles[i].IsAllocated)
+ {
+ pinnedHandles[i].Free();
+ }
+ }
+ }
+
+ /// <summary>Rents an array from the shared pool for temporary io_uring preparation use.</summary>
+ private static T[] RentIoUringArray<T>(int minimumLength) =>
+ minimumLength == 0 ? Array.Empty<T>() : ArrayPool<T>.Shared.Rent(minimumLength);
+
+ /// <summary>Returns a rented array to the shared pool.</summary>
+ private static void ReturnIoUringArray<T>(T[] array, bool clearArray = false)
+ {
+ // Array.Empty<T>() placeholders were never rented; returning them would pollute the pool.
+ if (array.Length != 0)
+ {
+ ArrayPool<T>.Shared.Return(array, clearArray);
+ }
+ }
+
+ /// <summary>Releases pinned handles and returns the iovec array to the pool.</summary>
+ protected static void ReleaseIoUringPinnedHandlesAndIovecs(
+ ref GCHandle[]? pinnedHandles,
+ ref Interop.Sys.IOVector[]? iovecs,
+ ref int pinnedHandleCount)
+ {
+ // Claim each field atomically so a concurrent release frees every resource exactly once.
+ GCHandle[]? handles = Interlocked.Exchange(ref pinnedHandles, null);
+ int handleCount = Interlocked.Exchange(ref pinnedHandleCount, 0);
+ if (handles is not null)
+ {
+ ReleasePinnedHandles(handles, handleCount);
+ ReturnIoUringArray(handles);
+ }
+
+ Interop.Sys.IOVector[]? vectors = Interlocked.Exchange(ref iovecs, null);
+ if (vectors is not null)
+ {
+ ReturnIoUringArray(vectors);
+ }
+ }
+
+ /// <summary>Pins a list of buffer segments and builds an iovec array for scatter/gather I/O.</summary>
+ protected static unsafe bool TryPinBufferListForIoUring(
+ IList<ArraySegment<byte>> buffers,
+ int startIndex,
+ int startOffset,
+ out GCHandle[] pinnedHandles,
+ out Interop.Sys.IOVector[] iovecs,
+ out int iovCount,
+ out int pinnedHandleCount,
+ out SocketError errorCode)
+ {
+ iovCount = 0;
+ pinnedHandleCount = 0;
+ // Unsigned compare also rejects negative startIndex.
+ if ((uint)startIndex > (uint)buffers.Count)
+ {
+ errorCode = SocketError.InvalidArgument;
+ pinnedHandles = Array.Empty<GCHandle>();
+ iovecs = Array.Empty<Interop.Sys.IOVector>();
+ return false;
+ }
+
+ int remainingBufferCount = buffers.Count - startIndex;
+ pinnedHandles = RentIoUringArray<GCHandle>(remainingBufferCount);
+ iovecs = RentIoUringArray<Interop.Sys.IOVector>(remainingBufferCount);
+
+ // startOffset applies only to the first segment; subsequent iterations reset it to 0.
+ int currentOffset = startOffset;
+ byte[]? lastPinnedArray = null;
+ GCHandle lastPinnedHandle = default;
+ try
+ {
+ for (int i = 0; i < remainingBufferCount; i++, currentOffset = 0)
+ {
+ ArraySegment<byte> buffer = buffers[startIndex + i];
+ RangeValidationHelpers.ValidateSegment(buffer);
+
+ if ((uint)currentOffset > (uint)buffer.Count)
+ {
+ ReleasePinnedHandles(pinnedHandles, pinnedHandleCount);
+ ReturnIoUringArray(pinnedHandles);
+ ReturnIoUringArray(iovecs);
+ errorCode = SocketError.InvalidArgument;
+ return false;
+ }
+
+ int bufferCount = buffer.Count - currentOffset;
+ byte* basePtr = null;
+ if (bufferCount != 0)
+ {
+ byte[] array = buffer.Array!;
+ GCHandle handle;
+ // Consecutive segments over the same backing array share one pinned handle.
+ if (ReferenceEquals(array, lastPinnedArray))
+ {
+ handle = lastPinnedHandle;
+ }
+ else
+ {
+ handle = GCHandle.Alloc(array, GCHandleType.Pinned);
+ pinnedHandles[pinnedHandleCount] = handle;
+ pinnedHandleCount++;
+ lastPinnedArray = array;
+ lastPinnedHandle = handle;
+ }
+
+ basePtr = &((byte*)handle.AddrOfPinnedObject())[buffer.Offset + currentOffset];
+ }
+
+ // Zero-length segments still get an iovec entry (with a null base) so that
+ // iovec indices stay aligned with the segment list.
+ iovecs[i].Base = basePtr;
+ iovecs[i].Count = (UIntPtr)bufferCount;
+ iovCount++;
+ }
+ }
+ catch
+ {
+ // Unwind fully on any failure (e.g. GCHandle allocation) before rethrowing.
+ ReleasePinnedHandles(pinnedHandles, pinnedHandleCount);
+ ReturnIoUringArray(pinnedHandles);
+ ReturnIoUringArray(iovecs);
+ throw;
+ }
+
+ errorCode = SocketError.Success;
+ return true;
+ }
+
+ /// <summary>Prepares an SQE via the managed direct path. Override in subclasses for direct submission.</summary>
+ // Base implementation reports Unsupported so the engine uses the readiness-based path.
+ protected virtual IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ return IoUringDirectPrepareResult.Unsupported;
+ }
+
+ /// <summary>Routes a CQE to the success or error handler based on the result sign.</summary>
+ // Returns true when the operation is complete; false when it should be re-submitted.
+ protected virtual bool ProcessIoUringCompletion(SocketAsyncContext context, int result, uint flags, uint auxiliaryData)
+ {
+ return result >= 0 ?
+ ProcessIoUringCompletionSuccess(context, result, flags, auxiliaryData) :
+ ProcessIoUringCompletionError(context, result, flags, auxiliaryData);
+ }
+
+ /// <summary>Processes a successful (non-negative) io_uring completion result.</summary>
+ protected virtual bool ProcessIoUringCompletionSuccess(SocketAsyncContext context, int result, uint flags, uint auxiliaryData)
+ {
+ Debug.Assert(result >= 0, $"Expected non-negative io_uring result, got {result}");
+ ErrorCode = SocketError.Success;
+ return true;
+ }
+
+ /// <summary>Processes a failed (negative) io_uring completion result.</summary>
+ // Base behavior: map the raw -errno to a SocketError and treat the operation as complete.
+ protected virtual bool ProcessIoUringCompletionError(SocketAsyncContext context, int result, uint flags, uint auxiliaryData)
+ {
+ Debug.Assert(result < 0, $"Expected negative io_uring result, got {result}");
+ ErrorCode = SocketPal.GetSocketErrorForErrorCode(GetIoUringPalError(result));
+ return true;
+ }
+
+ /// <summary>Whether preparation resources should be preserved when the operation is requeued.</summary>
+ internal virtual bool ShouldReuseIoUringPreparationResourcesOnPending => false;
+
+ /// <summary>Returns whether the negative result represents EAGAIN/EWOULDBLOCK.</summary>
+ protected static bool IsIoUringRetryableError(int result)
+ {
+ if (result >= 0)
+ {
+ return false;
+ }
+
+ Interop.Error error = GetIoUringPalError(result);
+ return error == Interop.Error.EAGAIN || error == Interop.Error.EWOULDBLOCK;
+ }
+
+ /// <summary>Converts a negative io_uring result to a SocketError, returning false for retryable errors.</summary>
+ // false => EAGAIN/EWOULDBLOCK: leave errorCode as Success and let the caller retry/requeue.
+ protected static bool ProcessIoUringErrorResult(int result, out SocketError errorCode)
+ {
+ Debug.Assert(result < 0, $"Expected negative io_uring result, got {result}");
+
+ if (IsIoUringRetryableError(result))
+ {
+ errorCode = SocketError.Success;
+ return false;
+ }
+
+ errorCode = SocketPal.GetSocketErrorForErrorCode(GetIoUringPalError(result));
+ return true;
+ }
+
+ /// <summary>Converts a negative io_uring CQE result (raw -errno) to PAL error space.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ protected static Interop.Error GetIoUringPalError(int result)
+ {
+ Debug.Assert(result < 0, $"Expected negative io_uring result, got {result}");
+ // io_uring CQEs report failures as the negated platform errno.
+ int platformErrno = -result;
+ return Interop.Sys.ConvertErrorPlatformToPal(platformErrno);
+ }
+
+ /// <summary>Returns the epoll event mask to use when falling back from io_uring to readiness notification.</summary>
+ internal virtual Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() =>
+ Interop.Sys.SocketEvents.None;
+
+ /// <summary>
+ /// Copies payload bytes from a provided-buffer ring selection into the operation's target memory.
+ /// Returns false when this operation shape does not support provided-buffer payload materialization.
+ /// + internal virtual unsafe bool TryProcessIoUringProvidedBufferCompletion( + byte* providedBuffer, + int providedBufferLength, + int bytesTransferred, + ref uint auxiliaryData) + { + _ = providedBuffer; + _ = providedBufferLength; + _ = bytesTransferred; + _ = auxiliaryData; + return false; + } + } + + internal abstract partial class ReadOperation + { + /// + protected override bool ProcessIoUringCompletionError(SocketAsyncContext context, int result, uint flags, uint auxiliaryData) => + ProcessIoUringErrorResult(result, out ErrorCode); + + /// + // Retained only for defensive fallback paths; regular completion mode avoids readiness fallback. + internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() => + Interop.Sys.SocketEvents.Read; + } + + private abstract partial class WriteOperation + { + /// + protected override bool ProcessIoUringCompletionError(SocketAsyncContext context, int result, uint flags, uint auxiliaryData) => + ProcessIoUringErrorResult(result, out ErrorCode); + + /// + // Retained only for defensive fallback paths; regular completion mode avoids readiness fallback. 
+ internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() => + Interop.Sys.SocketEvents.Write; + } + + private abstract partial class SendOperation + { + /// + protected override bool ProcessIoUringCompletionSuccess(SocketAsyncContext context, int result, uint flags, uint auxiliaryData) + { + if (result == 0) + { + ErrorCode = SocketError.Success; + return true; + } + + Debug.Assert(result > 0, $"Expected positive io_uring send completion size, got {result}"); + Debug.Assert(result <= Count, $"Unexpected io_uring send completion size: result={result}, count={Count}"); + + int sent = Math.Min(result, Count); + BytesTransferred += sent; + Offset += sent; + Count -= sent; + ErrorCode = SocketError.Success; + return Count == 0; + } + } + + private partial class BufferMemorySendOperation + { + private IntPtr _ioUringMessageHeader; + private MemoryHandle _ioUringPinnedSocketAddress; + private int _ioUringPinnedSocketAddressActive; + + /// + internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true; + + /// + protected override unsafe void ReleaseIoUringPreparationResourcesCore() + { + ReleaseIoUringSocketAddressAndMessageHeader( + ref _ioUringPinnedSocketAddress, + ref _ioUringPinnedSocketAddressActive, + ref _ioUringMessageHeader); + } + + /// Gets a message header buffer and sets the common sendmsg fields. 
+ private unsafe Interop.Sys.MessageHeader* GetOrCreateIoUringSendMessageHeader(byte* rawSocketAddress)
+ {
+ // The msghdr lives in native memory (cached in _ioUringMessageHeader) so its address
+ // remains stable while the SQE is in flight; freed in ReleaseIoUringPreparationResourcesCore.
+ Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader;
+ if (messageHeader is null)
+ {
+ messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader));
+ _ioUringMessageHeader = (IntPtr)messageHeader;
+ }
+
+ // (Re)initialize every field on each use; the allocation may be reused across prepares.
+ messageHeader->SocketAddress = rawSocketAddress;
+ messageHeader->SocketAddressLen = SocketAddress.Length;
+ messageHeader->ControlBuffer = null;
+ messageHeader->ControlBufferLen = 0;
+ messageHeader->Flags = SocketFlags.None;
+ return messageHeader;
+ }
+
+ /// <summary>Configures a message header with zero or one iovec entry.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe void ConfigureSingleIov(
+ Interop.Sys.MessageHeader* messageHeader,
+ byte* rawBuffer,
+ int bufferLength,
+ Interop.Sys.IOVector* iov)
+ {
+ // Zero-length sends carry no iovec at all.
+ if (bufferLength == 0)
+ {
+ messageHeader->IOVectors = null;
+ messageHeader->IOVectorCount = 0;
+ return;
+ }
+
+ iov->Base = rawBuffer;
+ iov->Count = (UIntPtr)bufferLength;
+ messageHeader->IOVectors = iov;
+ messageHeader->IOVectorCount = 1;
+ }
+
+ /// <summary>Builds a connected send or sendmsg preparation request.</summary>
+ private unsafe IoUringDirectPrepareResult IoUringPrepareDirectSendMessage( + SocketAsyncContext context, + SocketAsyncEngine engine, + out ulong userData) + { + userData = 0; + if (!TryPinIoUringSocketAddressForPrepare( + SocketAddress, + ref _ioUringPinnedSocketAddress, + ref _ioUringPinnedSocketAddressActive, + out byte* rawSocketAddress)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + if (rawBuffer is not null) + { + rawBuffer += Offset; + } + + Interop.Sys.MessageHeader* messageHeader = GetOrCreateIoUringSendMessageHeader(rawSocketAddress); + Interop.Sys.IOVector sendIov; + ConfigureSingleIov(messageHeader, rawBuffer, Count, &sendIov); + + IoUringDirectPrepareResult sendMessagePrepareResult = engine.TryPrepareIoUringDirectSendMessageWithZeroCopyFallback( + context._socket, + messageHeader, + Count, + Flags, + out userData, + out SocketError sendMessageErrorCode); + ErrorCode = sendMessageErrorCode; + return sendMessagePrepareResult; + } + + /// + protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect( + SocketAsyncContext context, + SocketAsyncEngine engine, + out ulong userData) + { + userData = 0; + if (SocketAddress.Length == 0) + { + if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + if (rawBuffer is not null) + { + rawBuffer += Offset; + } + + IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectSendWithZeroCopyFallback( + context._socket, + rawBuffer, + Count, + Flags, + out bool usedZeroCopy, + out userData, + out SocketError errorCode); + ErrorCode = errorCode; + if (usedZeroCopy && prepareResult == IoUringDirectPrepareResult.Prepared) + { + engine.TransferIoUringZeroCopyPinHold(userData, TransferPinnedBuffer()); + } + + return prepareResult; + } + + return IoUringPrepareDirectSendMessage(context, engine, out 
userData); + } + } + + private sealed partial class BufferListSendOperation + { + private GCHandle[]? _ioUringPinnedBufferHandles; + private Interop.Sys.IOVector[]? _ioUringIovecs; + private int _ioUringPinnedHandleCount; + private int _ioUringPreparedBufferCount = -1; + private int _ioUringPreparedStartIndex = -1; + private int _ioUringPreparedStartOffset = -1; + private int _ioUringPreparedIovCount; + + /// + internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true; + + /// + protected override void ReleaseIoUringPreparationResourcesCore() + { + ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount); + _ioUringPreparedBufferCount = -1; + _ioUringPreparedStartIndex = -1; + _ioUringPreparedStartOffset = -1; + _ioUringPreparedIovCount = 0; + } + + /// Pins buffer segments starting at BufferIndex/Offset and builds the iovec array. + private bool TryPinIoUringBuffers( + IList> buffers, + int startIndex, + int startOffset, + out int iovCount) + { + if (_ioUringPinnedBufferHandles is not null && + _ioUringIovecs is not null && + _ioUringPreparedBufferCount == buffers.Count && + _ioUringPreparedStartIndex == startIndex && + _ioUringPreparedStartOffset == startOffset && + _ioUringPreparedIovCount <= _ioUringIovecs.Length) + { + iovCount = _ioUringPreparedIovCount; + return true; + } + + // Release any existing pinned handles and rented arrays before creating new ones. + // This handles the partial-send case where BufferIndex/Offset advanced, causing the + // reuse check above to fail while old resources are still held. 
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount); + + if (!TryPinBufferListForIoUring( + buffers, + startIndex, + startOffset, + out GCHandle[] pinnedHandles, + out Interop.Sys.IOVector[] iovecs, + out iovCount, + out int pinnedHandleCount, + out SocketError errorCode)) + { + ErrorCode = errorCode; + return false; + } + + _ioUringPinnedBufferHandles = pinnedHandles; + _ioUringIovecs = iovecs; + _ioUringPinnedHandleCount = pinnedHandleCount; + _ioUringPreparedBufferCount = buffers.Count; + _ioUringPreparedStartIndex = startIndex; + _ioUringPreparedStartOffset = startOffset; + _ioUringPreparedIovCount = iovCount; + return true; + } + + /// Advances the buffer position after a partial send, returning true when all data is sent. + private bool AdvanceSendBufferPosition(int bytesSent) + { + IList>? buffers = Buffers; + if (buffers is null || bytesSent <= 0) + { + return buffers is null || BufferIndex >= buffers.Count; + } + + int remaining = bytesSent; + int index = BufferIndex; + int offset = Offset; + + while (remaining > 0 && index < buffers.Count) + { + int available = buffers[index].Count - offset; + Debug.Assert(available >= 0, "Unexpected negative buffer availability during io_uring send completion."); + + if (available > remaining) + { + offset += remaining; + break; + } + + remaining -= Math.Max(available, 0); + index++; + offset = 0; + } + + BufferIndex = index; + Offset = offset; + return index >= buffers.Count; + } + + /// + protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect( + SocketAsyncContext context, + SocketAsyncEngine engine, + out ulong userData) + { + userData = 0; + if (context.IsPersistentMultishotRecvArmed()) + { + context.RequestPersistentMultishotRecvCancel(); + } + + IList>? 
buffers = Buffers; + if (buffers is null) + { + ErrorCode = SocketError.Success; + return IoUringDirectPrepareResult.PrepareFailed; + } + + if ((uint)BufferIndex > (uint)buffers.Count) + { + ErrorCode = SocketError.Success; + return IoUringDirectPrepareResult.PrepareFailed; + } + + if (!TryPinIoUringBuffers(buffers, BufferIndex, Offset, out int iovCount)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + byte* rawSocketAddress = null; + if (SocketAddress.Length != 0 && !TryPinIoUringBuffer(SocketAddress, out rawSocketAddress)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + Interop.Sys.MessageHeader messageHeader; + messageHeader.SocketAddress = rawSocketAddress; + messageHeader.SocketAddressLen = SocketAddress.Length; + messageHeader.ControlBuffer = null; + messageHeader.ControlBufferLen = 0; + messageHeader.Flags = SocketFlags.None; + + Interop.Sys.IOVector[] iovecs = _ioUringIovecs!; + if (iovCount != 0) + { + fixed (Interop.Sys.IOVector* iovecsPtr = &iovecs[0]) + { + messageHeader.IOVectors = iovecsPtr; + messageHeader.IOVectorCount = iovCount; + // Buffer-list sends can be many small segments (e.g. 4KB chunks). Use + // aggregate payload size for zero-copy eligibility, not per-segment size. 
+ long totalPayloadBytes = 0; + for (int i = 0; i < iovCount; i++) + { + totalPayloadBytes += (long)(nuint)iovecs[i].Count; + if (totalPayloadBytes >= int.MaxValue) + { + totalPayloadBytes = int.MaxValue; + break; + } + } + + IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectSendMessageWithZeroCopyFallback( + context._socket, + &messageHeader, + (int)totalPayloadBytes, + Flags, + out userData, + out SocketError errorCode); + ErrorCode = errorCode; + return prepareResult; + } + } + + messageHeader.IOVectors = null; + messageHeader.IOVectorCount = 0; + IoUringDirectPrepareResult zeroIovPrepareResult = engine.TryPrepareIoUringDirectSendMessageWithZeroCopyFallback( + context._socket, + &messageHeader, + payloadLength: 0, + Flags, + out userData, + out SocketError zeroIovErrorCode); + ErrorCode = zeroIovErrorCode; + return zeroIovPrepareResult; + } + + /// + protected override bool ProcessIoUringCompletionSuccess(SocketAsyncContext context, int result, uint flags, uint auxiliaryData) + { + if (result == 0) + { + ErrorCode = SocketError.Success; + return true; + } + + Debug.Assert(result > 0, $"Expected positive io_uring send completion size, got {result}"); + BytesTransferred += result; + bool complete = AdvanceSendBufferPosition(result); + ErrorCode = SocketError.Success; + return complete; + } + } + + private sealed partial class BufferMemoryReceiveOperation + { + private IntPtr _ioUringMessageHeader; + private MemoryHandle _ioUringPinnedSocketAddress; + private int _ioUringPinnedSocketAddressActive; + + /// + internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true; + + /// + protected override unsafe void ReleaseIoUringPreparationResourcesCore() + { + ReleaseIoUringSocketAddressAndMessageHeader( + ref _ioUringPinnedSocketAddress, + ref _ioUringPinnedSocketAddressActive, + ref _ioUringMessageHeader); + } + + /// Gets a message header buffer and sets the common recvmsg fields. 
+ private unsafe Interop.Sys.MessageHeader* GetOrCreateIoUringReceiveMessageHeader(byte* rawSocketAddress)
+ {
+ // The msghdr lives in native memory (cached in _ioUringMessageHeader) so its address
+ // remains stable while the SQE is in flight; freed in ReleaseIoUringPreparationResourcesCore.
+ Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader;
+ if (messageHeader is null)
+ {
+ messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader));
+ _ioUringMessageHeader = (IntPtr)messageHeader;
+ }
+
+ InitializeReceiveMessageHeader(messageHeader, rawSocketAddress);
+ return messageHeader;
+ }
+
+ /// <summary>Initializes recvmsg header fields shared by direct preparation variants.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe void InitializeReceiveMessageHeader(Interop.Sys.MessageHeader* messageHeader, byte* rawSocketAddress)
+ {
+ messageHeader->SocketAddress = rawSocketAddress;
+ messageHeader->SocketAddressLen = SocketAddress.Length;
+ messageHeader->ControlBuffer = null;
+ messageHeader->ControlBufferLen = 0;
+ messageHeader->Flags = SocketFlags.None;
+ }
+
+ /// <summary>Configures a message header with a single iovec entry.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe void ConfigureSingleIov(
+ Interop.Sys.MessageHeader* messageHeader,
+ byte* rawBuffer,
+ int bufferLength,
+ Interop.Sys.IOVector* iov)
+ {
+ // Keep a single iovec even for zero-length receives so recvmsg preserves
+ // completion-mode readiness probe behavior for zero-byte operations.
+ iov->Base = rawBuffer;
+ iov->Count = (UIntPtr)bufferLength;
+ messageHeader->IOVectors = iov;
+ messageHeader->IOVectorCount = 1;
+ }
+
+ /// <summary>Builds a connected or receive-from recvmsg operation.</summary>
+ private unsafe IoUringDirectPrepareResult IoUringPrepareDirectReceiveMessage(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ // Pin failures return PrepareFailed with ErrorCode already set to Success by the
+ // Try* helpers, signaling a non-terminal fallback to the readiness path.
+ if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ if (!TryPinIoUringSocketAddressForPrepare(
+ SocketAddress,
+ ref _ioUringPinnedSocketAddress,
+ ref _ioUringPinnedSocketAddressActive,
+ out byte* rawSocketAddress))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ Interop.Sys.MessageHeader* messageHeader = GetOrCreateIoUringReceiveMessageHeader(rawSocketAddress);
+ // NOTE(review): receiveIov is stack-allocated while the msghdr it is linked into is
+ // native memory that outlives this frame — assumes the engine copies the iovec into
+ // the SQE during prepare; confirm against TryPrepareIoUringDirectReceiveMessage.
+ Interop.Sys.IOVector receiveIov;
+ ConfigureSingleIov(messageHeader, rawBuffer, Buffer.Length, &receiveIov);
+
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
+ context._socket,
+ messageHeader,
+ Flags,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
+ return prepareResult;
+ }
+
+ /// <summary>
+ /// Returns whether this operation shape is eligible for multishot recv submission.
+ /// Eligible: connected TCP receive (no socket address, no recvmsg flags) with non-empty buffer.
+ /// Ineligible: zero-byte probes, recvmsg-based receive paths (SetReceivedFlags/socket address).
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool IsEligibleForIoUringMultishotRecv() + { + if (SetReceivedFlags || SocketAddress.Length != 0) + { + return false; + } + + return Buffer.Length != 0; + } + + /// + protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect( + SocketAsyncContext context, + SocketAsyncEngine engine, + out ulong userData) + { + userData = 0; + if (SetReceivedFlags || SocketAddress.Length != 0) + { + if (context.IsPersistentMultishotRecvArmed()) + { + context.RequestPersistentMultishotRecvCancel(); + } + + SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.OneShot); + IoUringDirectPrepareResult receiveMessagePrepareResult = + IoUringPrepareDirectReceiveMessage(context, engine, out userData); + if (receiveMessagePrepareResult != IoUringDirectPrepareResult.Prepared || ErrorCode != SocketError.Success) + { + SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.None); + } + + return receiveMessagePrepareResult; + } + + bool allowMultishotRecv = IsEligibleForIoUringMultishotRecv() && engine.SupportsMultishotRecv; + if (!allowMultishotRecv && context.IsPersistentMultishotRecvArmed()) + { + context.RequestPersistentMultishotRecvCancel(); + } + + SetIoUringReceiveSubmissionMode( + allowMultishotRecv ? IoUringReceiveSubmissionMode.Multishot : IoUringReceiveSubmissionMode.OneShot); + + // Persistent multishot receive: if one is already armed, attach this operation to + // that existing user_data instead of submitting a new recv SQE. 
+ if (allowMultishotRecv && context.IsPersistentMultishotRecvArmed()) + { + ulong armedUserData = context.PersistentMultishotRecvUserData; + if (armedUserData != 0 && + engine.TryReplaceIoUringTrackedOperation(armedUserData, this)) + { + SocketsTelemetry.Log.IoUringPersistentMultishotRecvReuse(); + userData = armedUserData; + ErrorCode = SocketError.Success; + return IoUringDirectPrepareResult.Prepared; + } + + // Stale armed-state; clear and submit a fresh SQE below. + context.ClearPersistentMultishotRecvArmed(); + } + + if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer)) + { + ErrorCode = SocketError.Success; + SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.None); + return IoUringDirectPrepareResult.PrepareFailed; + } + + IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectRecv( + context._socket, + rawBuffer, + Buffer.Length, + Flags, + allowMultishotRecv, + out userData, + out SocketError errorCode); + ErrorCode = errorCode; + if (allowMultishotRecv && + prepareResult == IoUringDirectPrepareResult.Prepared && + errorCode == SocketError.Success) + { + context.SetPersistentMultishotRecvArmed(userData); + } + + if (prepareResult != IoUringDirectPrepareResult.Prepared || errorCode != SocketError.Success) + { + SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.None); + } + + return prepareResult; + } + + /// + protected override bool ProcessIoUringCompletionSuccess(SocketAsyncContext context, int result, uint flags, uint auxiliaryData) + { + BytesTransferred = result; + ReceivedFlags = SetReceivedFlags ? 
(SocketFlags)(int)auxiliaryData : SocketFlags.None; + + if (SocketAddress.Length != 0) + { + int socketAddressLen = IoUringCompletionSocketAddressLen; + if (socketAddressLen < 0) + { + socketAddressLen = 0; + } + + if ((uint)socketAddressLen > (uint)SocketAddress.Length) + { + socketAddressLen = SocketAddress.Length; + } + + SocketAddress = SocketAddress.Slice(0, socketAddressLen); + } + ErrorCode = SocketError.Success; + return true; + } + + /// + internal override unsafe bool TryProcessIoUringProvidedBufferCompletion( + byte* providedBuffer, + int providedBufferLength, + int bytesTransferred, + ref uint auxiliaryData) + { + _ = auxiliaryData; + + if (bytesTransferred <= 0) + { + return true; + } + + if (SetReceivedFlags || SocketAddress.Length != 0) + { + return false; + } + + if ((uint)bytesTransferred > (uint)providedBufferLength || + (uint)bytesTransferred > (uint)Buffer.Length) + { + return false; + } + + new ReadOnlySpan(providedBuffer, bytesTransferred).CopyTo(Buffer.Span); + return true; + } + } + + private sealed partial class BufferListReceiveOperation + { + private GCHandle[]? _ioUringPinnedBufferHandles; + private Interop.Sys.IOVector[]? _ioUringIovecs; + private int _ioUringPinnedHandleCount; + private IntPtr _ioUringMessageHeader; + private int _ioUringPreparedIovCount; + private int _ioUringPreparedBufferCount = -1; + + /// + internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true; + + /// + protected override unsafe void ReleaseIoUringPreparationResourcesCore() + { + ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount); + _ioUringPreparedIovCount = 0; + _ioUringPreparedBufferCount = -1; + + IntPtr messageHeader = Interlocked.Exchange(ref _ioUringMessageHeader, IntPtr.Zero); + if (messageHeader != IntPtr.Zero) + { + NativeMemory.Free((void*)messageHeader); + } + } + + /// Pins all buffer segments and builds the iovec array. 
+ private bool TryPinIoUringBuffers(IList> buffers, out int iovCount) + { + if (_ioUringPinnedBufferHandles is not null && + _ioUringIovecs is not null && + _ioUringPreparedIovCount != 0 && + _ioUringPreparedIovCount <= _ioUringIovecs.Length && + _ioUringPreparedBufferCount == buffers.Count) + { + iovCount = _ioUringPreparedIovCount; + return true; + } + + ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount); + + if (!TryPinBufferListForIoUring( + buffers, + startIndex: 0, + startOffset: 0, + out GCHandle[] pinnedHandles, + out Interop.Sys.IOVector[] iovecs, + out iovCount, + out int pinnedHandleCount, + out SocketError errorCode)) + { + ErrorCode = errorCode; + return false; + } + + _ioUringPinnedBufferHandles = pinnedHandles; + _ioUringIovecs = iovecs; + _ioUringPinnedHandleCount = pinnedHandleCount; + _ioUringPreparedIovCount = iovCount; + _ioUringPreparedBufferCount = buffers.Count; + return true; + } + + /// + protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect( + SocketAsyncContext context, + SocketAsyncEngine engine, + out ulong userData) + { + userData = 0; + IList>? 
buffers = Buffers; + if (buffers is null) + { + ErrorCode = SocketError.Success; + return IoUringDirectPrepareResult.PrepareFailed; + } + + if (!TryPinIoUringBuffers(buffers, out int iovCount)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + byte* rawSocketAddress = null; + if (SocketAddress.Length != 0 && !TryPinIoUringBuffer(SocketAddress, out rawSocketAddress)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader; + if (messageHeader is null) + { + messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader)); + _ioUringMessageHeader = (IntPtr)messageHeader; + } + + messageHeader->SocketAddress = rawSocketAddress; + messageHeader->SocketAddressLen = SocketAddress.Length; + messageHeader->ControlBuffer = null; + messageHeader->ControlBufferLen = 0; + messageHeader->Flags = SocketFlags.None; + + Interop.Sys.IOVector[] iovecs = _ioUringIovecs!; + if (iovCount != 0) + { + fixed (Interop.Sys.IOVector* iovecsPtr = &iovecs[0]) + { + messageHeader->IOVectors = iovecsPtr; + messageHeader->IOVectorCount = iovCount; + IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectReceiveMessage( + context._socket, + messageHeader, + Flags, + out userData, + out SocketError errorCode); + ErrorCode = errorCode; + return prepareResult; + } + } + + messageHeader->IOVectors = null; + messageHeader->IOVectorCount = 0; + IoUringDirectPrepareResult zeroIovPrepareResult = engine.TryPrepareIoUringDirectReceiveMessage( + context._socket, + messageHeader, + Flags, + out userData, + out SocketError zeroIovErrorCode); + ErrorCode = zeroIovErrorCode; + return zeroIovPrepareResult; + } + + /// + protected override unsafe bool ProcessIoUringCompletionSuccess(SocketAsyncContext context, int result, uint flags, uint auxiliaryData) + { + BytesTransferred = result; + ReceivedFlags = (SocketFlags)(int)auxiliaryData; + 
ErrorCode = SocketError.Success; + + if (_ioUringMessageHeader != IntPtr.Zero && SocketAddress.Length != 0) + { + int socketAddressLen = IoUringCompletionSocketAddressLen; + if (socketAddressLen < 0) + { + socketAddressLen = 0; + } + + if ((uint)socketAddressLen > (uint)SocketAddress.Length) + { + socketAddressLen = SocketAddress.Length; + } + + SocketAddress = SocketAddress.Slice(0, socketAddressLen); + } + + return true; + } + } + + private sealed partial class ReceiveMessageFromOperation + { + private GCHandle[]? _ioUringPinnedBufferHandles; + private Interop.Sys.IOVector[]? _ioUringIovecs; + private int _ioUringPinnedHandleCount; + private int _ioUringPreparedIovCount; + private int _ioUringPreparedBufferListCount = -1; + private IntPtr _ioUringMessageHeader; + private IntPtr _ioUringControlBuffer; + private int _ioUringControlBufferLength; + private MemoryHandle _ioUringPinnedSocketAddress; + private int _ioUringPinnedSocketAddressActive; + + /// + internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true; + + /// + protected override unsafe void ReleaseIoUringPreparationResourcesCore() + { + ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount); + _ioUringPreparedIovCount = 0; + _ioUringPreparedBufferListCount = -1; + + IntPtr controlBuffer = Interlocked.Exchange(ref _ioUringControlBuffer, IntPtr.Zero); + if (controlBuffer != IntPtr.Zero) + { + NativeMemory.Free((void*)controlBuffer); + } + _ioUringControlBufferLength = 0; + + ReleaseIoUringSocketAddressAndMessageHeader( + ref _ioUringPinnedSocketAddress, + ref _ioUringPinnedSocketAddressActive, + ref _ioUringMessageHeader); + } + + /// Pins buffer segments and builds the iovec array for recvmsg. 
+ private bool TryPinIoUringBuffers(IList> buffers, out int iovCount) + { + if (_ioUringPinnedBufferHandles is not null && + _ioUringIovecs is not null && + _ioUringPreparedIovCount <= _ioUringIovecs.Length && + _ioUringPreparedBufferListCount == buffers.Count) + { + iovCount = _ioUringPreparedIovCount; + return true; + } + + ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount); + + if (!TryPinBufferListForIoUring( + buffers, + startIndex: 0, + startOffset: 0, + out GCHandle[] pinnedHandles, + out Interop.Sys.IOVector[] iovecs, + out iovCount, + out int pinnedHandleCount, + out SocketError errorCode)) + { + ErrorCode = errorCode; + return false; + } + + _ioUringPinnedBufferHandles = pinnedHandles; + _ioUringIovecs = iovecs; + _ioUringPinnedHandleCount = pinnedHandleCount; + _ioUringPreparedIovCount = iovCount; + _ioUringPreparedBufferListCount = buffers.Count; + return true; + } + + /// + protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect( + SocketAsyncContext context, + SocketAsyncEngine engine, + out ulong userData) + { + userData = 0; + if (context.IsPersistentMultishotRecvArmed()) + { + context.RequestPersistentMultishotRecvCancel(); + } + + IList>? 
buffers = Buffers; + byte* rawBuffer = null; + int iovCount; + if (buffers is not null) + { + ReleaseIoUringPinnedBufferForShapeTransition(); + if (!TryPinIoUringBuffers(buffers, out iovCount)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + } + else + { + if (!TryPinIoUringBuffer(Buffer, out rawBuffer)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + if (_ioUringPinnedBufferHandles is not null || _ioUringIovecs is not null) + { + ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount); + _ioUringPreparedIovCount = 0; + _ioUringPreparedBufferListCount = -1; + } + + iovCount = 1; + } + + if (!TryPinIoUringSocketAddressForPrepare( + SocketAddress, + ref _ioUringPinnedSocketAddress, + ref _ioUringPinnedSocketAddressActive, + out byte* rawSocketAddress)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader; + if (messageHeader is null) + { + messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader)); + _ioUringMessageHeader = (IntPtr)messageHeader; + } + + messageHeader->SocketAddress = rawSocketAddress; + messageHeader->SocketAddressLen = SocketAddress.Length; + messageHeader->Flags = SocketFlags.None; + + int controlBufferLen = Interop.Sys.GetControlMessageBufferSize(Convert.ToInt32(IsIPv4), Convert.ToInt32(IsIPv6)); + if (controlBufferLen < 0) + { + ErrorCode = SocketError.Success; + return IoUringDirectPrepareResult.PrepareFailed; + } + + if (controlBufferLen != 0) + { + if (_ioUringControlBuffer == IntPtr.Zero || _ioUringControlBufferLength != controlBufferLen) + { + IntPtr controlBuffer = Interlocked.Exchange(ref _ioUringControlBuffer, IntPtr.Zero); + if (controlBuffer != IntPtr.Zero) + { + NativeMemory.Free((void*)controlBuffer); + } + + void* rawControlBuffer = NativeMemory.Alloc((nuint)controlBufferLen); + 
_ioUringControlBuffer = (IntPtr)rawControlBuffer; + _ioUringControlBufferLength = controlBufferLen; + } + + messageHeader->ControlBuffer = (byte*)_ioUringControlBuffer; + messageHeader->ControlBufferLen = controlBufferLen; + } + else + { + IntPtr controlBuffer = Interlocked.Exchange(ref _ioUringControlBuffer, IntPtr.Zero); + if (controlBuffer != IntPtr.Zero) + { + NativeMemory.Free((void*)controlBuffer); + } + + _ioUringControlBufferLength = 0; + messageHeader->ControlBuffer = null; + messageHeader->ControlBufferLen = 0; + } + + if (buffers is not null) + { + Interop.Sys.IOVector[] iovecs = _ioUringIovecs!; + if (iovCount != 0) + { + fixed (Interop.Sys.IOVector* iovecsPtr = &iovecs[0]) + { + messageHeader->IOVectors = iovecsPtr; + messageHeader->IOVectorCount = iovCount; + IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectReceiveMessage( + context._socket, + messageHeader, + Flags, + out userData, + out SocketError errorCode); + ErrorCode = errorCode; + return prepareResult; + } + } + + messageHeader->IOVectors = null; + messageHeader->IOVectorCount = 0; + IoUringDirectPrepareResult zeroIovPrepareResult = engine.TryPrepareIoUringDirectReceiveMessage( + context._socket, + messageHeader, + Flags, + out userData, + out SocketError zeroIovErrorCode); + ErrorCode = zeroIovErrorCode; + return zeroIovPrepareResult; + } + + Interop.Sys.IOVector iov; + iov.Base = rawBuffer; + iov.Count = (UIntPtr)Buffer.Length; + messageHeader->IOVectors = &iov; + messageHeader->IOVectorCount = 1; + IoUringDirectPrepareResult singleBufferPrepareResult = engine.TryPrepareIoUringDirectReceiveMessage( + context._socket, + messageHeader, + Flags, + out userData, + out SocketError singleBufferErrorCode); + ErrorCode = singleBufferErrorCode; + return singleBufferPrepareResult; + } + + /// + protected override unsafe bool ProcessIoUringCompletionSuccess(SocketAsyncContext context, int result, uint flags, uint auxiliaryData) + { + BytesTransferred = result; + ReceivedFlags = 
(SocketFlags)(int)auxiliaryData; + ErrorCode = SocketError.Success; + IPPacketInformation = default; + + if (_ioUringMessageHeader != IntPtr.Zero) + { + Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader; + int socketAddressCapacity = SocketAddress.Length; + int socketAddressLen = IoUringCompletionSocketAddressLen; + if (socketAddressLen < 0) + { + socketAddressLen = 0; + } + + if ((uint)socketAddressLen > (uint)socketAddressCapacity) + { + socketAddressLen = socketAddressCapacity; + } + + if (socketAddressLen == 0 && socketAddressCapacity != 0) + { + socketAddressLen = socketAddressCapacity; + SocketAddress.Span.Clear(); + } + + int controlBufferCapacity = messageHeader->ControlBufferLen; + int controlBufferLen = IoUringCompletionControlBufferLen; + if (controlBufferLen < 0) + { + controlBufferLen = 0; + } + + if ((uint)controlBufferLen > (uint)controlBufferCapacity) + { + controlBufferLen = controlBufferCapacity; + } + + messageHeader->SocketAddressLen = socketAddressLen; + messageHeader->ControlBufferLen = controlBufferLen; + messageHeader->Flags = ReceivedFlags; + + SocketAddress = SocketAddress.Slice(0, socketAddressLen); + + IPPacketInformation = SocketPal.GetIoUringIPPacketInformation(messageHeader, IsIPv4, IsIPv6); + } + + return true; + } + + /// + protected override bool ProcessIoUringCompletionError(SocketAsyncContext context, int result, uint flags, uint auxiliaryData) + { + if (!ProcessIoUringErrorResult(result, out ErrorCode)) + { + return false; + } + + IPPacketInformation = default; + return true; + } + } + + internal sealed partial class AcceptOperation + { + /// + internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() => + Interop.Sys.SocketEvents.Read; + + /// + protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect( + SocketAsyncContext context, + SocketAsyncEngine engine, + out ulong userData) + { + userData = 0; + AcceptSocketAddressLength = SocketAddress.Length; 
+ if (!TryPinIoUringBuffer(SocketAddress, out byte* rawSocketAddress)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + if (engine.SupportsMultishotAccept && + Interlocked.CompareExchange(ref context._multishotAcceptArmed, 2, 0) == 0) + { + context.EnsureMultishotAcceptQueueInitialized(); + IoUringDirectPrepareResult multishotPrepareResult = engine.TryPrepareIoUringDirectMultishotAccept( + context._socket, + rawSocketAddress, + SocketAddress.Length, + out userData, + out SocketError multishotErrorCode); + if (multishotPrepareResult == IoUringDirectPrepareResult.Prepared) + { + Volatile.Write(ref context._multishotAcceptUserData, userData); + Volatile.Write(ref context._multishotAcceptArmed, 1); + ErrorCode = multishotErrorCode; + return multishotPrepareResult; + } + + context.DisarmMultishotAccept(); + } + + IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectAccept( + context._socket, + rawSocketAddress, + SocketAddress.Length, + out userData, + out SocketError errorCode); + ErrorCode = errorCode; + return prepareResult; + } + + /// + protected override bool ProcessIoUringCompletionSuccess(SocketAsyncContext context, int result, uint flags, uint auxiliaryData) + { + AcceptedFileDescriptor = (IntPtr)result; + ErrorCode = SocketError.Success; + // Keep parity with readiness path: always honor reported address length, including 0. + AcceptSocketAddressLength = auxiliaryData > (uint)SocketAddress.Length ? 
SocketAddress.Length : (int)auxiliaryData; + SocketAddress = SocketAddress.Slice(0, AcceptSocketAddressLength); + return true; + } + + /// + protected override bool ProcessIoUringCompletionError(SocketAsyncContext context, int result, uint flags, uint auxiliaryData) + { + AcceptedFileDescriptor = (IntPtr)(-1); + return base.ProcessIoUringCompletionError(context, result, flags, auxiliaryData); + } + } + + private sealed partial class ConnectOperation + { + /// + internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() => + Interop.Sys.SocketEvents.Write; + + /// + protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect( + SocketAsyncContext context, + SocketAsyncEngine engine, + out ulong userData) + { + userData = 0; + if (!TryPinIoUringBuffer(SocketAddress, out byte* rawSocketAddress)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectConnect( + context._socket, + rawSocketAddress, + SocketAddress.Length, + out userData, + out SocketError errorCode); + ErrorCode = errorCode; + return prepareResult; + } + + /// + protected override bool ProcessIoUringCompletionError(SocketAsyncContext context, int result, uint flags, uint auxiliaryData) + { + Interop.Error error = GetIoUringPalError(result); + if (error == Interop.Error.EINPROGRESS) + { + ErrorCode = SocketError.Success; + return false; + } + + if (!base.ProcessIoUringCompletionError(context, result, flags, auxiliaryData)) + { + return false; + } + + context._socket.RegisterConnectResult(ErrorCode); + return true; + } + + /// + protected override bool ProcessIoUringCompletionSuccess(SocketAsyncContext context, int result, uint flags, uint auxiliaryData) + { + ErrorCode = SocketError.Success; + context._socket.RegisterConnectResult(ErrorCode); + + if (Buffer.Length > 0) + { + Action, SocketFlags, SocketError>? 
callback = Callback; + Debug.Assert(callback is not null); + SocketError error = context.SendToAsync(Buffer, 0, Buffer.Length, SocketFlags.None, default, ref BytesTransferred, callback!, default); + if (error == SocketError.IOPending) + { + // Callback ownership moved to the async send operation. + Callback = null; + Buffer = default; + } + else + { + if (error != SocketError.Success) + { + ErrorCode = error; + context._socket.RegisterConnectResult(ErrorCode); + } + + // Follow-up send completed synchronously (success/error), so invoke + // Connect callback from this operation path. + Buffer = default; + } + } + + return true; + } + } + } +} diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs index 4e2e117984084c..94d6838e1a890d 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs @@ -43,10 +43,10 @@ internal sealed partial class SocketAsyncContext private BufferListReceiveOperation? _cachedBufferListReceiveOperation; private BufferMemorySendOperation? _cachedBufferMemorySendOperation; private BufferListSendOperation? 
_cachedBufferListSendOperation; - private void ReturnOperation(AcceptOperation operation) { operation.Reset(); + operation.AcceptSocketAddressLength = 0; operation.Callback = null; operation.SocketAddress = default; Volatile.Write(ref _cachedAcceptOperation, operation); // benign race condition @@ -83,6 +83,7 @@ private void ReturnOperation(BufferListSendOperation operation) { operation.Reset(); operation.Buffers = null; + operation.SetBufferPosition(bufferIndex: 0, offset: 0); operation.Callback = null; operation.SocketAddress = default; Volatile.Write(ref _cachedBufferListSendOperation, operation); // benign race condition @@ -108,7 +109,14 @@ private BufferListSendOperation RentBufferListSendOperation() => Interlocked.Exchange(ref _cachedBufferListSendOperation, null) ?? new BufferListSendOperation(this); - private abstract class AsyncOperation : IThreadPoolWorkItem + // Partial method hooks for io_uring completion-mode staging (Linux-only). + // No-op on non-Linux; implemented in SocketAsyncContext.IoUring.Linux.cs. 
+ static partial void LinuxTryStageIoUringOperation(AsyncOperation operation); + partial void LinuxTryDequeuePreAcceptedConnection(AcceptOperation operation, ref bool dequeued); + partial void LinuxTryConsumeBufferedPersistentMultishotRecvData(Memory destination, ref bool consumed, ref int bytesTransferred); + partial void LinuxOnStopAndAbort(); + + internal abstract partial class AsyncOperation : IThreadPoolWorkItem { private enum State { @@ -141,6 +149,7 @@ public AsyncOperation(SocketAsyncContext context) public void Reset() { + ResetIoUringState(); _state = State.Waiting; Event = null; Next = this; @@ -202,6 +211,16 @@ public OperationResult TryComplete(SocketAsyncContext context) } public bool TryCancel() + { + return TryCancelCore(requestIoUringCancellation: true); + } + + internal bool TryCancelForTeardown() + { + return TryCancelCore(requestIoUringCancellation: false); + } + + private bool TryCancelCore(bool requestIoUringCancellation) { Trace("Enter"); @@ -232,6 +251,9 @@ public bool TryCancel() return false; } + // Best effort: if completion-mode io_uring work was already submitted, request kernel-side cancellation now. + // Partial method: no-op on non-Linux; implemented in SocketAsyncContext.IoUring.Linux.cs. + LinuxRequestIoUringCancellationIfNeeded(requestIoUringCancellation); ProcessCancellation(); // Note, we leave the operation in the OperationQueue. @@ -245,6 +267,7 @@ public void ProcessCancellation() Debug.Assert(_state == State.Canceled); + LinuxUntrackIoUringOperation(); ErrorCode = SocketError.OperationAborted; ManualResetEventSlim? e = Event; @@ -305,17 +328,29 @@ void IThreadPoolWorkItem.Execute() // We could also add an abstract method that the base interface implementation // invokes, but that adds an extra virtual dispatch. 
Debug.Fail("Expected derived type to implement IThreadPoolWorkItem"); - throw new InvalidOperationException(); + ThrowExpectedDerivedTypeToImplementThreadPoolWorkItem(); } + [DoesNotReturn] + [StackTraceHidden] + private static void ThrowExpectedDerivedTypeToImplementThreadPoolWorkItem() => + throw new InvalidOperationException(); + // Called when op is not in the queue yet, so can't be otherwise executing public void DoAbort() { + LinuxUntrackIoUringOperation(); ErrorCode = SocketError.OperationAborted; } protected abstract bool DoTryComplete(SocketAsyncContext context); + partial void ResetIoUringState(); + partial void LinuxRequestIoUringCancellationIfNeeded(bool requestIoUringCancellation); + partial void LinuxUntrackIoUringOperation(); + + internal virtual bool ShouldDispatchCallback => true; + public abstract void InvokeCallback(bool allowPooling); [Conditional("SOCKETASYNCCONTEXT_TRACE")] @@ -333,21 +368,21 @@ public void TraceWithContext(SocketAsyncContext context, string message, [Caller // These two abstract classes differentiate the operations that go in the // read queue vs the ones that go in the write queue. 
- private abstract class ReadOperation : AsyncOperation, IThreadPoolWorkItem + internal abstract partial class ReadOperation : AsyncOperation, IThreadPoolWorkItem { public ReadOperation(SocketAsyncContext context) : base(context) { } void IThreadPoolWorkItem.Execute() => AssociatedContext.ProcessAsyncReadOperation(this); } - private abstract class WriteOperation : AsyncOperation, IThreadPoolWorkItem + private abstract partial class WriteOperation : AsyncOperation, IThreadPoolWorkItem { public WriteOperation(SocketAsyncContext context) : base(context) { } void IThreadPoolWorkItem.Execute() => AssociatedContext.ProcessAsyncWriteOperation(this); } - private abstract class SendOperation : WriteOperation + private abstract partial class SendOperation : WriteOperation { public SocketFlags Flags; public int BytesTransferred; @@ -360,9 +395,10 @@ public SendOperation(SocketAsyncContext context) : base(context) { } public override void InvokeCallback(bool allowPooling) => Callback!(BytesTransferred, SocketAddress, SocketFlags.None, ErrorCode); + } - private class BufferMemorySendOperation : SendOperation + private partial class BufferMemorySendOperation : SendOperation { public Memory Buffer; @@ -390,7 +426,7 @@ public override void InvokeCallback(bool allowPooling) } } - private sealed class BufferListSendOperation : SendOperation + private sealed partial class BufferListSendOperation : SendOperation { public IList>? 
Buffers; public int BufferIndex; @@ -402,6 +438,12 @@ protected override bool DoTryComplete(SocketAsyncContext context) return SocketPal.TryCompleteSendTo(context._socket, default(ReadOnlySpan), Buffers, ref BufferIndex, ref Offset, ref Count, Flags, SocketAddress.Span, ref BytesTransferred, out ErrorCode); } + internal void SetBufferPosition(int bufferIndex, int offset) + { + BufferIndex = bufferIndex; + Offset = offset; + } + public override void InvokeCallback(bool allowPooling) { var cb = Callback!; @@ -446,7 +488,7 @@ public override void InvokeCallback(bool allowPooling) => Callback!(BytesTransferred, SocketAddress, ReceivedFlags, ErrorCode); } - private sealed class BufferMemoryReceiveOperation : ReceiveOperation + private sealed partial class BufferMemoryReceiveOperation : ReceiveOperation { public Memory Buffer; public bool SetReceivedFlags; @@ -455,6 +497,19 @@ public BufferMemoryReceiveOperation(SocketAsyncContext context) : base(context) protected override bool DoTryComplete(SocketAsyncContext context) { + bool consumedBufferedData = false; + int bufferedBytes = 0; + context.LinuxTryConsumeBufferedPersistentMultishotRecvData(Buffer, ref consumedBufferedData, ref bufferedBytes); + if (!SetReceivedFlags && + SocketAddress.Length == 0 && + consumedBufferedData) + { + BytesTransferred = bufferedBytes; + ReceivedFlags = SocketFlags.None; + ErrorCode = SocketError.Success; + return true; + } + // Zero byte read is performed to know when data is available. // We don't have to call receive, our caller is interested in the event. if (Buffer.Length == 0 && Flags == SocketFlags.None && SocketAddress.Length == 0) @@ -502,7 +557,7 @@ public override void InvokeCallback(bool allowPooling) } } - private sealed class BufferListReceiveOperation : ReceiveOperation + private sealed partial class BufferListReceiveOperation : ReceiveOperation { public IList>? 
Buffers; @@ -553,7 +608,7 @@ protected override bool DoTryComplete(SocketAsyncContext context) } } - private sealed class ReceiveMessageFromOperation : ReadOperation + private sealed partial class ReceiveMessageFromOperation : ReadOperation { public Memory Buffer; public SocketFlags Flags; @@ -613,9 +668,10 @@ public override void InvokeCallback(bool allowPooling) => Callback!(BytesTransferred, SocketAddress, ReceivedFlags, IPPacketInformation, ErrorCode); } - private sealed class AcceptOperation : ReadOperation + internal sealed partial class AcceptOperation : ReadOperation { public IntPtr AcceptedFileDescriptor; + public int AcceptSocketAddressLength; public AcceptOperation(SocketAsyncContext context) : base(context) { } @@ -623,11 +679,19 @@ public AcceptOperation(SocketAsyncContext context) : base(context) { } protected override bool DoTryComplete(SocketAsyncContext context) { + bool dequeuedPreAcceptedConnection = false; + context.LinuxTryDequeuePreAcceptedConnection(this, ref dequeuedPreAcceptedConnection); + if (dequeuedPreAcceptedConnection) + { + return true; + } + bool completed = SocketPal.TryCompleteAccept(context._socket, SocketAddress, out int socketAddressLen, out AcceptedFileDescriptor, out ErrorCode); + AcceptSocketAddressLength = socketAddressLen; Debug.Assert(ErrorCode == SocketError.Success || AcceptedFileDescriptor == (IntPtr)(-1), $"Unexpected values: ErrorCode={ErrorCode}, AcceptedFileDescriptor={AcceptedFileDescriptor}"); if (ErrorCode == SocketError.Success) { - SocketAddress = SocketAddress.Slice(0, socketAddressLen); + SocketAddress = SocketAddress.Slice(0, AcceptSocketAddressLength); } return completed; } @@ -648,7 +712,7 @@ public override void InvokeCallback(bool allowPooling) } } - private sealed class ConnectOperation : BufferMemorySendOperation + private sealed partial class ConnectOperation : BufferMemorySendOperation { public ConnectOperation(SocketAsyncContext context) : base(context) { } @@ -659,28 +723,47 @@ protected override 
bool DoTryComplete(SocketAsyncContext context) if (result && ErrorCode == SocketError.Success && Buffer.Length > 0) { - SocketError error = context.SendToAsync(Buffer, 0, Buffer.Length, SocketFlags.None, Memory.Empty, ref BytesTransferred, Callback!, default); - if (error != SocketError.Success && error != SocketError.IOPending) + Action, SocketFlags, SocketError>? callback = Callback; + Debug.Assert(callback != null); + SocketError error = context.SendToAsync(Buffer, 0, Buffer.Length, SocketFlags.None, Memory.Empty, ref BytesTransferred, callback!, default); + if (error == SocketError.IOPending) { - context._socket.RegisterConnectResult(ErrorCode); + // Callback ownership moved to the async send operation. + Callback = null; + Buffer = default; + } + else + { + if (error != SocketError.Success) + { + ErrorCode = error; + context._socket.RegisterConnectResult(ErrorCode); + } + + // Follow-up send completed synchronously (success/error), so invoke + // Connect callback from this operation path. + Buffer = default; } } return result; } + internal override bool ShouldDispatchCallback => Buffer.Length == 0 && Callback is not null; + public override void InvokeCallback(bool allowPooling) { - var cb = Callback!; + Action, SocketFlags, SocketError>? cb = Callback; int bt = BytesTransferred; Memory sa = SocketAddress; SocketError ec = ErrorCode; Memory buffer = Buffer; - if (buffer.Length == 0) + if (cb != null && (buffer.Length == 0 || ec == SocketError.OperationAborted)) { // Invoke callback only when we are completely done. // In case data were provided for Connect we may or may not send them all. - // If we did not we will need follow-up with Send operation + // If we did not we will need follow-up with Send operation. + // On cancellation, always invoke — the send was never started. 
cb(bt, sa, SocketFlags.None, ec); } } @@ -890,6 +973,9 @@ public bool StartAsyncOperation(SocketAsyncContext context, TOperation operation operation.CancellationRegistration = cancellationToken.UnsafeRegister(s => ((TOperation)s!).TryCancel(), operation); } + // Completion-mode staging: partial method is no-op on non-Linux. + LinuxTryStageIoUringOperation(operation); + return true; case QueueState.Stopped: @@ -898,7 +984,7 @@ public bool StartAsyncOperation(SocketAsyncContext context, TOperation operation break; default: - Environment.FailFast("unexpected queue state"); + FailFastUnexpectedQueueState(_state); break; } } @@ -939,7 +1025,7 @@ static void HandleFailedRegistration(SocketAsyncContext context, TOperation oper } else { - throw new InternalException(error); + ThrowInternalException(error); } } } @@ -986,7 +1072,7 @@ static void HandleFailedRegistration(SocketAsyncContext context, TOperation oper return null; default: - Environment.FailFast("unexpected queue state"); + FailFastUnexpectedQueueState(_state); return null; } } @@ -1022,7 +1108,10 @@ internal void ProcessAsyncOperation(TOperation op) // request for a previous operation could affect a subsequent one) // and here we know the operation has completed. op.CancellationRegistration.Dispose(); - op.InvokeCallback(allowPooling: true); + if (op.ShouldDispatchCallback) + { + op.InvokeCallback(allowPooling: true); + } } } @@ -1129,6 +1218,59 @@ public OperationResult ProcessQueuedOperation(TOperation op) return result; } + public bool TryRemoveCompletedOperation(SocketAsyncContext context, TOperation operation) + { + using (Lock()) + { + if (_tail == null || _state == QueueState.Stopped) + { + return false; + } + + AsyncOperation? previous = _tail; + AsyncOperation? 
current = _tail.Next; + while (!ReferenceEquals(current, operation)) + { + if (ReferenceEquals(current, _tail)) + { + return false; + } + + previous = current; + current = current!.Next; + } + + Debug.Assert(previous != null && current != null); + bool removedHead = ReferenceEquals(current, _tail.Next); + bool removedTail = ReferenceEquals(current, _tail); + + if (removedHead && removedTail) + { + _tail = null; + _isNextOperationSynchronous = false; + _state = QueueState.Ready; + _sequenceNumber++; + Trace(context, $"Removed completed {IdOf(operation)} (queue empty)"); + return true; + } + + previous!.Next = current!.Next; + if (removedTail) + { + _tail = (TOperation)previous; + } + + if (removedHead) + { + Debug.Assert(_tail != null); + _isNextOperationSynchronous = _tail.Next.Event != null; + } + + Trace(context, $"Removed completed {IdOf(operation)}"); + return true; + } + } + public void CancelAndContinueProcessing(TOperation op) { // Note, only sync operations use this method. @@ -1244,6 +1386,17 @@ public bool StopAndAbort(SocketAsyncContext context) return aborted; } + [DoesNotReturn] + [StackTraceHidden] + private static void ThrowInternalException(Interop.Error error) => + throw new InternalException(error); + + [DoesNotReturn] + [StackTraceHidden] + [MethodImpl(MethodImplOptions.NoInlining)] + private static void FailFastUnexpectedQueueState(QueueState state) => + Environment.FailFast($"unexpected queue state: {state}"); + [Conditional("SOCKETASYNCCONTEXT_TRACE")] public void Trace(SocketAsyncContext context, string message, [CallerMemberName] string? memberName = null) { @@ -1328,6 +1481,7 @@ public bool StopAndAbort() // Drain queues aborted |= _sendQueue.StopAndAbort(this); aborted |= _receiveQueue.StopAndAbort(this); + LinuxOnStopAndAbort(); // We don't need to synchronize with Register. // This method is called when the handle gets released. 
@@ -1360,7 +1514,7 @@ public void SetHandleNonBlocking() { if (Interop.Sys.Fcntl.SetIsNonBlocking(_socket, 1) != 0) { - throw new SocketException((int)SocketPal.GetSocketErrorForErrorCode(Interop.Sys.GetLastError())); + ThrowSocketExceptionFromLastError(); } _isHandleNonBlocking = true; @@ -1369,11 +1523,36 @@ public void SetHandleNonBlocking() public bool IsHandleNonBlocking => _isHandleNonBlocking; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void ThrowIfThreadsAreNotSupported() + { + if (!Socket.OSSupportsThreads) + { + ThrowPlatformNotSupportedForMissingThreadSupport(); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void ValidateSyncOperationPreconditions(int timeout) + { + ThrowIfThreadsAreNotSupported(); + Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + } + + [DoesNotReturn] + [StackTraceHidden] + private static void ThrowPlatformNotSupportedForMissingThreadSupport() => + throw new PlatformNotSupportedException(); + + [DoesNotReturn] + [StackTraceHidden] + private static void ThrowSocketExceptionFromLastError() => + throw new SocketException((int)SocketPal.GetSocketErrorForErrorCode(Interop.Sys.GetLastError())); + private void PerformSyncOperation(ref OperationQueue queue, TOperation operation, int timeout, int observedSequenceNumber) where TOperation : AsyncOperation { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); - Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + ValidateSyncOperationPreconditions(timeout); using (var e = new ManualResetEventSlim(false, 0)) { @@ -1509,7 +1688,7 @@ public SocketError AcceptAsync(Memory socketAddress, out int socketAddress public SocketError Connect(Memory socketAddress) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); + ThrowIfThreadsAreNotSupported(); Debug.Assert(socketAddress.Length > 0, $"Unexpected socketAddressLen: {socketAddress.Length}"); 
// Connect is different than the usual "readiness" pattern of other operations. @@ -1603,9 +1782,7 @@ public SocketError ReceiveAsync(Memory buffer, SocketFlags flags, out int public SocketError ReceiveFrom(Memory buffer, ref SocketFlags flags, Memory socketAddress, out int socketAddressLen, int timeout, out int bytesReceived) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); - - Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + ValidateSyncOperationPreconditions(timeout); SocketFlags receivedFlags; SocketError errorCode; @@ -1636,7 +1813,7 @@ public SocketError ReceiveFrom(Memory buffer, ref SocketFlags flags, Memor public unsafe SocketError ReceiveFrom(Span buffer, ref SocketFlags flags, Memory socketAddress, out int socketAddressLen, int timeout, out int bytesReceived) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); + ValidateSyncOperationPreconditions(timeout); SocketFlags receivedFlags; SocketError errorCode; @@ -1748,9 +1925,7 @@ public SocketError ReceiveAsync(IList> buffers, SocketFlags f public SocketError ReceiveFrom(IList> buffers, ref SocketFlags flags, Memory socketAddress, out int socketAddressLen, int timeout, out int bytesReceived) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); - - Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + ValidateSyncOperationPreconditions(timeout); SocketFlags receivedFlags; SocketError errorCode; @@ -1817,9 +1992,7 @@ public SocketError ReceiveFromAsync(IList> buffers, SocketFla public SocketError ReceiveMessageFrom( Memory buffer, ref SocketFlags flags, Memory socketAddress, out int socketAddressLen, bool isIPv4, bool isIPv6, int timeout, out IPPacketInformation ipPacketInformation, out int bytesReceived) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); - - Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + 
ValidateSyncOperationPreconditions(timeout); SocketFlags receivedFlags; SocketError errorCode; @@ -1854,9 +2027,7 @@ public SocketError ReceiveMessageFrom( public unsafe SocketError ReceiveMessageFrom( Span buffer, ref SocketFlags flags, Memory socketAddress, out int socketAddressLen, bool isIPv4, bool isIPv6, int timeout, out IPPacketInformation ipPacketInformation, out int bytesReceived) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); - - Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + ValidateSyncOperationPreconditions(timeout); SocketFlags receivedFlags; SocketError errorCode; @@ -1946,9 +2117,7 @@ public SocketError SendAsync(Memory buffer, int offset, int count, SocketF public SocketError SendTo(byte[] buffer, int offset, int count, SocketFlags flags, Memory socketAddress, int timeout, out int bytesSent) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); - - Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + ValidateSyncOperationPreconditions(timeout); bytesSent = 0; SocketError errorCode; @@ -1978,9 +2147,7 @@ public SocketError SendTo(byte[] buffer, int offset, int count, SocketFlags flag public unsafe SocketError SendTo(ReadOnlySpan buffer, SocketFlags flags, Memory socketAddress, int timeout, out int bytesSent) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); - - Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + ValidateSyncOperationPreconditions(timeout); bytesSent = 0; SocketError errorCode; @@ -2057,9 +2224,7 @@ public SocketError SendAsync(IList> buffers, SocketFlags flag public SocketError SendTo(IList> buffers, SocketFlags flags, Memory socketAddress, int timeout, out int bytesSent) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); - - Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + 
ValidateSyncOperationPreconditions(timeout); bytesSent = 0; int bufferIndex = 0; @@ -2127,9 +2292,7 @@ public SocketError SendToAsync(IList> buffers, SocketFlags fl public SocketError SendFile(SafeFileHandle fileHandle, long offset, long count, int timeout, out long bytesSent) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); - - Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + ValidateSyncOperationPreconditions(timeout); bytesSent = 0; SocketError errorCode; diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Linux.cs new file mode 100644 index 00000000000000..1232968a0433d6 --- /dev/null +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Linux.cs @@ -0,0 +1,5664 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Threading; + +namespace System.Net.Sockets +{ + internal sealed unsafe partial class SocketAsyncEngine + { + /// Lock-free slot-based registry mapping io_uring user_data to managed instances. + private sealed class IoUringOperationRegistry + { + /// Result of attempting to remove a tracked operation by user_data. + internal enum RemoveResult + { + Removed, + NotFound, + Mismatch + } + + private const int GenerationShift = IoUringConstants.SlotIndexBits; + + private struct RegistrySlot + { + public SocketAsyncContext.AsyncOperation? Operation; + public uint Generation; + } + + private readonly RegistrySlot[] _slots; + private int _count; + + /// Initializes the registry with the specified number of completion slots. 
internal IoUringOperationRegistry(int slotCapacity)
{
    // Capacity must be positive: slot index 0 with generation 0 encodes to user_data 0,
    // which TryDecodeUserData treats as "no operation".
    ArgumentOutOfRangeException.ThrowIfNegativeOrZero(slotCapacity);

    _slots = new RegistrySlot[slotCapacity];
}

/// Returns true when no operations are currently tracked.
internal bool IsEmpty => Volatile.Read(ref _count) == 0;
/// Returns the current number of tracked operations.
internal int Count => Volatile.Read(ref _count);

/// Registers an operation by its user_data, returning false on slot collision.
/// False is also returned when the user_data cannot be decoded (zero or out-of-range slot).
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal bool TryTrack(SocketAsyncContext.AsyncOperation operation)
{
    ulong userData = operation.IoUringUserData;
    if (!TryDecodeUserData(userData, out int slotIndex, out uint generation))
    {
        return false;
    }

    ref RegistrySlot slot = ref _slots[slotIndex];
    // CAS from null claims the slot; a non-null result means another operation
    // currently occupies this slot index (collision) and the caller must retry/fallback.
    if (Interlocked.CompareExchange(ref slot.Operation, operation, null) is not null)
    {
        return false;
    }

    // The generation write is ordered after the operation write. A concurrent reader
    // (TryTake) that sees the new operation but a stale generation will correctly reject
    // the take, since the generation mismatch means the CQE references a prior slot
    // incarnation. This is safe because such rejection is treated as a benign late completion.
    Volatile.Write(ref slot.Generation, generation);
    Interlocked.Increment(ref _count);
    AssertIoUringLifecycleTransition(
        IoUringOperationLifecycleState.Prepared,
        IoUringOperationLifecycleState.Submitted);
    return true;
}

/// Atomically removes and returns the operation matching the user_data and generation.
/// Returns false (benign late completion) when the slot is empty or holds a newer incarnation.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal bool TryTake(ulong userData, out SocketAsyncContext.AsyncOperation? operation)
{
    operation = null;
    if (!TryDecodeUserData(userData, out int slotIndex, out uint generation))
    {
        return false;
    }

    ref RegistrySlot slot = ref _slots[slotIndex];
    while (true)
    {
        SocketAsyncContext.AsyncOperation? currentOperation = Volatile.Read(ref slot.Operation);
        if (currentOperation is null)
        {
            return false;
        }

        // Generation mismatch: the slot was recycled after the CQE was produced.
        if (Volatile.Read(ref slot.Generation) != generation)
        {
            return false;
        }

        // CAS to null claims the operation; on contention, re-read and retry.
        if (Interlocked.CompareExchange(ref slot.Operation, null, currentOperation) != currentOperation)
        {
            continue;
        }

        Interlocked.Decrement(ref _count);
        operation = currentOperation;
        return true;
    }
}

/// <summary>
/// Re-attaches a completion owner after dispatch-side deferral (for example, SEND_ZC waiting on NOTIF CQE).
/// Same claim protocol as TryTrack but without the lifecycle assertion.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal bool TryReattach(ulong userData, SocketAsyncContext.AsyncOperation operation)
{
    if (!TryDecodeUserData(userData, out int slotIndex, out uint generation))
    {
        return false;
    }

    ref RegistrySlot slot = ref _slots[slotIndex];
    if (Interlocked.CompareExchange(ref slot.Operation, operation, null) is not null)
    {
        return false;
    }

    Volatile.Write(ref slot.Generation, generation);
    Interlocked.Increment(ref _count);
    return true;
}

/// Removes a tracked operation, optionally verifying it matches an expected reference.
/// Mismatch is reported (not treated as NotFound) so callers can distinguish a foreign occupant.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal RemoveResult TryUntrack(
    ulong userData,
    SocketAsyncContext.AsyncOperation? expectedOperation,
    out SocketAsyncContext.AsyncOperation? removedOperation)
{
    removedOperation = null;
    if (!TryDecodeUserData(userData, out int slotIndex, out uint generation))
    {
        return RemoveResult.NotFound;
    }

    ref RegistrySlot slot = ref _slots[slotIndex];
    while (true)
    {
        SocketAsyncContext.AsyncOperation? currentOperation = Volatile.Read(ref slot.Operation);
        if (currentOperation is null)
        {
            return RemoveResult.NotFound;
        }

        if (Volatile.Read(ref slot.Generation) != generation)
        {
            return RemoveResult.NotFound;
        }

        if (expectedOperation is not null && !ReferenceEquals(currentOperation, expectedOperation))
        {
            return RemoveResult.Mismatch;
        }

        if (Interlocked.CompareExchange(ref slot.Operation, null, currentOperation) != currentOperation)
        {
            continue;
        }

        Interlocked.Decrement(ref _count);
        removedOperation = currentOperation;
        AssertIoUringLifecycleTransition(
            IoUringOperationLifecycleState.Submitted,
            IoUringOperationLifecycleState.Canceled);
        return RemoveResult.Removed;
    }
}

/// Returns whether an operation with the given user_data and generation is currently tracked.
/// NOTE(review): the operation/generation reads are not atomic as a pair; a concurrent
/// recycle between the two reads can yield a stale answer — callers must tolerate that.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal bool Contains(ulong userData)
{
    if (!TryDecodeUserData(userData, out int slotIndex, out uint generation))
    {
        return false;
    }

    ref RegistrySlot slot = ref _slots[slotIndex];
    return Volatile.Read(ref slot.Operation) is not null &&
        Volatile.Read(ref slot.Generation) == generation;
}

/// Returns the tracked operation for the given user_data without untracking it.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal bool TryGet(ulong userData, out SocketAsyncContext.AsyncOperation? operation)
{
    operation = null;
    if (!TryDecodeUserData(userData, out int slotIndex, out uint generation))
    {
        return false;
    }

    ref RegistrySlot slot = ref _slots[slotIndex];
    SocketAsyncContext.AsyncOperation? currentOperation = Volatile.Read(ref slot.Operation);
    if (currentOperation is null)
    {
        return false;
    }

    if (Volatile.Read(ref slot.Generation) != generation)
    {
        return false;
    }

    operation = currentOperation;
    return true;
}

/// <summary>
/// Atomically replaces the tracked operation for the given user_data.
/// Used by persistent multishot receive to attach the next managed operation
/// to an already-armed kernel multishot request.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal bool TryReplace(ulong userData, SocketAsyncContext.AsyncOperation newOperation)
{
    if (!TryDecodeUserData(userData, out int slotIndex, out uint generation))
    {
        return false;
    }

    ref RegistrySlot slot = ref _slots[slotIndex];
    while (true)
    {
        SocketAsyncContext.AsyncOperation? currentOperation = Volatile.Read(ref slot.Operation);
        if (currentOperation is null)
        {
            return false;
        }

        if (Volatile.Read(ref slot.Generation) != generation)
        {
            return false;
        }

        // CAS swaps the occupant in place; the slot generation is left untouched.
        if (Interlocked.CompareExchange(ref slot.Operation, newOperation, currentOperation) == currentOperation)
        {
            return true;
        }
    }
}

/// Removes and yields all tracked operations during teardown.
/// NOTE(review): this is a lazy iterator — slots are only drained as the caller
/// enumerates; an unenumerated call drains nothing. Confirm all callers enumerate fully.
internal IEnumerable DrainAllTrackedOperations()
{
    for (int i = 0; i < _slots.Length; i++)
    {
        SocketAsyncContext.AsyncOperation? operation = Interlocked.Exchange(ref _slots[i].Operation, null);
        if (operation is not null)
        {
            Interlocked.Decrement(ref _count);
            AssertIoUringLifecycleTransition(
                IoUringOperationLifecycleState.Submitted,
                IoUringOperationLifecycleState.Detached);
            yield return operation;
        }
    }
}

/// Extracts the slot index and generation from an encoded user_data value.
/// Layout: low SlotIndexBits bits = slot index, next 32 bits = generation (see IoUringConstants).
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool TryDecodeUserData(ulong userData, out int slotIndex, out uint generation)
{
    // Zero is reserved as "no operation" (it would alias slot 0 / generation 0).
    if (userData == 0)
    {
        slotIndex = 0;
        generation = 0;
        return false;
    }

    slotIndex = (int)(userData & IoUringConstants.SlotIndexMask);
    if ((uint)slotIndex >= (uint)_slots.Length)
    {
        generation = 0;
        return false;
    }

    generation = (uint)((userData >> GenerationShift) & IoUringConstants.GenerationMask);
    return true;
}
}

/// Indicates which io_uring dispatch mode is active for this engine instance.
private enum LinuxIoUringMode : byte
{
    // io_uring not in use; the engine falls back to the epoll-based path.
    Disabled = 0,
    // Socket operations are submitted/completed directly through the ring.
    CompletionMode = 1
}

/// Distinguishes cancellation requests issued during normal runtime from those during engine teardown.
private enum IoUringCancellationOrigin : byte
{
    Runtime = 0,
    Teardown = 1
}

/// Tracks the lifecycle of an io_uring operation for debug assertions on valid state transitions.
private enum IoUringOperationLifecycleState : byte
{
    Queued = 0,
    Prepared = 1,
    Submitted = 2,
    Completed = 3,
    Canceled = 4,
    Detached = 5
}

/// Immutable snapshot of negotiated io_uring capabilities for this engine instance.
private readonly struct LinuxIoUringCapabilities
{
    /// Whether the engine's port was created as an io_uring instance.
    internal bool IsIoUringPort { get; }
    /// The active io_uring dispatch mode.
    internal LinuxIoUringMode Mode { get; }
    /// Whether multishot recv can be used by this engine instance.
    internal bool SupportsMultishotRecv { get; }
    /// Whether multishot accept can be used by this engine instance.
    internal bool SupportsMultishotAccept { get; }
    /// Whether zero-copy send is enabled for this engine instance.
    internal bool SupportsZeroCopySend { get; }
    /// Whether SQPOLL mode is enabled for this engine instance.
    internal bool SqPollEnabled { get; }

    /// Whether the engine is operating in full completion mode.
    internal bool IsCompletionMode =>
        Mode == LinuxIoUringMode.CompletionMode;

    /// Creates a capabilities snapshot with the given port type and mode.
    internal LinuxIoUringCapabilities(
        bool isIoUringPort,
        LinuxIoUringMode mode,
        bool supportsMultishotRecv,
        bool supportsMultishotAccept,
        bool supportsZeroCopySend,
        bool sqPollEnabled)
    {
        IsIoUringPort = isIoUringPort;
        Mode = mode;
        SupportsMultishotRecv = supportsMultishotRecv;
        SupportsMultishotAccept = supportsMultishotAccept;
        SupportsZeroCopySend = supportsZeroCopySend;
        SqPollEnabled = sqPollEnabled;
    }
}

/// Mirrors kernel struct io_uring_sqe (64 bytes), written to the SQ ring for submission.
/// Field offsets must match the kernel UAPI exactly; do not reorder or retype fields.
[StructLayout(LayoutKind.Explicit, Size = 64)]
internal struct IoUringSqe
{
    [FieldOffset(0)]
    internal byte Opcode;
    [FieldOffset(1)]
    internal byte Flags;
    [FieldOffset(2)]
    internal ushort Ioprio;
    [FieldOffset(4)]
    internal int Fd;
    [FieldOffset(8)]
    internal ulong Off;
    [FieldOffset(16)]
    internal ulong Addr;
    [FieldOffset(24)]
    internal uint Len;
    // Operation-specific flags union (rw_flags/send_flags/accept_flags/...).
    [FieldOffset(28)]
    internal uint RwFlags;
    [FieldOffset(32)]
    internal ulong UserData;
    [FieldOffset(40)]
    internal ushort BufIndex;
    [FieldOffset(42)]
    internal ushort Personality;
    [FieldOffset(44)]
    internal int SpliceFdIn;
    [FieldOffset(48)]
    internal ulong Addr3;
}

/// Mirrors kernel struct io_uring_probe_op (8 bytes per entry in the probe ops array).
[StructLayout(LayoutKind.Explicit, Size = 8)]
private struct IoUringProbeOp
{
    [FieldOffset(0)] internal byte Op;
    [FieldOffset(1)] internal byte Resv;
    [FieldOffset(2)] internal ushort Flags;
    // 4 bytes reserved at offset 4
}

/// Mirrors kernel struct io_uring_probe (16-byte header preceding the variable-length ops array).
[StructLayout(LayoutKind.Explicit, Size = 16)]
private struct IoUringProbeHeader
{
    [FieldOffset(0)] internal byte LastOp;
    [FieldOffset(1)] internal byte OpsLen;
    // 14 bytes reserved at offset 2
}

/// <summary>
/// Kernel ABI opcode constants (enum io_uring_op in the kernel UAPI) as a static class
/// (not an enum) to avoid byte-cast noise at every SQE write site, since the SQE Opcode
/// field is typed as byte. Values must match the kernel exactly.
/// </summary>
private static class IoUringOpcodes
{
    internal const byte ReadFixed = 4;    // IORING_OP_READ_FIXED
    internal const byte Send = 26;        // IORING_OP_SEND
    internal const byte Recv = 27;        // IORING_OP_RECV
    internal const byte SendMsg = 9;      // IORING_OP_SENDMSG
    internal const byte RecvMsg = 10;     // IORING_OP_RECVMSG
    internal const byte Accept = 13;      // IORING_OP_ACCEPT
    internal const byte Connect = 16;     // IORING_OP_CONNECT
    // Fixed: IORING_OP_SEND_ZC is 47 and IORING_OP_SENDMSG_ZC is 48 (added in kernels
    // 6.0/6.1). The previous values 53/54 are IORING_OP_FUTEX_WAITV and
    // IORING_OP_FIXED_FD_INSTALL, which would submit entirely different operations.
    internal const byte SendZc = 47;      // IORING_OP_SEND_ZC
    internal const byte SendMsgZc = 48;   // IORING_OP_SENDMSG_ZC
    internal const byte AsyncCancel = 14; // IORING_OP_ASYNC_CANCEL
    internal const byte PollAdd = 6;      // IORING_OP_POLL_ADD
    internal const byte PollRemove = 7;   // IORING_OP_POLL_REMOVE
}

/// <summary>
/// Centralizes io_uring ABI constants that mirror the native definitions in pal_io_uring.c.
/// These are used by managed code that directly interacts with the io_uring submission
/// and completion rings (e.g., direct SQE writes via mmap'd ring access)
/// </summary>
private static class IoUringConstants
{
    // Setup flags (io_uring_setup params.flags)
    internal const uint SetupCqSize = 1u << 3;        // IORING_SETUP_CQSIZE
    // Fixed: IORING_SETUP_SQPOLL is bit 1; bit 5 is IORING_SETUP_ATTACH_WQ, which would
    // make setup attach to another ring's workqueue instead of enabling SQ polling.
    internal const uint SetupSqPoll = 1u << 1;        // IORING_SETUP_SQPOLL
    internal const uint SetupSubmitAll = 1u << 7;     // IORING_SETUP_SUBMIT_ALL
    internal const uint SetupCoopTaskrun = 1u << 8;   // IORING_SETUP_COOP_TASKRUN
    internal const uint SetupSingleIssuer = 1u << 12; // IORING_SETUP_SINGLE_ISSUER
    internal const uint SetupDeferTaskrun = 1u << 13; // IORING_SETUP_DEFER_TASKRUN
    internal const uint SetupNoSqArray = 1u << 16;    // IORING_SETUP_NO_SQARRAY

    // Feature flags (io_uring_params.features)
    internal const uint FeatureSingleMmap = 1u << 0;  // IORING_FEAT_SINGLE_MMAP
    internal const uint FeatureExtArg = 1u << 8;      // IORING_FEAT_EXT_ARG

    // Enter flags (io_uring_enter flags parameter)
    internal const uint EnterGetevents = 1u << 0;      // IORING_ENTER_GETEVENTS
    internal const uint EnterSqWakeup = 1u << 1;       // IORING_ENTER_SQ_WAKEUP
    internal const uint EnterExtArg = 1u << 3;         // IORING_ENTER_EXT_ARG
    internal const uint EnterRegisteredRing = 1u << 4; // IORING_ENTER_REGISTERED_RING

    // SQ ring flags (sq_ring->flags)
    internal const uint SqNeedWakeup = 1u << 0;        // IORING_SQ_NEED_WAKEUP

    // Register opcodes (io_uring_register(2))
    internal const uint RegisterBuffers = 0;
    internal const uint UnregisterBuffers = 1;
    internal const uint RegisterFiles = 2;
    internal const uint UnregisterFiles = 3;
    internal const uint RegisterFilesUpdate = 6;
    internal const uint RegisterProbe = 8;
    internal const uint RegisterRingFds = 20;
    internal const uint UnregisterRingFds = 21;
    internal const uint RegisterPbufRing = 22;
    internal const uint UnregisterPbufRing = 23;

    // Register helper values
    internal const uint RegisterOffsetAuto = 0xFFFFFFFFU;

    // Probe op flags
    internal const uint ProbeOpFlagSupported = 1u << 0; // IO_URING_OP_SUPPORTED

    // Poll flags
    internal const uint PollAddFlagMulti = 1u << 0;     // IORING_POLL_ADD_MULTI

    // CQE flags
    internal const uint CqeFBuffer = 1u << 0; // IORING_CQE_F_BUFFER (buffer id in upper bits)
    internal const uint CqeFMore = 1u << 1;   // IORING_CQE_F_MORE (multishot)
    // Fixed: IORING_CQE_F_NOTIF is bit 3; bit 2 is IORING_CQE_F_SOCK_NONEMPTY, so zero-copy
    // notification CQEs would be misclassified whenever the socket still had queued data.
    internal const uint CqeFNotif = 1u << 3;  // IORING_CQE_F_NOTIF (zero-copy notification)
    internal const int CqeBufferShift = 16;   // IORING_CQE_BUFFER_SHIFT

    // Recv ioprio flags
    internal const ushort RecvMultishot = 1 << 1;   // IORING_RECV_MULTISHOT
    // Accept ioprio flags
    internal const ushort AcceptMultishot = 1 << 0; // IORING_ACCEPT_MULTISHOT

    // SQE flags
    internal const byte SqeFixedFile = 1 << 0;    // IOSQE_FIXED_FILE
    internal const byte SqeBufferSelect = 1 << 5; // IOSQE_BUFFER_SELECT

    // Sizing
    internal const uint QueueEntries = 1024;
    internal const uint CqEntriesFactor = 4;
    internal const uint MaxCqeDrainBatch = 128;
    internal const long BoundedWaitTimeoutNanos = 50L * 1000 * 1000; // 50ms

    // Registration sizing
    internal const uint RegistrationBucketCountMin = 2048;
    internal const uint RegistrationBucketCountFactor = 8;
    internal const uint RegistrationBucketCountMax = 32768;
    internal const uint RegisteredFileSlotCountFactor = 4;

    // Completion operation pool sizing
    internal const int CompletionOperationPoolCapacityFactor = 2;

    // mmap offsets (from kernel UAPI: IORING_OFF_SQ_RING, IORING_OFF_CQ_RING, IORING_OFF_SQES)
    internal const ulong OffSqRing = 0;
    internal const ulong OffCqRing = 0x8000000;
    internal const ulong OffSqes = 0x10000000;

    // Minimum kernel version for io_uring engine
    internal const int MinKernelMajor = 6;
    internal const int MinKernelMinor = 1;

    // Zero-copy send size threshold (payloads below this use regular send).
    internal const int ZeroCopySendThreshold = 16384; // 16KB

    // User data tag values (encoded in upper bits of user_data)
    internal const byte TagNone = 0;
    internal const byte TagPollReadiness = 1;
    internal const byte TagReservedCompletion = 2;
    internal const byte TagWakeupSignal = 3;

    // Message inline capacities (avoid heap allocation on common small payloads)
    internal const int MessageInlineIovCount = 4;
    internal const int MessageInlineSocketAddressCapacity = 128; // sizeof(sockaddr_storage)
    internal const int MessageInlineControlBufferCapacity = 128;

    // Internal discriminator for io_uring vs epoll fallback detection
    internal const int NotSocketEventPort = int.MinValue + 1;

    // Completion slot encoding: low SlotIndexBits bits = slot index, next 32 bits = generation.
    internal const int SlotIndexBits = 24;
    internal const ulong SlotIndexMask = (1UL << SlotIndexBits) - 1UL;
    internal const uint GenerationMask = uint.MaxValue;

    // Test hook opcode masks (mirrors IoUringTestOpcodeMask in pal_io_uring.c)
    internal const byte TestOpcodeMaskNone = 0;
    internal const byte TestOpcodeMaskSend = 1 << 0;
    internal const byte TestOpcodeMaskRecv = 1 << 1;
    internal const byte TestOpcodeMaskSendMsg = 1 << 2;
    internal const byte TestOpcodeMaskRecvMsg = 1 << 3;
    internal const byte TestOpcodeMaskAccept = 1 << 4;
    internal const byte TestOpcodeMaskConnect = 1 << 5;
    internal const byte TestOpcodeMaskSendZc = 1 << 6;
    internal const byte TestOpcodeMaskSendMsgZc = 1 << 7;
}

/// <summary>
/// Captures the results of io_uring_setup(2) including ring fd, negotiated params, and feature flags.
/// </summary>
private struct IoUringSetupResult
{
    internal int RingFd;
    internal Interop.Sys.IoUringParams Params;
    internal uint NegotiatedFlags;
    internal bool UsesExtArg;
    internal bool SqPollNegotiated;
}

/// Discriminates completion slot metadata shape for operation-specific post-completion processing.
private enum IoUringCompletionOperationKind : byte
{
    None = 0,
    Accept = 1,
    Message = 2,
}

/// <summary>
/// Hot per-slot metadata used on every CQE dispatch.
/// Keep this minimal; native pointer-heavy state lives in <see cref="IoUringCompletionSlotStorage"/>.
/// </summary>
private struct IoUringCompletionSlot
{
    public uint Generation;
    public IoUringCompletionOperationKind Kind;
#if DEBUG
    // Debug-only test hook: when set, the forced result overrides the kernel CQE result.
    public bool HasTestForcedResult;
    public int TestForcedResult;
#endif
    public bool IsZeroCopySend;
    public bool ZeroCopyNotificationPending;
    public bool UsesFixedRecvBuffer;
    public ushort FixedRecvBufferId;
    public int FreeListNext; // -1 = end of free list
}

/// <summary>
/// Cold per-slot native metadata: pointers and message writeback state needed only for
/// operation-specific completion processing.
/// </summary>
private struct IoUringCompletionSlotStorage
{
    // Accept metadata
    public unsafe int* NativeSocketAddressLengthPtr; // socklen_t* for accept
    // Message metadata (pointers to native-alloc'd msghdr/iovec)
    public IntPtr NativeMsgHdrPtr;
    public IntPtr NativeMessageStorage; // heap-allocated contiguous block (or null for inline)
    public bool MessageIsReceive;
    // Message metadata - deep-copied native msghdr constituents (point into NativeMessageStorage block)
    public unsafe Interop.Sys.IOVector* NativeIOVectors;
    public unsafe byte* NativeSocketAddress;
    public unsafe byte* NativeControlBuffer;
    // RecvMsg output capture - pointers back to managed MessageHeader buffers for writeback
    public unsafe byte* ReceiveOutputSocketAddress;
    public unsafe byte* ReceiveOutputControlBuffer;
    public int ReceiveSocketAddressCapacity;
    public int ReceiveControlBufferCapacity;
}

/// <summary>
/// Mirrors the kernel's struct msghdr layout for direct SQE submission.
/// Used to build a native msghdr that io_uring sendmsg/recvmsg opcodes can consume directly.
/// Must only be used on 64-bit Linux where sizeof(msghdr) == 56.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
private unsafe struct NativeMsghdr
{
    public void* msg_name;
    public uint msg_namelen;
    // On x64, 4 bytes of padding are inserted by sequential layout before the next pointer.
    public Interop.Sys.IOVector* msg_iov;
    public nuint msg_iovlen;
    public void* msg_control;
    public nuint msg_controllen;
    public int msg_flags;
}

/// Tracks per-socket epoll/poll registration state including registered file index and active events.
private sealed class SocketEventRegistration
{
    public int Socket;
    // -1 = not registered in the fixed-file table.
    public int RegisteredFileIndex = -1;
    public Interop.Sys.SocketEvents Events;
    public uint PollEvents;
    public UIntPtr Data;
    public ulong RequestId;
}

/// Cross-thread request to modify a socket's event registration, completed by the event loop thread.
private sealed class RegistrationChangeRequest : IDisposable
{
    public int Socket;
    public Interop.Sys.SocketEvents NewEvents;
    public UIntPtr Data;
    public volatile Interop.Error Error;
    public volatile bool Completed;
    public ManualResetEventSlim CompletionEvent = new ManualResetEventSlim(false);

    public void Dispose() => CompletionEvent.Dispose();
}

// How often (in event-loop iterations) diagnostic counters are sampled/published.
private const int IoUringDiagnosticsPollInterval = 64;
private const long DiagnosticSampleMask = 0x3F;
// Per-submit drain caps prevent the event loop from starving CQE processing.
private const int MaxIoUringPrepareQueueDrainPerSubmit = 256;
private const int MaxIoUringCancelQueueDrainPerSubmit = 256;
private const int MaxSlotExhaustionRetries = 3;
private const int MaxIoUringSqeAcquireSubmitAttempts = 16;
// user_data layout: low 56 bits payload, top 8 bits tag (see IoUringConstants.Tag*).
private const ulong IoUringUserDataPayloadMask = 0x00FF_FFFF_FFFF_FFFFUL;
private const int IoUringUserDataTagShift = 56;
private static readonly int s_ioUringPrepareQueueCapacity = GetIoUringPrepareQueueCapacity();
private static readonly int s_ioUringCancellationQueueCapacity = s_ioUringPrepareQueueCapacity;
private static long s_ioUringPollReadinessCqeCount;
private static long s_ioUringPendingRetryQueuedToPrepareQueueCount;
// NOTE: this declaration continues on the following source line.
private
static long s_ioUringPublishedNonPinnablePrepareFallbackCount; + private static int s_ioUringPublishingNonPinnablePrepareFallback; + private MpscQueue? _ioUringPrepareQueue; + private MpscQueue? _ioUringCancelQueue; + private long _ioUringPrepareQueueLength; + private long _ioUringCancelQueueLength; + private long _ioUringPrepareQueueOverflowCount; + private long _ioUringCancelQueueOverflowCount; + private long _ioUringPrepareQueueOverflowFallbackCount; + private long _ioUringCompletionSlotExhaustionCount; + private long _ioUringCompletionSlotDrainRecoveryCount; + private long _ioUringPublishedPrepareQueueLength; + private long _ioUringBenignLateCompletionCount; + private long _ioUringCompletionRequeueFailureCount; + private long _ioUringUntrackMismatchCount; + private long _ioUringPublishedPrepareQueueOverflowCount; + private long _ioUringPublishedPrepareQueueOverflowFallbackCount; + private long _ioUringPublishedCompletionRequeueFailureCount; + private long _ioUringPublishedCompletionSlotExhaustionCount; + private long _ioUringPublishedCompletionSlotDrainRecoveryCount; + private int _ioUringDiagnosticsPollCountdown; + private bool _ioUringAdvancedFeatureStateLogged; + private int _ioUringWakeupRequested; + private int _ioUringPortClosedForTeardown; + private int _ioUringTeardownInitiated; + private int _ioUringSlotCapacity; + private bool _completionSlotDrainInProgress; + private uint _ioUringManagedPendingSubmissions; + private uint _ioUringManagedSqTail; + private bool _ioUringManagedSqTailLoaded; + private Interop.Sys.IoUringSqRingInfo _ioUringSqRingInfo; + private bool _ioUringDirectSqeEnabled; + + // Per-opcode support flags, populated by ProbeIoUringOpcodeSupport. 
+ private bool _supportsOpSend; + private bool _supportsOpReadFixed; + private bool _supportsOpRecv; + private bool _supportsOpSendMsg; + private bool _supportsOpRecvMsg; + private bool _supportsOpAccept; + private bool _supportsOpConnect; + private bool _supportsOpSendZc; + private bool _supportsOpSendMsgZc; + private bool _supportsOpAsyncCancel; + private bool _supportsOpPollAdd; + private bool _supportsMultishotRecv; + private bool _supportsMultishotAccept; + private bool _supportsProvidedBufferRings; + private bool _zeroCopySendEnabled; + + // Managed ring state (populated by TryMmapRings, replaces native-provided IoUringSqRingInfo) + private unsafe Interop.Sys.IoUringCqe* _managedCqeBase; + private unsafe uint* _managedCqTailPtr; + private unsafe uint* _managedCqHeadPtr; + private uint _managedCqMask; + private uint _managedCqEntries; + private unsafe uint* _managedCqOverflowPtr; + private uint _managedObservedCqOverflow; + private unsafe byte* _managedSqRingPtr; + private unsafe byte* _managedCqRingPtr; + private unsafe uint* _managedSqFlagsPtr; + private ulong _managedSqRingSize; + private ulong _managedCqRingSize; + private ulong _managedSqesSize; + private bool _managedUsesSingleMmap; + private int _managedRingFd; + private bool _managedUsesExtArg; + private bool _managedUsesNoSqArray; + private uint _managedNegotiatedFlags; + private bool _sqPollEnabled; + private uint _managedCachedCqHead; + private bool _ioUringInitialized; + private bool _managedCqDrainEnabled; + private int _managedWakeupEventFd = -1; + private IoUringProvidedBufferRing? _ioUringProvidedBufferRing; + private bool _ioUringBuffersRegistered; + private ushort _ioUringProvidedBufferGroupId; + private Dictionary? _registrationsBySocket; + private Dictionary? _registrationsByRequestId; + private ulong _nextRequestId; + // Note: _registrationChangeQueue is only allocated when RequiresPollReadiness() is true, + // which is not the case in pure completion mode. 
It is not a target for MPSC migration + // until/unless a non-completion-mode io_uring path is reactivated. + private ConcurrentQueue? _registrationChangeQueue; + private IoUringCompletionSlot[]? _completionSlots; + private IoUringCompletionSlotStorage[]? _completionSlotStorage; + private System.Buffers.MemoryHandle[]? _zeroCopyPinHolds; + private int _completionSlotFreeListHead = -1; + private int _completionSlotsInUse; + +#if DEBUG + // Test hook state: forced completion result injection (mirrors native pal_io_uring.c test hooks). + private byte _testForceEagainOnceMask; + private byte _testForceEcanceledOnceMask; +#endif + + // Registered-file table state + private int[]? _registeredFiles; // slot -> fd mapping (-1 = empty) + private uint[]? _registeredFileFreeSlots; // stack of free slot indices + private uint _registeredFileFreeSlotCount; + private int _registeredFileHotSocket = -1; + private int _registeredFileHotIndex = -1; + private bool _usesRegisteredFiles; + + private LinuxIoUringCapabilities _ioUringCapabilities; + /// Whether this engine instance is using io_uring completion mode. + internal bool IsIoUringCompletionModeEnabled => _ioUringCapabilities.IsCompletionMode; + /// Whether managed direct SQE submission is enabled. + internal bool IsIoUringDirectSqeEnabled => _ioUringDirectSqeEnabled; + /// Whether a connected send payload is eligible for the SEND_ZC path. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal bool ShouldTryIoUringDirectSendZeroCopy(int payloadLength) => + IsIoUringZeroCopySendEligible(payloadLength, requiresSendMessageOpcode: false); + /// Whether a message-based send payload is eligible for the SENDMSG_ZC path. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal bool ShouldTryIoUringDirectSendMessageZeroCopy(int payloadLength) => + IsIoUringZeroCopySendEligible(payloadLength, requiresSendMessageOpcode: true); + private IoUringOperationRegistry? 
_ioUringOperationRegistry; + + /// + /// Centralized zero-copy policy: + /// 1) process-level opt-in, 2) opcode support, 3) payload threshold. + /// The threshold is based on total payload bytes so buffer-list workloads (e.g. 4KB segments) + /// are eligible once the aggregate payload crosses the cutoff. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool IsIoUringZeroCopySendEligible(int payloadLength, bool requiresSendMessageOpcode) + { + if (!_zeroCopySendEnabled || payloadLength < IoUringConstants.ZeroCopySendThreshold) + { + return false; + } + + return requiresSendMessageOpcode ? _supportsOpSendMsgZc : _supportsOpSendZc; + } + + /// Reads the process-wide count of poll-readiness CQEs observed by managed completion drains. + internal static long GetIoUringPollReadinessCqeCount() => + Interlocked.Read(ref s_ioUringPollReadinessCqeCount); + + /// + /// Reads the process-wide count of pending completions that had to requeue through the prepare queue + /// after inline completion-mode re-prepare was not used. + /// + internal static long GetIoUringPendingRetryQueuedToPrepareQueueCount() => + Interlocked.Read(ref s_ioUringPendingRetryQueuedToPrepareQueueCount); + + private static int GetIoUringPrepareQueueCapacity() + { +#if DEBUG + if (Environment.GetEnvironmentVariable( + "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PREPARE_QUEUE_CAPACITY") is string configuredValue && + int.TryParse(configuredValue, out int configuredCapacity) && + configuredCapacity > 0) + { + return configuredCapacity; + } +#endif + + // Raised default to reduce fallback frequency under bursty load. + int scaledCapacity = s_eventBufferCount >= 32 ? checked(s_eventBufferCount * 4) : 512; + return Math.Max(scaledCapacity, 512); + } + + /// Creates a capabilities snapshot based on whether the port is io_uring. 
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static LinuxIoUringCapabilities ResolveLinuxIoUringCapabilities(bool isIoUringPort) =>
    new LinuxIoUringCapabilities(
        isIoUringPort,
        isIoUringPort ? LinuxIoUringMode.CompletionMode : LinuxIoUringMode.Disabled,
        supportsMultishotRecv: false,
        supportsMultishotAccept: false,
        supportsZeroCopySend: false,
        sqPollEnabled: false);

/// <summary>
/// Encodes a tag byte and payload into a 64-bit user_data value.
/// The tag occupies the top 8 bits; the payload is masked to the low 56 bits.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ulong EncodeIoUringUserData(byte tag, ulong payload) =>
    ((ulong)tag << IoUringUserDataTagShift) | (payload & IoUringUserDataPayloadMask);

/// <summary>
/// Reads the next CQE from the completion ring without advancing the head.
/// Returns false when the ring is empty (cached head equals the kernel-published tail).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe bool TryPeekNextCqe(out Interop.Sys.IoUringCqe* cqe)
{
    Debug.Assert(IsCurrentThreadEventLoopThread(),
        "TryPeekNextCqe must only be called from the event loop thread (SINGLE_ISSUER contract).");
    cqe = null;
    // Acquire-read the kernel-written tail; the cached head is only mutated on this thread.
    uint cqTail = Volatile.Read(ref *_managedCqTailPtr);
    if (_managedCachedCqHead == cqTail) return false;
    uint index = _managedCachedCqHead & _managedCqMask;
    cqe = _managedCqeBase + index;
    return true;
}

/// <summary>Advances the CQ head pointer by the given count, making slots available to the kernel.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe void AdvanceCqHead(uint count)
{
    Debug.Assert(IsCurrentThreadEventLoopThread(),
        "AdvanceCqHead must only be called from the event loop thread (SINGLE_ISSUER contract).");
    _managedCachedCqHead += count;
    // Release-publish the new head so the kernel can reuse the consumed CQE slots.
    Volatile.Write(ref *_managedCqHeadPtr, _managedCachedCqHead);
}

/// <summary>
/// Drains up to <see cref="IoUringConstants.MaxCqeDrainBatch"/> CQEs from the mmap'd
/// completion ring and dispatches each based on the user_data tag.
/// Tag=2 (reserved completion) entries are dispatched directly through the handler.
/// Tag=3 (wakeup signal) entries are handled inline.
/// Tag=1 (poll readiness) entries are retained for readiness mode fallback.
/// Returns true when at least one CQE was drained.
/// </summary>
private unsafe bool DrainCqeRingBatch(SocketEventHandler handler)
{
    Debug.Assert(IsCurrentThreadEventLoopThread(),
        "DrainCqeRingBatch must only be called from the event loop thread (SINGLE_ISSUER contract).");
    ObserveManagedCqOverflowCounter();
    int drained = 0;
    bool drainedAnyCqe = false;
    bool enqueuedFallbackEvent = false;

    while (drained < (int)IoUringConstants.MaxCqeDrainBatch
        && TryPeekNextCqe(out Interop.Sys.IoUringCqe* cqe))
    {
        drainedAnyCqe = true;
        // Copy the CQE fields before advancing the head; the slot may be reused by the kernel afterwards.
        ulong userData = cqe->UserData;
        int result = cqe->Result;
        uint flags = cqe->Flags;
        AdvanceCqHead(1);

        byte tag = (byte)(userData >> IoUringUserDataTagShift);
        ulong payload = userData & IoUringUserDataPayloadMask;

        switch (tag)
        {
            case IoUringConstants.TagReservedCompletion:
                if ((flags & IoUringConstants.CqeFNotif) != 0)
                {
                    // CQE_F_NOTIF: zero-copy send notification, not a data completion.
                    if (HandleZeroCopyNotification(payload))
                    {
                        handler.DispatchZeroCopyIoUringNotification(payload);
                    }

                    break;
                }

                // CQE_F_MORE on a data completion indicates a multishot op (recv/accept) that
                // stays armed; only treat it as multishot when the slot kind and capability agree.
                bool isMultishotCompletion = false;
                if ((flags & IoUringConstants.CqeFMore) != 0)
                {
                    IoUringCompletionSlot[]? completionSlots = _completionSlots;
                    int slotIndex = DecodeCompletionSlotIndex(payload);
                    if (completionSlots is not null &&
                        (uint)slotIndex < (uint)completionSlots.Length)
                    {
                        IoUringCompletionOperationKind kind = completionSlots[slotIndex].Kind;
                        isMultishotCompletion =
                            (kind == IoUringCompletionOperationKind.Message && _ioUringCapabilities.SupportsMultishotRecv) ||
                            (kind == IoUringCompletionOperationKind.Accept && _ioUringCapabilities.SupportsMultishotAccept);
                    }
                }
                ResolveReservedCompletionSlotMetadata(
                    payload,
                    isMultishotCompletion,
                    ref result,
                    out int completionSocketAddressLen,
                    out int completionControlBufferLen,
                    out uint completionAuxiliaryData,
                    out bool hasFixedRecvBuffer,
                    out ushort fixedRecvBufferId);

                if (isMultishotCompletion)
                {
                    handler.DispatchMultishotIoUringCompletion(
                        payload,
                        result,
                        flags,
                        completionSocketAddressLen,
                        completionControlBufferLen,
                        completionAuxiliaryData,
                        hasFixedRecvBuffer,
                        fixedRecvBufferId,
                        ref enqueuedFallbackEvent);
                }
                else
                {
                    handler.DispatchSingleIoUringCompletion(
                        payload,
                        result,
                        flags,
                        completionSocketAddressLen,
                        completionControlBufferLen,
                        completionAuxiliaryData,
                        hasFixedRecvBuffer,
                        fixedRecvBufferId,
                        ref enqueuedFallbackEvent);
                }
                break;
            case IoUringConstants.TagWakeupSignal:
                HandleManagedWakeupSignal(result);
                break;
            case IoUringConstants.TagPollReadiness:
                HandlePollReadinessCqe(payload, result, flags);
                break;
            default:
                break; // Unknown tag - silently ignore.
        }

        drained++;
    }

    if (enqueuedFallbackEvent)
    {
        EnsureWorkerScheduled();
    }

    return drainedAnyCqe;
}

/// <summary>
/// Resolves metadata for a reserved completion by applying forced test results and
/// copying operation-specific completion outputs (accept/recvmsg) from native storage.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void ResolveReservedCompletionSlotMetadata(
    ulong payload,
    bool isMultishotCompletion,
    ref int result,
    out int completionSocketAddressLen,
    out int completionControlBufferLen,
    out uint completionAuxiliaryData,
    out bool hasFixedRecvBuffer,
    out ushort fixedRecvBufferId)
{
    completionSocketAddressLen = 0;
    completionControlBufferLen = 0;
    completionAuxiliaryData = 0;
    hasFixedRecvBuffer = false;
    fixedRecvBufferId = 0;

    int slotIndex = DecodeCompletionSlotIndex(payload);
    if ((uint)slotIndex >= (uint)_completionSlots!.Length)
    {
        // Stale or corrupt payload: leave the default outputs and do not touch any slot.
        return;
    }

    ref IoUringCompletionSlot slot = ref _completionSlots[slotIndex];
    ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];

    // DEBUG-only test hook: may overwrite `result` with a forced errno.
    ResolveDebugTestForcedResult(ref slot, ref result);

    if (slot.UsesFixedRecvBuffer)
    {
        hasFixedRecvBuffer = true;
        fixedRecvBufferId = slot.FixedRecvBufferId;
        slot.UsesFixedRecvBuffer = false;
        slot.FixedRecvBufferId = 0;
        Debug.Assert(!isMultishotCompletion, "Fixed-buffer receive completions are expected to be one-shot.");
    }

    if (slot.Kind == IoUringCompletionOperationKind.Accept &&
        slotStorage.NativeSocketAddressLengthPtr is not null)
    {
        // The kernel wrote the peer address length into native storage; surface it as auxiliary data.
        int nativeSocketAddressLength = *slotStorage.NativeSocketAddressLengthPtr;
        completionAuxiliaryData = nativeSocketAddressLength >= 0 ? (uint)nativeSocketAddressLength : 0u;
        if (isMultishotCompletion)
        {
            // Multishot accept reuses the same storage: reset the in/out length to full capacity
            // so the next completion gets an untruncated address.
            int socketAddressCapacity = slotStorage.ReceiveSocketAddressCapacity;
            *slotStorage.NativeSocketAddressLengthPtr = socketAddressCapacity >= 0 ? socketAddressCapacity : 0;
        }
    }
    else if (slot.Kind == IoUringCompletionOperationKind.Message)
    {
        CopyMessageCompletionOutputs(
            slotIndex,
            out completionSocketAddressLen,
            out completionControlBufferLen,
            out completionAuxiliaryData);
    }

    if (!isMultishotCompletion)
    {
        if (!slot.IsZeroCopySend)
        {
            FreeCompletionSlot(slotIndex);
        }
        else if (result < 0)
        {
            // Error completion path may not produce a NOTIF CQE.
            FreeCompletionSlot(slotIndex);
        }
        else if (!slot.ZeroCopyNotificationPending)
        {
            // First CQE for zero-copy send: keep slot alive until NOTIF CQE arrives.
            slot.ZeroCopyNotificationPending = true;
            AssertZeroCopyNotificationPendingForPayload(payload);
        }
    }
}

/// <summary>Handles NOTIF CQEs for zero-copy sends and releases retained completion slots.</summary>
/// <returns>True when a pending zero-copy slot was released; false for stale or mismatched payloads.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool HandleZeroCopyNotification(ulong payload)
{
    IoUringCompletionSlot[]? completionSlots = _completionSlots;
    if (completionSlots is null)
    {
        return false;
    }

    int slotIndex = DecodeCompletionSlotIndex(payload);
    if ((uint)slotIndex >= (uint)completionSlots.Length)
    {
        return false;
    }

    ref IoUringCompletionSlot slot = ref completionSlots[slotIndex];
    if (!slot.IsZeroCopySend || !slot.ZeroCopyNotificationPending)
    {
        return false;
    }

    slot.IsZeroCopySend = false;
    slot.ZeroCopyNotificationPending = false;
    FreeCompletionSlot(slotIndex);
    return true;
}

/// <summary>Returns true when the completion slot for <paramref name="userData"/> is waiting on SEND_ZC NOTIF.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool IsZeroCopyNotificationPending(ulong userData)
{
    IoUringCompletionSlot[]? completionSlots = _completionSlots;
    if (completionSlots is null)
    {
        return false;
    }

    int slotIndex = DecodeCompletionSlotIndex(userData & IoUringUserDataPayloadMask);
    if ((uint)slotIndex >= (uint)completionSlots.Length)
    {
        return false;
    }

    ref IoUringCompletionSlot slot = ref completionSlots[slotIndex];
    return slot.IsZeroCopySend && slot.ZeroCopyNotificationPending;
}

/// <summary>Debug assertion that a reserved completion payload remains armed for SEND_ZC NOTIF.</summary>
[Conditional("DEBUG")]
private void AssertZeroCopyNotificationPendingForPayload(ulong payload)
{
    ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload);
    Debug.Assert(
        IsZeroCopyNotificationPending(userData),
        "SEND_ZC first CQE must leave the completion slot pending until NOTIF CQE arrives.");
}

/// <summary>Debug assertion that SEND_ZC completion dispatch is deferred until NOTIF arrives.</summary>
[Conditional("DEBUG")]
private void AssertZeroCopyDeferredCompletionState(ulong userData, SocketAsyncContext.AsyncOperation operation)
{
    Debug.Assert(
        operation.IoUringUserData == userData,
        "Deferred SEND_ZC completion must retain the original user_data until NOTIF CQE dispatch.");
    Debug.Assert(
        IsZeroCopyNotificationPending(userData),
        "Deferred SEND_ZC completion requires an armed NOTIF state.");
}

/// <summary>Observes kernel CQ overflow count deltas and emits telemetry/logs.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe void ObserveManagedCqOverflowCounter()
{
    if (_managedCqOverflowPtr is null)
    {
        return;
    }

    // The kernel-maintained overflow counter is monotonically increasing; report only the delta
    // since the last observation so telemetry counts each overflow once.
    uint observedOverflow = Volatile.Read(ref *_managedCqOverflowPtr);
    uint previousOverflow = _managedObservedCqOverflow;
    if (observedOverflow <= previousOverflow)
    {
        return;
    }

    uint delta = observedOverflow - previousOverflow;
    _managedObservedCqOverflow = observedOverflow;
    SocketsTelemetry.Log.IoUringCqOverflow(delta);

    if (NetEventSource.Log.IsEnabled())
    {
        LogIoUringCqOverflow(observedOverflow, delta);
    }
}

/// <summary>
/// Handles a poll-readiness CQE and records diagnostics in completion mode,
/// where such CQEs are not expected (see the Debug.Assert).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void HandlePollReadinessCqe(ulong payload, int result, uint flags)
{
    Debug.Assert(!_ioUringInitialized || !_ioUringCapabilities.IsCompletionMode,
        "Unexpected poll readiness CQE in pure io_uring completion mode");
    if (_ioUringCapabilities.IsCompletionMode)
    {
        // Release builds still count the unexpected CQE for diagnostics instead of asserting.
        RecordIoUringPollReadinessCqe();
    }

    DispatchManagedPollReadinessCqe(payload, result, flags);
}

/// <summary>
/// Handles a wakeup signal CQE by consuming the eventfd counter.
/// EAGAIN from the read is expected (another wakeup may have already consumed the counter).
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
private unsafe void HandleManagedWakeupSignal(int cqeResult)
{
    if (cqeResult >= 0 && _managedWakeupEventFd >= 0)
    {
        ulong value;
        Interop.Error readError = Interop.Sys.IoUringShimReadEventFd(_managedWakeupEventFd, &value);
        if (readError != Interop.Error.SUCCESS &&
            readError != Interop.Error.EAGAIN &&
            NetEventSource.Log.IsEnabled())
        {
            LogWakeupReadFailure(this, readError);
        }
    }

    // Local function kept NoInlining so the logging path stays off the hot path.
    [MethodImpl(MethodImplOptions.NoInlining)]
    static void LogWakeupReadFailure(SocketAsyncEngine engine, Interop.Error readErrorCode)
    {
        NetEventSource.Error(engine, $"io_uring wakeup eventfd read failed: error={readErrorCode}");
    }
}

// Poll event constants matching Linux UAPI definitions.
// POLLIN = 0x0001, POLLOUT = 0x0004, POLLERR = 0x0008, POLLHUP = 0x0010, POLLRDHUP = 0x2000
private const uint PollIn = 0x0001;
private const uint PollOut = 0x0004;
private const uint PollErr = 0x0008;
private const uint PollHup = 0x0010;
private const uint PollRdHup = 0x2000;

/// <summary>Converts SocketEvents to kernel poll event flags; error and hangup bits are always included.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static uint GetIoUringPollEvents(Interop.Sys.SocketEvents events)
{
    // Error and hangup are always of interest regardless of the requested events.
    uint pollEvents = PollErr | PollHup;
    if ((events & Interop.Sys.SocketEvents.Read) != 0)
    {
        pollEvents |= PollIn;
    }
    if ((events & Interop.Sys.SocketEvents.Write) != 0)
    {
        pollEvents |= PollOut;
    }
    if ((events & Interop.Sys.SocketEvents.ReadClose) != 0)
    {
        pollEvents |= PollRdHup;
    }
    return pollEvents;
}

/// <summary>Converts a kernel poll result bitmask to SocketEvents.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Interop.Sys.SocketEvents GetSocketEventsFromPollResult(int result)
{
    uint pollBits = (uint)result;

    // Treat POLLHUP as both read + write ready (hangup means both directions signaled).
    if ((pollBits & PollHup) != 0)
    {
        pollBits = (pollBits & ~PollHup) | PollIn | PollOut;
    }

    int mapped = 0;
    if ((pollBits & PollIn) != 0)
    {
        mapped |= (int)Interop.Sys.SocketEvents.Read;
    }
    if ((pollBits & PollOut) != 0)
    {
        mapped |= (int)Interop.Sys.SocketEvents.Write;
    }
    if ((pollBits & PollRdHup) != 0)
    {
        mapped |= (int)Interop.Sys.SocketEvents.ReadClose;
    }
    if ((pollBits & PollErr) != 0)
    {
        mapped |= (int)Interop.Sys.SocketEvents.Error;
    }
    return (Interop.Sys.SocketEvents)mapped;
}

/// <summary>Looks up the event registration for a socket file descriptor, or null when none exists.</summary>
private SocketEventRegistration? FindRegistrationBySocket(int socket)
{
    SocketEventRegistration? registration = null;
    _ = _registrationsBySocket?.TryGetValue(socket, out registration);
    return registration;
}

/// <summary>Looks up the event registration for a poll request ID, or null when none exists.</summary>
private SocketEventRegistration? FindRegistrationByRequestId(ulong requestId)
{
    SocketEventRegistration? registration = null;
    _ = _registrationsByRequestId?.TryGetValue(requestId, out registration);
    return registration;
}

/// <summary>Returns the existing registration or creates and indexes a new one for the socket.</summary>
private SocketEventRegistration FindOrCreateRegistrationBySocket(int socket)
{
    SocketEventRegistration? existing = FindRegistrationBySocket(socket);
    if (existing is null)
    {
        existing = new SocketEventRegistration { Socket = socket };
        _registrationsBySocket!.Add(socket, existing);
    }

    return existing;
}

/// <summary>Assigns a unique request ID and indexes the registration.</summary>
private void AssignRegistrationRequestId(SocketEventRegistration reg)
{
    reg.RequestId = ++_nextRequestId;
    _registrationsByRequestId!.Add(reg.RequestId, reg);
}

/// <summary>Removes the registration from the request ID index, if it has one.</summary>
private void ClearRegistrationRequestId(SocketEventRegistration reg)
{
    ulong requestId = reg.RequestId;
    if (requestId == 0)
    {
        return;
    }

    _registrationsByRequestId?.Remove(requestId);
    reg.RequestId = 0;
}

/// <summary>Fully removes a socket's event registration from all indexes.</summary>
private void RemoveRegistration(SocketEventRegistration reg)
{
    TryUnregisterRegisteredFileForRegistration(reg);
    ClearRegistrationRequestId(reg);
    _registrationsBySocket?.Remove(reg.Socket);
}

// Raw Linux errno values as returned by the kernel in CQE results (negative).
// These differ from Interop.Error which uses a PAL-specific numbering scheme.
private const int ErrnoECANCELED = 125;
private const int ErrnoEBADF = 9;
private const int ErrnoENOENT = 2;
private const int ErrnoEINVAL = 22;

/// <summary>
/// Dispatches a poll readiness CQE by looking up the registration and raising
/// the appropriate socket events. The CQE result contains the poll events
/// returned by the kernel (or a negative errno on error).
/// </summary>
private void DispatchManagedPollReadinessCqe(ulong requestIdPayload, int cqeResult, uint cqeFlags)
{
    SocketEventRegistration? reg = FindRegistrationByRequestId(requestIdPayload);
    if (reg is null) return;

    UIntPtr registrationData = reg.Data;
    bool removeRegistration = false;
    bool pollStillArmed = false;

    Interop.Sys.SocketEvents events = Interop.Sys.SocketEvents.None;
    if (cqeResult >= 0)
    {
        events = GetSocketEventsFromPollResult(cqeResult);
        // CQE_F_MORE: the multishot poll remains armed in the kernel, so keep the request id.
        if ((cqeFlags & IoUringConstants.CqeFMore) != 0)
        {
            pollStillArmed = true;
        }
    }
    else if (cqeResult != -ErrnoECANCELED && cqeResult != -ErrnoENOENT)
    {
        // ECANCELED/ENOENT are expected during poll removal; anything else surfaces as an error event.
        events = Interop.Sys.SocketEvents.Error;
    }

    // Certain errors require removing the registration entirely.
    if (cqeResult == -ErrnoEBADF ||
        cqeResult == -ErrnoENOENT ||
        cqeResult == -ErrnoEINVAL)
    {
        removeRegistration = true;
    }

    if (!pollStillArmed || removeRegistration)
    {
        ClearRegistrationRequestId(reg);
    }

    if (events != Interop.Sys.SocketEvents.None)
    {
        // Deliver the event through the registered context lookup table.
        // The Data field holds the index into s_registeredContexts, matching
        // how native poll events flow through HandleSocketEvents.
        SocketAsyncContext? context = s_registeredContexts[(int)(nuint)registrationData];

        if (context is not null)
        {
            if (context.PreferInlineCompletions)
            {
                context.HandleEventsInline(events);
            }
            else
            {
                Interop.Sys.SocketEvents filteredEvents = context.HandleSyncEventsSpeculatively(events);

                if (filteredEvents != Interop.Sys.SocketEvents.None)
                {
                    _eventQueue.Enqueue(new SocketIOEvent(context, filteredEvents));
                }
            }
        }
    }

    if (removeRegistration)
    {
        RemoveRegistration(reg);
    }
}

/// <summary>Applies an event registration change for io_uring poll-based readiness.</summary>
partial void LinuxTryChangeSocketEventRegistration(IntPtr socketHandle, Interop.Sys.SocketEvents currentEvents, Interop.Sys.SocketEvents newEvents, int data, ref Interop.Error error, ref bool handled)
{
    if (!_ioUringInitialized) return;
    if (_ioUringCapabilities.IsCompletionMode)
    {
        // In pure completion mode, readiness registration is intentionally disabled.
        handled = true;
        error = Interop.Error.SUCCESS;
        return;
    }

    handled = true;

    int socket = (int)(nint)socketHandle;

    // Check if we're on the event loop thread - apply directly
    int eventLoopThreadId = Volatile.Read(ref _eventLoopManagedThreadId);
    if (eventLoopThreadId != 0 && eventLoopThreadId == Environment.CurrentManagedThreadId)
    {
        error = ApplyManagedRegistrationChange(socket, newEvents, (UIntPtr)(nuint)(uint)data);
        return;
    }

    // Off event loop thread: enqueue and wait for the event loop to apply the change.
    // NOTE(review): this blocks the caller until the event loop services the queue — confirm
    // the event loop is guaranteed to run while requests are pending (teardown path included).
    var request = new RegistrationChangeRequest
    {
        Socket = socket,
        NewEvents = newEvents,
        Data = (UIntPtr)(nuint)(uint)data,
    };

    _registrationChangeQueue!.Enqueue(request);
    WakeEventLoop();

    request.CompletionEvent.Wait();
    error = request.Error;
    request.Dispose();
}

/// <summary>
/// Indicates whether this io_uring mode still relies on poll-readiness registration.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool RequiresPollReadiness()
    => _ioUringCapabilities.IsIoUringPort && !_ioUringCapabilities.IsCompletionMode;

/// <summary>
/// Applies a socket event registration change directly on the event loop thread.
/// Ports the native ApplySocketEventRegistrationChange from pal_io_uring.c.
/// </summary>
private unsafe Interop.Error ApplyManagedRegistrationChange(int socket, Interop.Sys.SocketEvents newEvents, UIntPtr data)
{
    SocketEventRegistration? registration = FindRegistrationBySocket(socket);

    // If existing registration has an active poll, remove it first
    if (registration is not null && registration.RequestId != 0)
    {
        Interop.Error removeError = WritePollRemoveSqe(registration.RequestId);
        if (removeError != Interop.Error.SUCCESS)
            return removeError;
        ClearRegistrationRequestId(registration);
    }

    // If deregistering (newEvents == None), just remove
    if (newEvents == Interop.Sys.SocketEvents.None)
    {
        if (registration is not null)
            RemoveRegistration(registration);
        return Interop.Error.SUCCESS;
    }

    // Create or update registration
    registration ??= FindOrCreateRegistrationBySocket(socket);

    registration.Events = newEvents;
    registration.PollEvents = GetIoUringPollEvents(newEvents);
    registration.Data = data;

    // Write POLL_ADD SQE; on failure the registration is removed so state stays consistent.
    Interop.Error addError = WritePollAddSqe(registration);
    if (addError != Interop.Error.SUCCESS)
    {
        RemoveRegistration(registration);
        return addError;
    }

    return Interop.Error.SUCCESS;
}

/// <summary>
/// Writes a POLL_ADD SQE for the given registration, using multishot mode.
/// Ports the native QueueIoUringPollAdd from pal_io_uring.c.
/// </summary>
private unsafe Interop.Error WritePollAddSqe(SocketEventRegistration registration)
{
    Debug.Assert(registration.RequestId == 0);

    if (!TryAcquireManagedSqeWithRetry(out IoUringSqe* sqe, out Interop.Error submitError))
        return submitError;

    // Generate monotonic request ID (wrapping within the 56-bit payload space; 0 is reserved).
    do { _nextRequestId++; }
    while ((_nextRequestId & IoUringUserDataPayloadMask) == 0);
    ulong requestId = _nextRequestId & IoUringUserDataPayloadMask;

    TryAssignRegisteredFileForRegistration(registration, out int pollSqeFd, out byte pollSqeFlags);
    sqe->Opcode = IoUringOpcodes.PollAdd;
    sqe->Fd = pollSqeFd;
    sqe->Flags = pollSqeFlags;
    sqe->Len = IoUringConstants.PollAddFlagMulti; // IORING_POLL_ADD_MULTI
    sqe->RwFlags = registration.PollEvents; // poll_events in rw_flags union
    sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagPollReadiness, requestId);

    registration.RequestId = requestId;
    _registrationsByRequestId!.Add(requestId, registration);

    return Interop.Error.SUCCESS;
}

/// <summary>
/// Writes a POLL_REMOVE SQE to cancel an outstanding POLL_ADD identified by <paramref name="requestId"/>.
/// Ports the native QueueIoUringPollRemove from pal_io_uring.c.
/// </summary>
private unsafe Interop.Error WritePollRemoveSqe(ulong requestId)
{
    if (requestId == 0)
        return Interop.Error.SUCCESS;

    if (!TryAcquireManagedSqeWithRetry(out IoUringSqe* sqe, out Interop.Error submitError))
        return submitError;

    sqe->Opcode = IoUringOpcodes.PollRemove;
    // Addr carries the user_data of the POLL_ADD to cancel.
    sqe->Addr = EncodeIoUringUserData(IoUringConstants.TagPollReadiness, requestId);
    sqe->UserData = 0; // CQE for POLL_REMOVE is untracked

    return Interop.Error.SUCCESS;
}

/// <summary>
/// Drains pending registration change requests enqueued by off-event-loop threads.
/// Each request is applied, then the waiting thread is signaled.
/// Ports the native ProcessPendingRegistrationChangeRequests from pal_io_uring.c.
/// </summary>
private void ProcessPendingRegistrationChanges()
{
    // Generic argument restored from usage (TryDequeue out RegistrationChangeRequest).
    ConcurrentQueue<RegistrationChangeRequest>? queue = _registrationChangeQueue;
    if (queue is null || queue.IsEmpty)
        return;

    while (queue.TryDequeue(out RegistrationChangeRequest? request))
    {
        request.Error = ApplyManagedRegistrationChange(request.Socket, request.NewEvents, request.Data);
        // Publish the result before signaling so the waiter observes a completed request.
        request.Completed = true;
        request.CompletionEvent.Set();
    }
}

/// <summary>
/// Allocates the completion slot array and initializes the free list.
/// A capacity of zero yields an empty pool (AllocateCompletionSlot reports exhaustion).
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
private void InitializeCompletionSlotPool(int capacity)
{
    Debug.Assert(capacity >= 0, $"Expected non-negative slot pool capacity, got {capacity}");
    _completionSlots = new IoUringCompletionSlot[capacity];
    _completionSlotStorage = new IoUringCompletionSlotStorage[capacity];
    _zeroCopyPinHolds = new System.Buffers.MemoryHandle[capacity];

    // Fix: the previous code unconditionally indexed [capacity - 1], which threw
    // IndexOutOfRangeException for capacity == 0. An empty pool is now valid.
    if (capacity == 0)
    {
        _completionSlotFreeListHead = -1;
        _completionSlotsInUse = 0;
        return;
    }

    // Build free list linking all slots
    for (int i = 0; i < capacity - 1; i++)
    {
        _completionSlots[i].Generation = 1;
        _completionSlots[i].FreeListNext = i + 1;
    }
    _completionSlots[capacity - 1].Generation = 1;
    _completionSlots[capacity - 1].FreeListNext = -1;
    _completionSlotFreeListHead = 0;
    _completionSlotsInUse = 0;
}

/// <summary>
/// Allocates a completion slot from the free list. Returns the slot index,
/// or -1 if the pool is exhausted (backpressure signal).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private int AllocateCompletionSlot()
{
    Debug.Assert(_completionSlotStorage is not null);
    int index = _completionSlotFreeListHead;
    if (index < 0)
        return -1; // Pool exhausted

    ref IoUringCompletionSlot slot = ref _completionSlots![index];
    _completionSlotFreeListHead = slot.FreeListNext;
    slot.FreeListNext = -1;
    slot.Kind = IoUringCompletionOperationKind.None;
    ResetDebugTestForcedResult(ref slot);
    slot.IsZeroCopySend = false;
    slot.ZeroCopyNotificationPending = false;
    slot.UsesFixedRecvBuffer = false;
    slot.FixedRecvBufferId = 0;
    _completionSlotsInUse++;
    return index;
}

/// <summary>
/// Returns a completion slot to the free list, incrementing its generation
/// to invalidate any stale user_data references.
/// </summary>
private unsafe void FreeCompletionSlot(int index)
{
    Debug.Assert(index >= 0 && index < _completionSlots!.Length);
    Debug.Assert(_completionSlotStorage is not null);

    // Release any pinned zero-copy buffer before the slot becomes reusable.
    ReleaseZeroCopyPinHold(index);
    ref IoUringCompletionSlot slot = ref _completionSlots![index];
    ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![index];

    if (slot.UsesFixedRecvBuffer)
    {
        // Return the provided buffer to the kernel's buffer ring so it can be reused.
        IoUringProvidedBufferRing? providedBufferRing = _ioUringProvidedBufferRing;
        if (providedBufferRing is not null)
        {
            providedBufferRing.TryRecycleBufferFromCompletion(slot.FixedRecvBufferId);
        }

        slot.UsesFixedRecvBuffer = false;
        slot.FixedRecvBufferId = 0;
    }

    // Free any native message storage
    if (slot.Kind == IoUringCompletionOperationKind.Message)
    {
        FreeMessageStorage(index);
    }
    else if (slot.Kind == IoUringCompletionOperationKind.Accept)
    {
        if (slotStorage.NativeSocketAddressLengthPtr != null)
        {
            NativeMemory.Free(slotStorage.NativeSocketAddressLengthPtr);
            slotStorage.NativeSocketAddressLengthPtr = null;
        }
    }

    // Bump the generation so stale user_data referencing this slot is rejected; 0 is reserved.
    slot.Generation++;
    if (slot.Generation == 0)
    {
        slot.Generation = 1;
    }
    slot.Kind = IoUringCompletionOperationKind.None;
    ResetDebugTestForcedResult(ref slot);
    slot.IsZeroCopySend = false;
    slot.ZeroCopyNotificationPending = false;
    slot.UsesFixedRecvBuffer = false;
    slot.FixedRecvBufferId = 0;
    slot.FreeListNext = _completionSlotFreeListHead;
    _completionSlotFreeListHead = index;
    _completionSlotsInUse--;
}

/// <summary>Disposes a retained zero-copy pin-hold for the specified completion slot, if any.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void ReleaseZeroCopyPinHold(int slotIndex)
{
    System.Buffers.MemoryHandle[]? pinHolds = _zeroCopyPinHolds;
    if (pinHolds is null || (uint)slotIndex >= (uint)pinHolds.Length)
    {
        return;
    }

    // Disposing a default MemoryHandle is a no-op, so empty entries are safe.
    pinHolds[slotIndex].Dispose();
    pinHolds[slotIndex] = default;
}

/// <summary>Transfers operation-owned pin state into the engine's zero-copy pin-hold registry.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal void TransferIoUringZeroCopyPinHold(ulong userData, System.Buffers.MemoryHandle pinHold)
        {
            System.Buffers.MemoryHandle[]? pinHolds = _zeroCopyPinHolds;
            if (pinHolds is null)
            {
                // Never leak the pin: dispose it before reporting the invalid state.
                pinHold.Dispose();
                ThrowInternalException(Interop.Error.EINVAL);
                return;
            }

            int slotIndex = DecodeCompletionSlotIndex(userData & IoUringUserDataPayloadMask);
            if ((uint)slotIndex >= (uint)pinHolds.Length)
            {
                pinHold.Dispose();
                ThrowInternalException(Interop.Error.EINVAL);
                return;
            }

            Debug.Assert(_completionSlots is not null);
            ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
            if (!slot.IsZeroCopySend)
            {
                // Not a zero-copy send slot: accepting the pin would orphan it,
                // because FreeCompletionSlot only releases pins via this registry.
                pinHold.Dispose();
                ThrowInternalException(Interop.Error.EINVAL);
                return;
            }

            // Replace any previously-held pin for this slot.
            pinHolds[slotIndex].Dispose();
            pinHolds[slotIndex] = pinHold;
        }

        /// <summary>
        /// Allocates a single contiguous native memory block containing the kernel-consumable
        /// struct msghdr, IOVector array, socket address, and control buffer for a
        /// sendmsg/recvmsg io_uring operation. The layout within the block is:
        /// [NativeMsghdr | IOVectors | SocketAddress | ControlBuffer].
        /// For sendmsg, socket address and control buffer data are deep-copied from the
        /// managed <see cref="Interop.Sys.MessageHeader"/>. For recvmsg, output pointers
        /// are saved so completion can copy kernel-written data back to managed buffers.
+ /// + private unsafe void AllocateMessageStorage(int slotIndex, Interop.Sys.MessageHeader* messageHeader, bool isReceive) + { + Debug.Assert(sizeof(NativeMsghdr) == 56, $"NativeMsghdr size mismatch with kernel struct msghdr: expected 56, got {sizeof(NativeMsghdr)}"); + ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex]; + + int iovCount = messageHeader->IOVectorCount; + int sockAddrLen = messageHeader->SocketAddressLen; + int controlBufLen = messageHeader->ControlBufferLen; + Debug.Assert(iovCount >= 0, $"Expected non-negative iovCount, got {iovCount}"); + Debug.Assert(sockAddrLen >= 0, $"Expected non-negative socket address length, got {sockAddrLen}"); + Debug.Assert(controlBufLen >= 0, $"Expected non-negative control buffer length, got {controlBufLen}"); + + nuint hdrSize = (nuint)sizeof(NativeMsghdr); + nuint iovSize = (nuint)iovCount * (nuint)sizeof(Interop.Sys.IOVector); + nuint sockAddrSize = (nuint)sockAddrLen; + nuint controlBufSize = (nuint)controlBufLen; + nuint totalSize = hdrSize + iovSize + sockAddrSize + controlBufSize; + + byte* storage = (byte*)NativeMemory.AllocZeroed(totalSize); + slotStorage.NativeMessageStorage = (IntPtr)storage; + + // Partition the contiguous block + NativeMsghdr* hdr = (NativeMsghdr*)storage; + Interop.Sys.IOVector* iovDst = (Interop.Sys.IOVector*)(storage + hdrSize); + byte* sockAddrDst = storage + hdrSize + iovSize; + byte* controlBufDst = storage + hdrSize + iovSize + sockAddrSize; + + slotStorage.NativeMsgHdrPtr = (IntPtr)hdr; + slotStorage.NativeIOVectors = iovCount > 0 ? iovDst : null; + slotStorage.NativeSocketAddress = sockAddrLen > 0 ? sockAddrDst : null; + slotStorage.NativeControlBuffer = controlBufLen > 0 ? 
controlBufDst : null; + + // Deep-copy IOVectors (base/count pairs pointing to caller's pinned buffers) + if (iovCount > 0 && messageHeader->IOVectors != null) + { + nuint iovBytes = (nuint)iovCount * (nuint)sizeof(Interop.Sys.IOVector); + Buffer.MemoryCopy(messageHeader->IOVectors, iovDst, iovBytes, iovBytes); + } + + // For sendmsg: deep-copy socket address and control buffer data into native copies. + // For recvmsg: the kernel will write into these buffers; we copy back at completion. + if (sockAddrLen > 0 && messageHeader->SocketAddress != null) + { + if (!isReceive) + { + Buffer.MemoryCopy(messageHeader->SocketAddress, sockAddrDst, sockAddrSize, sockAddrSize); + } + } + + if (controlBufLen > 0 && messageHeader->ControlBuffer != null) + { + if (!isReceive) + { + Buffer.MemoryCopy(messageHeader->ControlBuffer, controlBufDst, controlBufSize, controlBufSize); + } + } + + // Build the kernel-consumable msghdr + hdr->msg_name = sockAddrLen > 0 ? sockAddrDst : null; + hdr->msg_namelen = (uint)sockAddrLen; + hdr->msg_iov = iovCount > 0 ? iovDst : null; + hdr->msg_iovlen = (nuint)iovCount; + hdr->msg_control = controlBufLen > 0 ? controlBufDst : null; + hdr->msg_controllen = (nuint)controlBufLen; + hdr->msg_flags = 0; + + // For recvmsg: save pointers back to the managed MessageHeader's buffers + // so CopyMessageCompletionOutputs can write back kernel results at completion time. + if (isReceive) + { + slotStorage.ReceiveOutputSocketAddress = messageHeader->SocketAddress; + slotStorage.ReceiveOutputControlBuffer = messageHeader->ControlBuffer; + slotStorage.ReceiveSocketAddressCapacity = sockAddrLen; + slotStorage.ReceiveControlBufferCapacity = controlBufLen; + } + } + + /// + /// Frees the contiguous native memory block allocated by + /// and resets all associated pointer fields on the completion slot. 
        /// </summary>
        private unsafe void FreeMessageStorage(int slotIndex)
        {
            ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];

            // One free releases the msghdr, iovec array, socket address, and control
            // buffer: they all live in the single contiguous allocation.
            if (slotStorage.NativeMessageStorage != IntPtr.Zero)
            {
                NativeMemory.Free((void*)slotStorage.NativeMessageStorage);
                slotStorage.NativeMessageStorage = IntPtr.Zero;
            }

            // Clear all derived pointers into the freed block so nothing dangles.
            slotStorage.NativeMsgHdrPtr = IntPtr.Zero;
            slotStorage.NativeIOVectors = null;
            slotStorage.NativeSocketAddress = null;
            slotStorage.NativeControlBuffer = null;
            slotStorage.ReceiveOutputSocketAddress = null;
            slotStorage.ReceiveOutputControlBuffer = null;
            slotStorage.ReceiveSocketAddressCapacity = 0;
            slotStorage.ReceiveControlBufferCapacity = 0;
            slotStorage.MessageIsReceive = false;
        }

        /// <summary>
        /// After a recvmsg CQE completes, copies the kernel-written socket address and
        /// control buffer data from the native msghdr back to the managed MessageHeader's
        /// output buffers. For sendmsg completions this is a no-op.
        /// Returns the actual socket address length, control buffer length, and msg_flags written by the kernel.
        /// </summary>
        private unsafe void CopyMessageCompletionOutputs(
            int slotIndex,
            out int socketAddressLen,
            out int controlBufferLen,
            out uint messageFlags)
        {
            ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
            socketAddressLen = 0;
            controlBufferLen = 0;
            messageFlags = 0;

            // Send completions have nothing to copy back.
            if (!slotStorage.MessageIsReceive)
                return;

            NativeMsghdr* hdr = (NativeMsghdr*)slotStorage.NativeMsgHdrPtr;
            if (hdr == null)
                return;

            socketAddressLen = (int)hdr->msg_namelen;
            controlBufferLen = (int)hdr->msg_controllen;
            messageFlags = (uint)hdr->msg_flags;

            // Copy socket address from native buffer back to managed output buffer
            if (slotStorage.ReceiveOutputSocketAddress != null && slotStorage.NativeSocketAddress != null &&
                slotStorage.ReceiveSocketAddressCapacity > 0 && socketAddressLen > 0)
            {
                // Clamp to the managed buffer's capacity to avoid overrunning it.
                int copyLen = Math.Min(slotStorage.ReceiveSocketAddressCapacity, socketAddressLen);
                Buffer.MemoryCopy(slotStorage.NativeSocketAddress, slotStorage.ReceiveOutputSocketAddress, copyLen, copyLen);
            }

            // Copy control buffer from native buffer back to managed output buffer
            if (slotStorage.ReceiveOutputControlBuffer != null && slotStorage.NativeControlBuffer != null &&
                slotStorage.ReceiveControlBufferCapacity > 0 && controlBufferLen > 0)
            {
                int copyLen = Math.Min(slotStorage.ReceiveControlBufferCapacity, controlBufferLen);
                Buffer.MemoryCopy(slotStorage.NativeControlBuffer, slotStorage.ReceiveOutputControlBuffer, copyLen, copyLen);
            }
        }

        /// <summary>
        /// Decodes a completion slot index from a user_data payload value.
        /// The slot index is encoded in the lower 24 bits of the payload.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static int DecodeCompletionSlotIndex(ulong payload)
        {
            return (int)(payload & IoUringConstants.SlotIndexMask);
        }

        /// <summary>
        /// Encodes a completion slot index and generation into a user_data value
        /// with the ReservedCompletion tag.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static ulong EncodeCompletionSlotUserData(int slotIndex, uint generation)
        {
            // Layout: [generation | slot index] in the payload; the generation lets a
            // stale CQE for a recycled slot be detected and ignored.
            ulong payload = ((ulong)(generation & IoUringConstants.GenerationMask) << IoUringConstants.SlotIndexBits) | ((ulong)slotIndex & IoUringConstants.SlotIndexMask);
            return EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload);
        }

        /// <summary>
        /// Checks whether direct SQE submission is disabled.
        /// Defaults to enabled; test-only env var can disable for deterministic tests.
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        private static bool IsIoUringDirectSqeDisabled()
        {
#if DEBUG
            // Test-only override for deterministic stress scenarios.
            string? value = Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_DIRECT_SQE");
            if (string.Equals(value, "0", StringComparison.Ordinal))
            {
                return true;
            }

            if (string.Equals(value, "1", StringComparison.Ordinal))
            {
                return false;
            }
#endif

            // Default: direct SQE enabled.
            return false;
        }

        /// <summary>Checks whether io_uring is enabled (env var overrides AppContext switch).</summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        private static bool IsIoUringEnabled()
        {
            string? value = Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING");
            if (string.Equals(value, "1", StringComparison.Ordinal))
            {
                return true;
            }

            if (string.Equals(value, "0", StringComparison.Ordinal))
            {
                return false;
            }

            if (AppContext.TryGetSwitch("System.Net.Sockets.IoUring.Enable", out bool enabled))
            {
                return enabled;
            }

            // Off by default when neither the env var nor the switch is set.
            return false;
        }

        /// <summary>
        /// Returns whether SEND_ZC should be enabled.
        /// Defaults to enabled; test-only env var can disable for deterministic tests.
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        private static bool IsZeroCopySendOptedIn()
        {
#if DEBUG
            // Test-only override for deterministic coverage.
            string? value = Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ZERO_COPY_SEND");
            if (string.Equals(value, "1", StringComparison.Ordinal))
            {
                return true;
            }

            if (string.Equals(value, "0", StringComparison.Ordinal))
            {
                return false;
            }
#endif

            // Default: zero-copy send enabled.
            return true;
        }

        /// <summary>
        /// Returns whether SQPOLL mode has been explicitly requested.
        /// Requires both env var and AppContext switch to be enabled.
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        private static bool IsSqPollRequested()
        {
            bool switchEnabled =
                AppContext.TryGetSwitch("System.Net.Sockets.IoUring.EnableSqPoll", out bool enabled) &&
                enabled;

            string? value = Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_SQPOLL");
            if (string.Equals(value, "1", StringComparison.Ordinal))
            {
                // Env-var opt-in still requires the AppContext switch.
                return switchEnabled;
            }

            if (string.Equals(value, "0", StringComparison.Ordinal))
            {
                return false;
            }

            // Env var absent or unrecognized: SQPOLL stays off.
            return false;
        }

        // Clears any pending test-forced completion result (DEBUG-only state on the slot).
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void ResetDebugTestForcedResult(ref IoUringCompletionSlot slot)
        {
#if DEBUG
            slot.HasTestForcedResult = false;
            slot.TestForcedResult = 0;
#endif
        }

        // Substitutes a test-forced result for the real CQE result, consuming the slot's pending value.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void ResolveDebugTestForcedResult(ref IoUringCompletionSlot slot, ref int result)
        {
#if DEBUG
            if (slot.HasTestForcedResult)
            {
                result = slot.TestForcedResult;
                slot.HasTestForcedResult = false;
            }
#endif
        }

        // Arms a forced result on the slot when a DEBUG test mask requests it for this opcode.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private void ApplyDebugTestForcedResult(ref IoUringCompletionSlot slot, byte opcode)
        {
#if DEBUG
            if ((_testForceEagainOnceMask | _testForceEcanceledOnceMask) == 0)
            {
                return;
            }

            if (TryConsumeTestForcedResult(opcode, out int forced))
            {
                slot.HasTestForcedResult = true;
                slot.TestForcedResult = forced;
            }
#else
            // Touch instance state so the method has a use for 'this' in release builds.
            _ = _ioUringInitialized;
#endif
        }

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        // Re-arms the test mask when a forced result was consumed but the SQE could not be
        // submitted, so the test hook fires on the next attempt instead of being lost.
        private void RestoreDebugTestForcedResultIfNeeded(int slotIndex, byte opcode)
        {
#if DEBUG
            Debug.Assert(_completionSlots is not null);
            ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
            if (slot.HasTestForcedResult)
            {
                RestoreTestForcedResult(slot.TestForcedResult, opcode);
            }
#else
            _ = _ioUringInitialized;
#endif
        }

        // Reads the DEBUG-only env vars that force one-shot EAGAIN/ECANCELED completions per opcode.
        [MethodImpl(MethodImplOptions.NoInlining)]
        private void InitializeDebugTestHooksFromEnvironment()
        {
#if DEBUG
            // Mirrors native pal_io_uring.c test hooks.
            _testForceEagainOnceMask = ParseTestOpcodeMask(
                Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_EAGAIN_ONCE_MASK"));
            _testForceEcanceledOnceMask = ParseTestOpcodeMask(
                Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_ECANCELED_ONCE_MASK"));
#else
            _ = _ioUringInitialized;
#endif
        }

#if DEBUG
        /// <summary>
        /// Parses a comma-separated list of opcode names (e.g. "send,recv,accept") into a
        /// bitmask of TestOpcodeMask* values.
        /// Mirrors GetIoUringTestOpcodeMaskFromOpcodeNameList in pal_io_uring.c.
        /// </summary>
        private static byte ParseTestOpcodeMask(string? opcodeNameList)
        {
            if (string.IsNullOrEmpty(opcodeNameList))
                return IoUringConstants.TestOpcodeMaskNone;

            byte mask = IoUringConstants.TestOpcodeMaskNone;
            foreach (var name in opcodeNameList.Split(',', StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries))
            {
                // Unrecognized names are silently ignored rather than treated as errors.
                if (name.Equals("send", StringComparison.OrdinalIgnoreCase)) mask |= IoUringConstants.TestOpcodeMaskSend;
                else if (name.Equals("recv", StringComparison.OrdinalIgnoreCase)) mask |= IoUringConstants.TestOpcodeMaskRecv;
                else if (name.Equals("sendmsg", StringComparison.OrdinalIgnoreCase)) mask |= IoUringConstants.TestOpcodeMaskSendMsg;
                else if (name.Equals("recvmsg", StringComparison.OrdinalIgnoreCase)) mask |= IoUringConstants.TestOpcodeMaskRecvMsg;
                else if (name.Equals("accept", StringComparison.OrdinalIgnoreCase)) mask |= IoUringConstants.TestOpcodeMaskAccept;
                else if (name.Equals("connect", StringComparison.OrdinalIgnoreCase)) mask |= IoUringConstants.TestOpcodeMaskConnect;
                else if (name.Equals("sendzc", StringComparison.OrdinalIgnoreCase) || name.Equals("send_zc", StringComparison.OrdinalIgnoreCase)) mask |= IoUringConstants.TestOpcodeMaskSendZc;
                else if (name.Equals("sendmsgzc", StringComparison.OrdinalIgnoreCase) || name.Equals("sendmsg_zc", StringComparison.OrdinalIgnoreCase)) mask |= IoUringConstants.TestOpcodeMaskSendMsgZc;
            }
            return mask;
        }

        /// <summary>
        /// Maps an io_uring opcode to its corresponding test opcode mask bit.
        /// Mirrors GetIoUringTestOpcodeMaskFromOpcode in pal_io_uring.c.
        /// </summary>
        private static byte GetTestOpcodeMaskFromOpcode(byte opcode)
        {
            return opcode switch
            {
                IoUringOpcodes.Send => IoUringConstants.TestOpcodeMaskSend,
                IoUringOpcodes.Recv => IoUringConstants.TestOpcodeMaskRecv,
                IoUringOpcodes.SendMsg => IoUringConstants.TestOpcodeMaskSendMsg,
                IoUringOpcodes.RecvMsg => IoUringConstants.TestOpcodeMaskRecvMsg,
                IoUringOpcodes.Accept => IoUringConstants.TestOpcodeMaskAccept,
                IoUringOpcodes.Connect => IoUringConstants.TestOpcodeMaskConnect,
                IoUringOpcodes.SendZc => IoUringConstants.TestOpcodeMaskSendZc,
                IoUringOpcodes.SendMsgZc => IoUringConstants.TestOpcodeMaskSendMsgZc,
                _ => IoUringConstants.TestOpcodeMaskNone,
            };
        }

        /// <summary>
        /// Tries to consume a forced test result for the given opcode.
        /// EAGAIN takes priority over ECANCELED when both are set.
        /// Mirrors TryConsumeIoUringForcedCompletionResultLocked in pal_io_uring.c.
        /// </summary>
        private bool TryConsumeTestForcedResult(byte opcode, out int forcedResult)
        {
            forcedResult = 0;
            byte opcodeMask = GetTestOpcodeMaskFromOpcode(opcode);
            if (opcodeMask == IoUringConstants.TestOpcodeMaskNone)
                return false;

            // "Once" semantics: clear the bit as it is consumed.
            if ((_testForceEagainOnceMask & opcodeMask) != 0)
            {
                _testForceEagainOnceMask &= (byte)~opcodeMask;
                // Forced results follow CQE conventions: negated platform errno.
                forcedResult = -Interop.Sys.ConvertErrorPalToPlatform(Interop.Error.EAGAIN);
                return true;
            }

            if ((_testForceEcanceledOnceMask & opcodeMask) != 0)
            {
                _testForceEcanceledOnceMask &= (byte)~opcodeMask;
                forcedResult = -ErrnoECANCELED;
                return true;
            }

            return false;
        }

        /// <summary>
        /// Restores a previously consumed forced test result mask bit.
        /// Called when SQE acquisition fails after the forced result was consumed,
        /// so the test hook can fire on the next attempt.
        /// Mirrors RestoreIoUringForcedCompletionResultLocked in pal_io_uring.c.
+ /// + private void RestoreTestForcedResult(int forcedResult, byte opcode) + { + byte opcodeMask = GetTestOpcodeMaskFromOpcode(opcode); + if (opcodeMask == IoUringConstants.TestOpcodeMaskNone) + return; + + if (forcedResult == -Interop.Sys.ConvertErrorPalToPlatform(Interop.Error.EAGAIN)) + _testForceEagainOnceMask |= opcodeMask; + else if (forcedResult == -ErrnoECANCELED) + _testForceEcanceledOnceMask |= opcodeMask; + } +#endif + + /// + /// Probes the kernel for supported io_uring opcodes using IORING_REGISTER_PROBE and + /// populates the per-opcode _supportsOp* capability flags. + /// When the probe syscall is unavailable (older kernels), all flags remain at their + /// default value (). + /// + [MethodImpl(MethodImplOptions.NoInlining)] + private unsafe void ProbeIoUringOpcodeSupport(int ringFd) + { + // Probe buffer: 16-byte header + 256 * 8-byte ops = 2064 bytes. + const int maxOps = 256; + const int probeSize = 16 + maxOps * 8; + byte* probeBuffer = stackalloc byte[probeSize]; + new Span(probeBuffer, probeSize).Clear(); + + int result; + Interop.Error err = Interop.Sys.IoUringShimRegister( + ringFd, IoUringConstants.RegisterProbe, probeBuffer, (uint)maxOps, &result); + + if (err != Interop.Error.SUCCESS) + { + // Probe not supported (for example older kernels): per-opcode flags remain false. + // Direct SQE prep does not gate on these flags; this mainly affects optional feature light-up. + return; + } + + // Parse: ops start at offset 16, each is 8 bytes. 
+ IoUringProbeOp* ops = (IoUringProbeOp*)(probeBuffer + 16); + IoUringProbeHeader* header = (IoUringProbeHeader*)probeBuffer; + int opsCount = Math.Min((int)header->OpsLen, maxOps); + + _supportsOpReadFixed = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.ReadFixed); + _supportsOpSend = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Send); + _supportsOpRecv = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Recv); + _supportsOpSendMsg = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.SendMsg); + _supportsOpRecvMsg = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.RecvMsg); + _supportsOpAccept = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Accept); + _supportsOpConnect = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Connect); + _supportsOpSendZc = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.SendZc); + _supportsOpSendMsgZc = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.SendMsgZc); + _zeroCopySendEnabled = _supportsOpSendZc && IsZeroCopySendOptedIn(); + _supportsOpAsyncCancel = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.AsyncCancel); + _supportsOpPollAdd = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.PollAdd); + _supportsMultishotAccept = _supportsOpAccept; + RefreshIoUringMultishotRecvSupport(); + } + + /// Checks whether a specific opcode is supported by the kernel's io_uring probe result. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe bool IsOpcodeSupported(IoUringProbeOp* ops, int opsCount, byte opcode) + { + if (opcode >= opsCount) return false; + return (ops[opcode].Flags & IoUringConstants.ProbeOpFlagSupported) != 0; + } + + /// Converts SocketFlags to the kernel msg_flags representation for io_uring. 
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool TryConvertIoUringPrepareSocketFlags(SocketFlags flags, out uint rwFlags)
        {
            // Only this subset is forwarded; any other flag makes the conversion fail so
            // the caller can take a fallback path. The numeric values pass through
            // unchanged — presumably they match the kernel MSG_* bits; confirm in SocketPal.
            const SocketFlags SupportedIoUringFlags =
                SocketFlags.OutOfBand |
                SocketFlags.Peek |
                SocketFlags.DontRoute;

            if ((flags & ~SupportedIoUringFlags) != 0)
            {
                rwFlags = 0;
                return false;
            }

            rwFlags = (uint)(int)flags;
            return true;
        }

        /// <summary>Writes a send SQE to the submission ring entry.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static unsafe void WriteSendSqe(
            IoUringSqe* sqe,
            int sqeFd,
            byte sqeFlags,
            ulong userData,
            byte* buffer,
            uint length,
            uint rwFlags)
        {
            // Fields not written here rely on the SQE having been zeroed on acquisition.
            sqe->Opcode = IoUringOpcodes.Send;
            sqe->Fd = sqeFd;
            sqe->Flags = sqeFlags;
            sqe->Addr = (ulong)(nuint)buffer;
            sqe->Len = length;
            sqe->RwFlags = rwFlags;
            sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData);
        }

        /// <summary>Writes a zero-copy send SQE to the submission ring entry.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static unsafe void WriteSendZcSqe(
            IoUringSqe* sqe,
            int sqeFd,
            byte sqeFlags,
            ulong userData,
            byte* buffer,
            uint length,
            uint rwFlags)
        {
            // Identical to WriteSendSqe except for the SEND_ZC opcode.
            sqe->Opcode = IoUringOpcodes.SendZc;
            sqe->Fd = sqeFd;
            sqe->Flags = sqeFlags;
            sqe->Addr = (ulong)(nuint)buffer;
            sqe->Len = length;
            sqe->RwFlags = rwFlags;
            sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData);
        }

        /// <summary>Writes a recv SQE to the submission ring entry.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static unsafe void WriteRecvSqe(
            IoUringSqe* sqe,
            int sqeFd,
            byte sqeFlags,
            ulong userData,
            byte* buffer,
            uint length,
            uint rwFlags)
        {
            sqe->Opcode = IoUringOpcodes.Recv;
            sqe->Fd = sqeFd;
            sqe->Flags = sqeFlags;
            sqe->Ioprio = 0;
            sqe->Addr = (ulong)(nuint)buffer;
            sqe->Len = length;
            sqe->RwFlags = rwFlags;
            sqe->BufIndex = 0;
            sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData);
        }

        /// <summary>Writes a read-fixed SQE for registered-buffer receive.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static unsafe void WriteReadFixedSqe(
            IoUringSqe* sqe,
            int sqeFd,
            byte sqeFlags,
            ulong userData,
            byte* buffer,
            uint length,
            ushort bufferIndex)
        {
            sqe->Opcode = IoUringOpcodes.ReadFixed;
            sqe->Fd = sqeFd;
            sqe->Flags = sqeFlags;
            sqe->Ioprio = 0;
            sqe->Addr = (ulong)(nuint)buffer;
            sqe->Len = length;
            // For non-seekable sockets, offset is ignored; -1 matches "current position" semantics.
            sqe->Off = ulong.MaxValue;
            sqe->RwFlags = 0;
            // BufIndex selects the registered buffer for READ_FIXED.
            sqe->BufIndex = bufferIndex;
            sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData);
        }

        /// <summary>
        /// Writes a one-shot recv SQE using provided-buffer selection.
        /// The kernel chooses a buffer from the specified buffer group.
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void WriteProvidedBufferRecvSqe( + IoUringSqe* sqe, + int sqeFd, + byte sqeFlags, + ulong userData, + uint requestedLength, + uint rwFlags, + ushort bufferGroupId) + { + sqe->Opcode = IoUringOpcodes.Recv; + sqe->Fd = sqeFd; + sqe->Flags = (byte)(sqeFlags | IoUringConstants.SqeBufferSelect); + sqe->Ioprio = 0; + sqe->Addr = 0; + sqe->Len = requestedLength; + sqe->RwFlags = rwFlags; + sqe->BufIndex = bufferGroupId; + sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData); + } + + /// + /// Writes a multishot recv SQE to the submission ring entry. + /// The kernel selects buffers from a provided buffer ring (IOSQE_BUFFER_SELECT). + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void WriteMultishotRecvSqe( + IoUringSqe* sqe, + int sqeFd, + byte sqeFlags, + ulong userData, + ushort bufferGroupId) + { + sqe->Opcode = IoUringOpcodes.Recv; + sqe->Fd = sqeFd; + sqe->Flags = (byte)(sqeFlags | IoUringConstants.SqeBufferSelect); + sqe->Ioprio = IoUringConstants.RecvMultishot; + sqe->Addr = 0; + sqe->Len = 0; + sqe->RwFlags = 0; + sqe->BufIndex = bufferGroupId; + sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData); + } + + /// Writes an accept SQE to the submission ring entry. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void WriteAcceptSqe( + IoUringSqe* sqe, + int sqeFd, + byte sqeFlags, + ulong userData, + byte* socketAddress, + IntPtr socketAddressLengthPtr) + { + sqe->Opcode = IoUringOpcodes.Accept; + sqe->Fd = sqeFd; + sqe->Flags = sqeFlags; + sqe->Addr = (ulong)(nuint)socketAddress; + // Kernel accept prep aliases addr2 at sqe->off. + sqe->Off = (ulong)(nuint)socketAddressLengthPtr; + sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData); + } + + /// Writes a multishot accept SQE to the submission ring entry. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe void WriteMultishotAcceptSqe( + IoUringSqe* sqe, + int sqeFd, + byte sqeFlags, + ulong userData, + byte* socketAddress, + IntPtr socketAddressLengthPtr) + { + sqe->Opcode = IoUringOpcodes.Accept; + sqe->Fd = sqeFd; + sqe->Flags = sqeFlags; + sqe->Ioprio = IoUringConstants.AcceptMultishot; + sqe->Addr = (ulong)(nuint)socketAddress; + // accept4 prep aliases addr2 at sqe->off for addrlen pointer + sqe->Off = (ulong)(nuint)socketAddressLengthPtr; + sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData); + } + + /// Writes a sendmsg SQE to the submission ring entry. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void WriteSendMsgSqe( + IoUringSqe* sqe, + int sqeFd, + byte sqeFlags, + ulong userData, + IntPtr messageHeader, + uint rwFlags) + { + sqe->Opcode = IoUringOpcodes.SendMsg; + sqe->Fd = sqeFd; + sqe->Flags = sqeFlags; + sqe->Addr = (ulong)(nuint)messageHeader; + sqe->Len = 1; + sqe->RwFlags = rwFlags; + sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData); + } + + /// Writes a sendmsg_zc SQE to the submission ring entry. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void WriteSendMsgZcSqe( + IoUringSqe* sqe, + int sqeFd, + byte sqeFlags, + ulong userData, + IntPtr messageHeader, + uint rwFlags) + { + sqe->Opcode = IoUringOpcodes.SendMsgZc; + sqe->Fd = sqeFd; + sqe->Flags = sqeFlags; + sqe->Addr = (ulong)(nuint)messageHeader; + sqe->Len = 1; + sqe->RwFlags = rwFlags; + sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData); + } + + /// Writes a recvmsg SQE to the submission ring entry. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void WriteRecvMsgSqe( + IoUringSqe* sqe, + int sqeFd, + byte sqeFlags, + ulong userData, + IntPtr messageHeader, + uint rwFlags) + { + sqe->Opcode = IoUringOpcodes.RecvMsg; + sqe->Fd = sqeFd; + sqe->Flags = sqeFlags; + sqe->Addr = (ulong)(nuint)messageHeader; + sqe->Len = 1; + sqe->RwFlags = rwFlags; + sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData); + } + + /// Writes a connect SQE to the submission ring entry. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void WriteConnectSqe( + IoUringSqe* sqe, + int sqeFd, + byte sqeFlags, + ulong userData, + byte* socketAddress, + int socketAddressLen) + { + sqe->Opcode = IoUringOpcodes.Connect; + sqe->Fd = sqeFd; + sqe->Flags = sqeFlags; + sqe->Addr = (ulong)(nuint)socketAddress; + // Kernel connect prep aliases addrlen at sqe->off and requires len=0. + sqe->Off = (uint)socketAddressLen; + sqe->Len = 0; + sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData); + } + + /// Writes an ASYNC_CANCEL SQE targeting the specified user_data. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void WriteAsyncCancelSqe(IoUringSqe* sqe, ulong userData) + { + sqe->Opcode = IoUringOpcodes.AsyncCancel; + sqe->Fd = -1; + sqe->Addr = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData); + sqe->UserData = 0; + } + + /// Publishes the managed SQ tail pointer to make queued SQEs visible to the kernel. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe void PublishManagedSqeTail() + { + if (!_ioUringManagedSqTailLoaded || _ioUringSqRingInfo.SqTailPtr == IntPtr.Zero) + { + return; + } + + Debug.Assert(IsCurrentThreadEventLoopThread(), + "PublishManagedSqeTail must only be called from the event loop thread (SINGLE_ISSUER contract)."); + ref uint sqTailRef = ref Unsafe.AsRef((void*)_ioUringSqRingInfo.SqTailPtr); + Volatile.Write(ref sqTailRef, _ioUringManagedSqTail); + _ioUringManagedSqTailLoaded = false; + } + + /// + /// Returns true when the SQPOLL kernel thread has gone idle and needs an explicit wakeup. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe bool SqNeedWakeup() + { + Debug.Assert(_sqPollEnabled, "SqNeedWakeup should only be checked in SQPOLL mode."); + if (_managedSqFlagsPtr == null) + { + return true; + } + + return (Volatile.Read(ref *_managedSqFlagsPtr) & IoUringConstants.SqNeedWakeup) != 0; + } + + /// Allocates the next available SQE slot from the submission ring. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe bool TryGetNextManagedSqe(out IoUringSqe* sqe) + { + sqe = null; + if (!_ioUringDirectSqeEnabled) + { + return false; + } + + Debug.Assert(IsCurrentThreadEventLoopThread(), + "TryGetNextManagedSqe must only be called from the event loop thread (SINGLE_ISSUER contract)."); + ref Interop.Sys.IoUringSqRingInfo ringInfo = ref _ioUringSqRingInfo; + if (ringInfo.SqeBase == IntPtr.Zero || + ringInfo.SqHeadPtr == IntPtr.Zero || + ringInfo.SqTailPtr == IntPtr.Zero || + ringInfo.SqEntries == 0) + { + return false; + } + + ref uint sqHeadRef = ref Unsafe.AsRef((void*)ringInfo.SqHeadPtr); + uint sqHead = Volatile.Read(ref sqHeadRef); + if (!_ioUringManagedSqTailLoaded) + { + ref uint sqTailRef = ref Unsafe.AsRef((void*)ringInfo.SqTailPtr); + _ioUringManagedSqTail = Volatile.Read(ref sqTailRef); + _ioUringManagedSqTailLoaded = true; + } + + uint sqTail = _ioUringManagedSqTail; + if (sqTail - sqHead >= ringInfo.SqEntries) + { + return false; + } + + uint index = sqTail & ringInfo.SqMask; + nint sqeOffset = checked((nint)((nuint)index * ringInfo.SqeSize)); + sqe = (IoUringSqe*)((byte*)ringInfo.SqeBase + sqeOffset); + Unsafe.InitBlockUnaligned(sqe, 0, ringInfo.SqeSize); + _ioUringManagedSqTail = sqTail + 1; + _ioUringManagedPendingSubmissions++; + return true; + } + + /// Attempts to acquire an SQE, retrying with intermediate submits on ring full. 
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private unsafe bool TryAcquireManagedSqeWithRetry(out IoUringSqe* sqe, out Interop.Error submitError)
        {
            sqe = null;
            submitError = Interop.Error.SUCCESS;
            // Each failed acquisition attempt submits the queued SQEs, which frees
            // ring space for the next attempt.
            for (int attempt = 0; attempt < MaxIoUringSqeAcquireSubmitAttempts; attempt++)
            {
                if (TryGetNextManagedSqe(out sqe))
                {
                    return true;
                }

                submitError = SubmitIoUringOperationsNormalized();
                if (submitError != Interop.Error.SUCCESS)
                {
                    return false;
                }
            }

            // Ring stayed full across all attempts.
            submitError = Interop.Error.EAGAIN;
            return false;
        }

        /// <summary>
        /// Common setup for direct SQE preparation: allocates a completion slot, encodes user data,
        /// resolves the socket fd/flags, applies test hooks, and acquires an SQE. On failure,
        /// restores test state and frees the slot.
        /// </summary>
        /// <returns>
        /// <see cref="SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared"/> if the SQE was acquired
        /// (caller must write the SQE and return Prepared),
        /// or a terminal result (Unsupported/PrepareFailed) that the caller should return directly.
        /// </returns>
        private unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TrySetupDirectSqe(
            SafeSocketHandle socket,
            byte opcode,
            out int slotIndex,
            out ulong allocatedUserData,
            out int sqeFd,
            out byte sqeFlags,
            out IoUringSqe* sqe,
            out SocketError errorCode)
        {
            slotIndex = -1;
            allocatedUserData = 0;
            sqeFd = 0;
            sqeFlags = 0;
            sqe = null;
            errorCode = SocketError.Success;

            if (!_ioUringDirectSqeEnabled)
            {
                return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
            }

            slotIndex = AllocateCompletionSlot();
            if (slotIndex < 0)
            {
                RecordIoUringCompletionSlotExhaustion();

                // Try to recover by draining completed CQEs, which frees slots.
                // The guard prevents reentrant drains from within the drain itself.
                if (!_completionSlotDrainInProgress)
                {
                    _completionSlotDrainInProgress = true;
                    try
                    {
                        SocketEventHandler handler = new SocketEventHandler(this);
                        if (DrainCqeRingBatch(handler))
                        {
                            slotIndex = AllocateCompletionSlot();
                        }
                    }
                    finally
                    {
                        _completionSlotDrainInProgress = false;
                    }
                }

                if (slotIndex < 0)
                {
                    return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
                }

                RecordIoUringCompletionSlotDrainRecovery();
            }

            ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
            allocatedUserData = EncodeCompletionSlotUserData(slotIndex, slot.Generation);
            int socketFd = (int)(nint)socket.DangerousGetHandle();
            ConfigureSocketSqeFdAndFlags(socketFd, out sqeFd, out sqeFlags);
            // Arm any DEBUG test-forced result before the SQE is acquired so it can
            // be restored if acquisition fails below.
            ApplyDebugTestForcedResult(ref slot, opcode);

            if (!TryAcquireManagedSqeWithRetry(out sqe, out Interop.Error submitError))
            {
                // Unwind in reverse order: test hook first, then the slot.
                RestoreDebugTestForcedResultIfNeeded(slotIndex, opcode);
                FreeCompletionSlot(slotIndex);
                slotIndex = -1;

                // Transient conditions fall back to the non-direct path rather than failing.
                if (submitError == Interop.Error.SUCCESS ||
                    submitError == Interop.Error.EAGAIN ||
                    submitError == Interop.Error.EWOULDBLOCK)
                {
                    return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
                }

                errorCode = SocketPal.GetSocketErrorForErrorCode(submitError);
                return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.PrepareFailed;
            }

            return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
        }

        /// <summary>Prepares a send SQE via the managed direct path.</summary>
        internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSend(
            SafeSocketHandle socket,
            byte* buffer,
            int bufferLen,
            SocketFlags flags,
            out ulong userData,
            out SocketError errorCode)
        {
            userData = 0;
            errorCode = SocketError.Success;

            // Socket flags with no io_uring rw_flags equivalent route to the fallback path.
            if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
            {
                return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
            }

            var result = TrySetupDirectSqe(socket, IoUringOpcodes.Send, out _, out ulong allocatedUserData, out int sqeFd, out byte sqeFlags, out IoUringSqe* sqe, out errorCode);
            if (result != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
            {
                return result;
            }

            WriteSendSqe(sqe, sqeFd, sqeFlags, allocatedUserData, buffer, (uint)bufferLen, rwFlags);
            userData = allocatedUserData;
            return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
        }

        /// <summary>
        /// Prepares a send SQE, preferring SEND_ZC when eligible and falling back to SEND when unavailable.
        /// </summary>
        internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendWithZeroCopyFallback(
            SafeSocketHandle socket,
            byte* buffer,
            int bufferLen,
            SocketFlags flags,
            out bool usedZeroCopy,
            out ulong userData,
            out SocketError errorCode)
        {
            usedZeroCopy = false;
            if (ShouldTryIoUringDirectSendZeroCopy(bufferLen))
            {
                SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult zeroCopyResult = TryPrepareIoUringDirectSendZc(
                    socket,
                    buffer,
                    bufferLen,
                    flags,
                    out userData,
                    out errorCode);
                if (zeroCopyResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported)
                {
                    // Prepared or PrepareFailed are both terminal for the ZC attempt;
                    // only Unsupported falls through to the plain SEND path below.
                    usedZeroCopy = zeroCopyResult == SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
                    return zeroCopyResult;
                }
            }

            return TryPrepareIoUringDirectSend(
                socket,
                buffer,
                bufferLen,
                flags,
                out userData,
                out errorCode);
        }

        /// Prepares a zero-copy send SQE via the managed direct path.
        internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendZc(
            SafeSocketHandle socket,
            byte* buffer,
            int bufferLen,
            SocketFlags flags,
            out ulong userData,
            out SocketError errorCode)
        {
            userData = 0;
            errorCode = SocketError.Success;

            if (!ShouldTryIoUringDirectSendZeroCopy(bufferLen))
            {
                return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
            }

            if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
            {
                return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
            }

            var result = TrySetupDirectSqe(
                socket,
                IoUringOpcodes.SendZc,
                out int slotIndex,
                out ulong allocatedUserData,
                out int sqeFd,
                out byte sqeFlags,
                out IoUringSqe* sqe,
                out errorCode);
            if (result != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
            {
                return result;
            }

            // Mark the slot so completion handling waits for the ZC NOTIF CQE before
            // reporting the operation as finished.
            ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
            slot.IsZeroCopySend = true;
            slot.ZeroCopyNotificationPending = false;

            WriteSendZcSqe(sqe, sqeFd, sqeFlags, allocatedUserData, buffer, (uint)bufferLen, rwFlags);
            userData = allocatedUserData;
            return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
        }

        /// Prepares a recv SQE via the managed direct path.
        internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectRecv(
            SafeSocketHandle socket,
            byte* buffer,
            int bufferLen,
            SocketFlags flags,
            bool allowMultishotRecv,
            out ulong userData,
            out SocketError errorCode)
        {
            userData = 0;
            errorCode = SocketError.Success;

            if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
            {
                return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
            }

            var result = TrySetupDirectSqe(socket, IoUringOpcodes.Recv, out int slotIndex, out ulong allocatedUserData, out int sqeFd, out byte sqeFlags, out IoUringSqe* sqe, out errorCode);
            if (result != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
            {
                return result;
            }

            // Receive strategy, in preference order:
            // 1) READ_FIXED with a registered buffer, 2) multishot recv with buffer select,
            // 3) single-shot recv with a provided buffer, 4) plain recv into the caller buffer.
            if (ShouldTryIoUringDirectFixedRecv(flags, allowMultishotRecv, bufferLen) &&
                TryPrepareIoUringDirectRecvFixed(slotIndex, sqe, sqeFd, sqeFlags, allocatedUserData, bufferLen))
            {
                SocketsTelemetry.Log.IoUringFixedRecvSelected();
                userData = allocatedUserData;
                return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
            }

            if (allowMultishotRecv &&
                bufferLen > 0 &&
                TryGetIoUringMultishotRecvBufferGroupId(out ushort multishotBufferGroupId))
            {
                WriteMultishotRecvSqe(sqe, sqeFd, sqeFlags, allocatedUserData, multishotBufferGroupId);
            }
            else if (bufferLen > 0 &&
                TryGetIoUringProvidedBufferGroupId(out ushort providedBufferGroupId))
            {
                WriteProvidedBufferRecvSqe(
                    sqe,
                    sqeFd,
                    sqeFlags,
                    allocatedUserData,
                    (uint)bufferLen,
                    rwFlags,
                    providedBufferGroupId);
            }
            else
            {
                WriteRecvSqe(sqe, sqeFd, sqeFlags, allocatedUserData, buffer, (uint)bufferLen, rwFlags);
            }
            userData = allocatedUserData;
            return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
        }

        /// Gate for READ_FIXED receives: requires opcode + registered-buffer support,
        /// a single-shot non-empty receive, and no socket flags.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private bool ShouldTryIoUringDirectFixedRecv(SocketFlags flags, bool allowMultishotRecv, int bufferLen)
        {
            if (!_supportsOpReadFixed || !_ioUringBuffersRegistered)
            {
                return false;
            }

            if (allowMultishotRecv || bufferLen <= 0)
            {
                return false;
            }

            // READ_FIXED does not provide recvmsg/socket flags semantics.
            return flags == SocketFlags.None;
        }

        /// Writes a READ_FIXED SQE using a buffer acquired from the provided-buffer ring;
        /// returns false (with telemetry) when no registered buffer is available so the
        /// caller can fall back to a normal receive.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private unsafe bool TryPrepareIoUringDirectRecvFixed(
            int slotIndex,
            IoUringSqe* sqe,
            int sqeFd,
            byte sqeFlags,
            ulong userData,
            int requestedLength)
        {
            IoUringProvidedBufferRing? providedBufferRing = _ioUringProvidedBufferRing;
            if (providedBufferRing is null)
            {
                SocketsTelemetry.Log.IoUringFixedRecvFallback();
                return false;
            }

            if (!providedBufferRing.TryAcquireBufferForPreparedReceive(
                out ushort bufferId,
                out byte* fixedBuffer,
                out int fixedBufferLength))
            {
                // Under transient provided-buffer pressure, fall back to normal receive preparation.
                SocketsTelemetry.Log.IoUringFixedRecvFallback();
                return false;
            }

            // Record the buffer on the slot so completion can return it to the ring.
            Debug.Assert(_completionSlots is not null);
            ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
            slot.UsesFixedRecvBuffer = true;
            slot.FixedRecvBufferId = bufferId;

            // Never read past the registered buffer, even if the caller asked for more.
            int receiveLength = Math.Min(requestedLength, fixedBufferLength);
            WriteReadFixedSqe(
                sqe,
                sqeFd,
                sqeFlags,
                userData,
                fixedBuffer,
                (uint)receiveLength,
                bufferId);
            return true;
        }

        /// Prepares an accept SQE via the managed direct path.
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectAccept( + SafeSocketHandle socket, + byte* socketAddress, + int socketAddressLen, + out ulong userData, + out SocketError errorCode) + { + userData = 0; + var result = TrySetupDirectSqe(socket, IoUringOpcodes.Accept, out int slotIndex, out ulong allocatedUserData, out int sqeFd, out byte sqeFlags, out IoUringSqe* sqe, out errorCode); + if (result != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared) + { + return result; + } + + ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex]; + ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex]; + slot.Kind = IoUringCompletionOperationKind.Accept; + slotStorage.NativeSocketAddressLengthPtr = (int*)NativeMemory.Alloc((nuint)sizeof(int)); + *slotStorage.NativeSocketAddressLengthPtr = socketAddressLen; + + WriteAcceptSqe(sqe, sqeFd, sqeFlags, allocatedUserData, socketAddress, (IntPtr)slotStorage.NativeSocketAddressLengthPtr); + userData = allocatedUserData; + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared; + } + + /// Prepares a multishot accept SQE via the managed direct path. 
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectMultishotAccept( + SafeSocketHandle socket, + byte* socketAddress, + int socketAddressLen, + out ulong userData, + out SocketError errorCode) + { + userData = 0; + errorCode = SocketError.Success; + if (!_supportsMultishotAccept) + { + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported; + } + + var result = TrySetupDirectSqe( + socket, + IoUringOpcodes.Accept, + out int slotIndex, + out ulong allocatedUserData, + out int sqeFd, + out byte sqeFlags, + out IoUringSqe* sqe, + out errorCode); + if (result != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared) + { + return result; + } + + ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex]; + ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex]; + slot.Kind = IoUringCompletionOperationKind.Accept; + slotStorage.NativeSocketAddressLengthPtr = (int*)NativeMemory.Alloc((nuint)sizeof(int)); + *slotStorage.NativeSocketAddressLengthPtr = socketAddressLen; + // Preserve the original sockaddr capacity for future multishot accept re-arm/reset handling. + slotStorage.ReceiveSocketAddressCapacity = socketAddressLen; + + WriteMultishotAcceptSqe( + sqe, + sqeFd, + sqeFlags, + allocatedUserData, + socketAddress, + (IntPtr)slotStorage.NativeSocketAddressLengthPtr); + userData = allocatedUserData; + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared; + } + + /// Prepares a connect SQE via the managed direct path. 
        internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectConnect(
            SafeSocketHandle socket,
            byte* socketAddress,
            int socketAddressLen,
            out ulong userData,
            out SocketError errorCode)
        {
            userData = 0;
            var result = TrySetupDirectSqe(socket, IoUringOpcodes.Connect, out _, out ulong allocatedUserData, out int sqeFd, out byte sqeFlags, out IoUringSqe* sqe, out errorCode);
            if (result != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
            {
                return result;
            }

            WriteConnectSqe(sqe, sqeFd, sqeFlags, allocatedUserData, socketAddress, socketAddressLen);
            userData = allocatedUserData;
            return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
        }

        /// Prepares a sendmsg SQE via the managed direct path.
        internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendMessage(
            SafeSocketHandle socket,
            Interop.Sys.MessageHeader* messageHeader,
            SocketFlags flags,
            out ulong userData,
            out SocketError errorCode)
        {
            userData = 0;
            errorCode = SocketError.Success;

            if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
            {
                return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
            }

            var result = TrySetupDirectSqe(socket, IoUringOpcodes.SendMsg, out int slotIndex, out ulong allocatedUserData, out int sqeFd, out byte sqeFlags, out IoUringSqe* sqe, out errorCode);
            if (result != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
            {
                return result;
            }

            ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
            ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
            slot.Kind = IoUringCompletionOperationKind.Message;
            slotStorage.MessageIsReceive = false;
            // AllocateMessageStorage populates slotStorage.NativeMsgHdrPtr, which the SQE
            // references below; the native copy must outlive the in-flight operation.
            AllocateMessageStorage(slotIndex, messageHeader, isReceive: false);

            WriteSendMsgSqe(sqe, sqeFd, sqeFlags, allocatedUserData, slotStorage.NativeMsgHdrPtr, rwFlags);
            userData = allocatedUserData;
            return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
        }

        /// <summary>
        /// Prepares a sendmsg SQE, preferring SENDMSG_ZC when eligible and falling back to SENDMSG otherwise.
        /// </summary>
        internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendMessageWithZeroCopyFallback(
            SafeSocketHandle socket,
            Interop.Sys.MessageHeader* messageHeader,
            int payloadLength,
            SocketFlags flags,
            out ulong userData,
            out SocketError errorCode)
        {
            if (ShouldTryIoUringDirectSendMessageZeroCopy(payloadLength))
            {
                SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult zeroCopyResult = TryPrepareIoUringDirectSendMessageZc(
                    socket,
                    messageHeader,
                    payloadLength,
                    flags,
                    out userData,
                    out errorCode);
                if (zeroCopyResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported)
                {
                    // Prepared or PrepareFailed are terminal; only Unsupported falls back.
                    return zeroCopyResult;
                }
            }

            return TryPrepareIoUringDirectSendMessage(
                socket,
                messageHeader,
                flags,
                out userData,
                out errorCode);
        }

        /// Prepares a sendmsg_zc SQE via the managed direct path.
        internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendMessageZc(
            SafeSocketHandle socket,
            Interop.Sys.MessageHeader* messageHeader,
            int payloadLength,
            SocketFlags flags,
            out ulong userData,
            out SocketError errorCode)
        {
            userData = 0;
            errorCode = SocketError.Success;

            if (!ShouldTryIoUringDirectSendMessageZeroCopy(payloadLength))
            {
                return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
            }

            if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
            {
                return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
            }

            var result = TrySetupDirectSqe(
                socket,
                IoUringOpcodes.SendMsgZc,
                out int slotIndex,
                out ulong allocatedUserData,
                out int sqeFd,
                out byte sqeFlags,
                out IoUringSqe* sqe,
                out errorCode);
            if (result != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
            {
                return result;
            }

            ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
            ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
            slot.Kind = IoUringCompletionOperationKind.Message;
            slotStorage.MessageIsReceive = false;
            // Mirror SEND_ZC semantics: first CQE is not final managed completion; operation
            // completes only after NOTIF CQE confirms kernel/NIC no longer references payload.
            slot.IsZeroCopySend = true;
            slot.ZeroCopyNotificationPending = false;
            AllocateMessageStorage(slotIndex, messageHeader, isReceive: false);

            WriteSendMsgZcSqe(sqe, sqeFd, sqeFlags, allocatedUserData, slotStorage.NativeMsgHdrPtr, rwFlags);
            userData = allocatedUserData;
            return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
        }

        /// Prepares a recvmsg SQE via the managed direct path.
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectReceiveMessage( + SafeSocketHandle socket, + Interop.Sys.MessageHeader* messageHeader, + SocketFlags flags, + out ulong userData, + out SocketError errorCode) + { + userData = 0; + errorCode = SocketError.Success; + + if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags)) + { + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported; + } + + var result = TrySetupDirectSqe(socket, IoUringOpcodes.RecvMsg, out int slotIndex, out ulong allocatedUserData, out int sqeFd, out byte sqeFlags, out IoUringSqe* sqe, out errorCode); + if (result != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared) + { + return result; + } + + ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex]; + ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex]; + slot.Kind = IoUringCompletionOperationKind.Message; + slotStorage.MessageIsReceive = true; + AllocateMessageStorage(slotIndex, messageHeader, isReceive: true); + + WriteRecvMsgSqe(sqe, sqeFd, sqeFlags, allocatedUserData, slotStorage.NativeMsgHdrPtr, rwFlags); + userData = allocatedUserData; + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared; + } + + /// Debug-only assertion that validates a state machine transition. 
+ [Conditional("DEBUG")] + private static void AssertIoUringLifecycleTransition( + IoUringOperationLifecycleState from, + IoUringOperationLifecycleState to) + { + bool isValid = + from == IoUringOperationLifecycleState.Queued && to == IoUringOperationLifecycleState.Prepared || + from == IoUringOperationLifecycleState.Prepared && to == IoUringOperationLifecycleState.Submitted || + from == IoUringOperationLifecycleState.Prepared && to == IoUringOperationLifecycleState.Detached || + from == IoUringOperationLifecycleState.Submitted && + (to == IoUringOperationLifecycleState.Queued || + to == IoUringOperationLifecycleState.Completed || + to == IoUringOperationLifecycleState.Canceled || + to == IoUringOperationLifecycleState.Detached); + + Debug.Assert(isValid, $"Invalid io_uring lifecycle transition: {from} -> {to}"); + } + + /// Resets the native diagnostics poll countdown. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void InitializeLinuxIoUringDiagnosticsState() => + _ioUringDiagnosticsPollCountdown = IoUringDiagnosticsPollInterval; + + /// Logs a failed ASYNC_CANCEL SQE preparation. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringAsyncCancelPrepareFailure(SocketError cancelError, ulong userData, IoUringCancellationOrigin origin) + { + string originLabel = origin == IoUringCancellationOrigin.Teardown ? " during teardown" : string.Empty; + NetEventSource.Info(this, $"io_uring async-cancel prepare failed{originLabel}: error={cancelError}, user_data=0x{userData:x}"); + } + + /// Logs a failed ASYNC_CANCEL submission. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringAsyncCancelSubmitFailure(Interop.Error submitError, IoUringCancellationOrigin origin) + { + string originLabel = origin == IoUringCancellationOrigin.Teardown ? 
" during teardown" : string.Empty; + NetEventSource.Info(this, $"io_uring async-cancel submit failed{originLabel}: error={submitError}"); + } + + /// Logs a sampled counter value with its associated user_data. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringCounterSample(string message, long count, ulong userData) + { + NetEventSource.Info(this, $"{message}: count={count}, user_data=0x{userData:x}"); + } + + /// Logs a prepare queue overflow event. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringPrepareQueueOverflow(long count, int capacity) + { + NetEventSource.Info(this, $"io_uring prepare queue overflow: count={count}, capacity={capacity}"); + } + + /// Logs a cancellation queue overflow event. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringCancellationQueueOverflow(long count, int capacity) + { + NetEventSource.Info(this, $"io_uring cancellation queue overflow: count={count}, capacity={capacity}"); + } + + /// Logs a CQ overflow observation from the kernel CQ ring counter. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringCqOverflow(uint totalOverflowCount, uint delta) + { + NetEventSource.Error(this, $"io_uring CQ overflow detected: total={totalOverflowCount}, delta={delta}"); + } + + /// Logs a failed eventfd wake signal. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringWakeFailure(Interop.Error error) + { + NetEventSource.Info(this, $"io_uring wake signal failed: error={error}"); + } + + /// Logs the final count of benign late completions at teardown. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringTeardownSummary(long lateCompletionCount) + { + NetEventSource.Info(this, $"io_uring benign late-completion total={lateCompletionCount}"); + } + + /// Logs an untrack operation mismatch. 
        [MethodImpl(MethodImplOptions.NoInlining)]
        private void LogIoUringUntrackMismatch(ulong userData, long mismatchCount)
        {
            NetEventSource.Info(this, $"io_uring untrack mismatch: user_data=0x{userData:x}, count={mismatchCount}");
        }

        /// Logs the negotiated io_uring mode for this engine instance.
        [MethodImpl(MethodImplOptions.NoInlining)]
        private void LogIoUringModeSelection(LinuxIoUringCapabilities capabilities)
        {
            NetEventSource.Info(
                this,
                $"io_uring mode={capabilities.Mode}, is_io_uring_port={capabilities.IsIoUringPort}, supports_multishot_recv={capabilities.SupportsMultishotRecv}, supports_multishot_accept={capabilities.SupportsMultishotAccept}, zero_copy_send_enabled={capabilities.SupportsZeroCopySend}, supports_read_fixed={_supportsOpReadFixed}, supports_send_zc={_supportsOpSendZc}, supports_sendmsg_zc={_supportsOpSendMsgZc}, sqpoll_enabled={capabilities.SqPollEnabled}");
        }

        /// Logs active advanced io_uring features for this engine instance.
        [MethodImpl(MethodImplOptions.NoInlining)]
        private void LogIoUringAdvancedFeatureState()
        {
            // 0 when no provided-buffer ring is active for this engine.
            int providedBufferSize = _ioUringProvidedBufferRing?.BufferSize ?? 0;
            NetEventSource.Info(
                this,
                $"io_uring features: multishot_recv={_ioUringCapabilities.SupportsMultishotRecv}, multishot_accept={_ioUringCapabilities.SupportsMultishotAccept}, zero_copy_send_enabled={_ioUringCapabilities.SupportsZeroCopySend}, supports_read_fixed={_supportsOpReadFixed}, fixed_recv_active={_supportsOpReadFixed && _ioUringBuffersRegistered}, supports_send_zc={_supportsOpSendZc}, supports_sendmsg_zc={_supportsOpSendMsgZc}, provided_buffers={_supportsProvidedBufferRings}, registered_buffers={_ioUringBuffersRegistered}, adaptive_buffer_sizing={_adaptiveBufferSizingEnabled}, sqpoll_enabled={_ioUringCapabilities.SqPollEnabled}, provided_buffer_size={providedBufferSize}");
        }

        /// Checks whether the kernel version meets the minimum for io_uring support.
+ [MethodImpl(MethodImplOptions.NoInlining)] + private static bool IsIoUringKernelVersionSupported() => + OperatingSystem.IsOSPlatformVersionAtLeast( + "Linux", + IoUringConstants.MinKernelMajor, + IoUringConstants.MinKernelMinor); + + /// + /// Recomputes whether multishot recv can be used by this engine instance. + /// Requires opcode support and active provided-buffer ring support. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool RefreshIoUringMultishotRecvSupport() + { + _supportsMultishotRecv = + _supportsOpRecv && + _supportsProvidedBufferRings; + return _supportsMultishotRecv; + } + + /// + /// Returns the provided-buffer group id used for buffer-select receive submissions. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool TryGetIoUringProvidedBufferGroupId(out ushort bufferGroupId) + { + if (_supportsProvidedBufferRings && _ioUringProvidedBufferRing is not null) + { + bufferGroupId = _ioUringProvidedBufferGroupId; + return true; + } + + bufferGroupId = default; + return false; + } + + /// + /// Returns the provided-buffer group id used for multishot recv submissions. + /// Multishot recv remains disabled unless both the opcode probe and provided-ring + /// registration succeeded for this engine instance. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool TryGetIoUringMultishotRecvBufferGroupId(out ushort bufferGroupId) + { + if (_supportsMultishotRecv && TryGetIoUringProvidedBufferGroupId(out bufferGroupId)) + { + return true; + } + + bufferGroupId = default; + return false; + } + + internal bool SupportsMultishotRecv => _ioUringCapabilities.SupportsMultishotRecv; + internal bool SupportsMultishotAccept => _ioUringCapabilities.SupportsMultishotAccept; + + /// Calls io_uring_setup and negotiates feature flags. 
        [MethodImpl(MethodImplOptions.NoInlining)]
        private static unsafe bool TrySetupIoUring(bool sqPollRequested, out IoUringSetupResult setupResult)
        {
            setupResult = default;

            uint flags = IoUringConstants.SetupCqSize | IoUringConstants.SetupSubmitAll
                | IoUringConstants.SetupCoopTaskrun | IoUringConstants.SetupSingleIssuer
                | IoUringConstants.SetupNoSqArray;

            if (sqPollRequested)
            {
                // SQPOLL and DEFER_TASKRUN are mutually exclusive in practice.
                flags |= IoUringConstants.SetupSqPoll;
                if (NetEventSource.Log.IsEnabled())
                {
                    NetEventSource.Info(null, "io_uring setup: SQPOLL requested and included in initial setup flags.");
                }
            }
            else
            {
                flags |= IoUringConstants.SetupDeferTaskrun;
            }

            Interop.Sys.IoUringParams ioParams = default;
            ioParams.Flags = flags;
            ioParams.CqEntries = IoUringConstants.QueueEntries * IoUringConstants.CqEntriesFactor;

            int ringFd;
            Interop.Error err = Interop.Sys.IoUringShimSetup(IoUringConstants.QueueEntries, &ioParams, &ringFd);

            // IORING_SETUP_NO_SQARRAY was introduced in Linux 6.6.
            // For 6.1-6.5 kernels, keep setup simple but allow a single targeted retry without NO_SQARRAY.
            if ((err == Interop.Error.EINVAL || err == Interop.Error.EPERM) &&
                (flags & IoUringConstants.SetupNoSqArray) != 0)
            {
                flags &= ~IoUringConstants.SetupNoSqArray;
                // The kernel may have written into ioParams on the failed attempt; start clean.
                ioParams = default;
                ioParams.Flags = flags;
                ioParams.CqEntries = IoUringConstants.QueueEntries * IoUringConstants.CqEntriesFactor;

                if (NetEventSource.Log.IsEnabled())
                {
                    NetEventSource.Info(null, $"io_uring setup: peeled NO_SQARRAY after {err}.");
                }

                err = Interop.Sys.IoUringShimSetup(IoUringConstants.QueueEntries, &ioParams, &ringFd);
            }

            if (err != Interop.Error.SUCCESS)
            {
                return false;
            }

            setupResult.RingFd = ringFd;
            setupResult.Params = ioParams;
            setupResult.NegotiatedFlags = flags;
            setupResult.UsesExtArg = (ioParams.Features & IoUringConstants.FeatureExtArg) != 0;
            setupResult.SqPollNegotiated = (flags & IoUringConstants.SetupSqPoll) != 0;
            if (setupResult.SqPollNegotiated && NetEventSource.Log.IsEnabled())
            {
                NetEventSource.Info(null, "io_uring setup: SQPOLL negotiated.");
            }
            return true;
        }

        /// <summary>
        /// Maps the SQ ring, CQ ring, and SQE array into managed address space and derives
        /// all ring pointers from the kernel-reported offsets. On failure, unmaps any
        /// partially-mapped regions and closes the ring fd.
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        private unsafe bool TryMmapRings(ref IoUringSetupResult setup)
        {
            // Sanity helper: checks [offset, offset+size) fits inside the mapped region.
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            static bool IsOffsetInRange(ulong offset, ulong size, ulong mappedSize) =>
                offset <= mappedSize && size <= mappedSize - offset;

            ref Interop.Sys.IoUringParams p = ref setup.Params;
            bool usesNoSqArray = (setup.NegotiatedFlags & IoUringConstants.SetupNoSqArray) != 0;

            // Compute ring sizes.
            ulong sqRingSize = p.SqOff.Array;
            if (!usesNoSqArray)
            {
                // Without NO_SQARRAY the SQ ring also contains the index array after the offsets.
                sqRingSize += p.SqEntries * (uint)sizeof(uint);
            }
            ulong cqRingSize = p.CqOff.Cqes + p.CqEntries * (uint)sizeof(Interop.Sys.IoUringCqe);
            ulong sqesSize = p.SqEntries * 64u; // sizeof(io_uring_sqe) = 64

            // mmap SQ ring (and possibly CQ ring if SINGLE_MMAP).
            bool usesSingleMmap = (p.Features & IoUringConstants.FeatureSingleMmap) != 0;

            byte* sqRingPtr;
            byte* cqRingPtr;

            if (usesSingleMmap)
            {
                // IORING_FEAT_SINGLE_MMAP: both rings share one mapping sized to the larger.
                ulong ringSize = sqRingSize > cqRingSize ? sqRingSize : cqRingSize;
                void* ptr;
                Interop.Error err = Interop.Sys.IoUringShimMmap(setup.RingFd, ringSize, IoUringConstants.OffSqRing, &ptr);
                if (err != Interop.Error.SUCCESS)
                {
                    Interop.Sys.IoUringShimCloseFd(setup.RingFd);
                    return false;
                }
                sqRingPtr = (byte*)ptr;
                cqRingPtr = (byte*)ptr;
                sqRingSize = ringSize;
                cqRingSize = ringSize;
            }
            else
            {
                void* sqPtr;
                Interop.Error err = Interop.Sys.IoUringShimMmap(setup.RingFd, sqRingSize, IoUringConstants.OffSqRing, &sqPtr);
                if (err != Interop.Error.SUCCESS)
                {
                    Interop.Sys.IoUringShimCloseFd(setup.RingFd);
                    return false;
                }
                sqRingPtr = (byte*)sqPtr;

                void* cqPtr;
                err = Interop.Sys.IoUringShimMmap(setup.RingFd, cqRingSize, IoUringConstants.OffCqRing, &cqPtr);
                if (err != Interop.Error.SUCCESS)
                {
                    // Unwind the SQ mapping before failing.
                    Interop.Sys.IoUringShimMunmap(sqRingPtr, sqRingSize);
                    Interop.Sys.IoUringShimCloseFd(setup.RingFd);
                    return false;
                }
                cqRingPtr = (byte*)cqPtr;
            }

            // Validate every kernel-reported offset against the mapped region sizes (debug only).
            Debug.Assert(IsOffsetInRange(p.SqOff.Head, sizeof(uint), sqRingSize));
            Debug.Assert(IsOffsetInRange(p.SqOff.Tail, sizeof(uint), sqRingSize));
            Debug.Assert(IsOffsetInRange(p.SqOff.RingMask, sizeof(uint), sqRingSize));
            Debug.Assert(IsOffsetInRange(p.SqOff.RingEntries, sizeof(uint), sqRingSize));
            Debug.Assert(IsOffsetInRange(p.SqOff.Flags, sizeof(uint), sqRingSize));
            if (!usesNoSqArray)
            {
                Debug.Assert(IsOffsetInRange(p.SqOff.Array, p.SqEntries * (uint)sizeof(uint), sqRingSize));
            }

            Debug.Assert(IsOffsetInRange(p.CqOff.Head, sizeof(uint), cqRingSize));
            Debug.Assert(IsOffsetInRange(p.CqOff.Tail, sizeof(uint), cqRingSize));
            Debug.Assert(IsOffsetInRange(p.CqOff.RingMask, sizeof(uint), cqRingSize));
            Debug.Assert(IsOffsetInRange(p.CqOff.RingEntries, sizeof(uint), cqRingSize));
            Debug.Assert(IsOffsetInRange(p.CqOff.Overflow, sizeof(uint), cqRingSize));
            Debug.Assert(IsOffsetInRange(p.CqOff.Cqes, p.CqEntries * (uint)sizeof(Interop.Sys.IoUringCqe), cqRingSize));

            // mmap SQE array.
            void* sqePtr;
            {
                Interop.Error err = Interop.Sys.IoUringShimMmap(setup.RingFd, sqesSize, IoUringConstants.OffSqes, &sqePtr);
                if (err != Interop.Error.SUCCESS)
                {
                    if (!usesSingleMmap)
                        Interop.Sys.IoUringShimMunmap(cqRingPtr, cqRingSize);
                    Interop.Sys.IoUringShimMunmap(sqRingPtr, sqRingSize);
                    Interop.Sys.IoUringShimCloseFd(setup.RingFd);
                    return false;
                }
            }

            // Derive SQ pointers and populate existing _ioUringSqRingInfo for compatibility.
            _ioUringSqRingInfo.SqeBase = (IntPtr)sqePtr;
            _ioUringSqRingInfo.SqTailPtr = (IntPtr)(sqRingPtr + p.SqOff.Tail);
            _ioUringSqRingInfo.SqHeadPtr = (IntPtr)(sqRingPtr + p.SqOff.Head);
            _ioUringSqRingInfo.SqMask = *(uint*)(sqRingPtr + p.SqOff.RingMask);
            _ioUringSqRingInfo.SqEntries = *(uint*)(sqRingPtr + p.SqOff.RingEntries);
            _ioUringSqRingInfo.SqeSize = 64;
            _ioUringSqRingInfo.UsesNoSqArray = usesNoSqArray ? (byte)1 : (byte)0;
            _ioUringSqRingInfo.RingFd = setup.RingFd;
            _ioUringSqRingInfo.UsesEnterExtArg = setup.UsesExtArg ? (byte)1 : (byte)0;
            _managedSqFlagsPtr = (uint*)(sqRingPtr + p.SqOff.Flags);

            // Initialize SQ array identity mapping if NO_SQARRAY is not active.
            if (!usesNoSqArray)
            {
                uint* sqArray = (uint*)(sqRingPtr + p.SqOff.Array);
                for (uint i = 0; i < p.SqEntries; i++)
                {
                    sqArray[i] = i;
                }
            }

            // Derive CQ pointers.
            _managedCqeBase = (Interop.Sys.IoUringCqe*)(cqRingPtr + p.CqOff.Cqes);
            _managedCqTailPtr = (uint*)(cqRingPtr + p.CqOff.Tail);
            _managedCqHeadPtr = (uint*)(cqRingPtr + p.CqOff.Head);
            _managedCqMask = *(uint*)(cqRingPtr + p.CqOff.RingMask);
            _managedCqEntries = *(uint*)(cqRingPtr + p.CqOff.RingEntries);
            _managedCqOverflowPtr = (uint*)(cqRingPtr + p.CqOff.Overflow);
            _managedObservedCqOverflow = Volatile.Read(ref *_managedCqOverflowPtr);

            // Store ring region info for teardown.
            _managedSqRingPtr = sqRingPtr;
            _managedCqRingPtr = cqRingPtr;
            _managedSqRingSize = sqRingSize;
            _managedCqRingSize = cqRingSize;
            _managedSqesSize = sqesSize;
            _managedUsesSingleMmap = usesSingleMmap;
            _managedRingFd = setup.RingFd;
            _managedUsesExtArg = setup.UsesExtArg;
            _managedUsesNoSqArray = usesNoSqArray;
            _managedNegotiatedFlags = setup.NegotiatedFlags;

            return true;
        }

        /// Queues a POLL_ADD SQE on the wakeup eventfd for cross-thread signaling.
        [MethodImpl(MethodImplOptions.NoInlining)]
        private unsafe bool QueueManagedWakeupPollAdd()
        {
            if (_managedWakeupEventFd < 0)
                return false;

            if (!TryGetNextManagedSqe(out IoUringSqe* sqe))
                return false;

            sqe->Opcode = IoUringOpcodes.PollAdd;
            sqe->Fd = _managedWakeupEventFd;
            sqe->Len = IoUringConstants.PollAddFlagMulti; // IORING_POLL_ADD_MULTI
            sqe->RwFlags = 1; // POLLIN = 0x0001 in poll32_events (stored in RwFlags union at offset 28)
            sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagWakeupSignal, 0);
            return true;
        }

        /// Attempts to register the ring fd for fixed-fd submission.
+ [MethodImpl(MethodImplOptions.NoInlining)] + private unsafe bool TryRegisterRingFd(int ringFd, out int registeredRingFd) + { + registeredRingFd = -1; + + // io_uring_rsrc_update: { uint32 offset, uint32 resv, uint64 data } + uint* update = stackalloc uint[4]; // 16 bytes + update[0] = IoUringConstants.RegisterOffsetAuto; // offset = auto-assign + update[1] = 0; // resv + *(ulong*)(update + 2) = (ulong)ringFd; // data = ring fd + + int result; + Interop.Error err = Interop.Sys.IoUringShimRegister( + ringFd, IoUringConstants.RegisterRingFds, update, 1u, &result); + + if (err != Interop.Error.SUCCESS || result <= 0) + return false; + + registeredRingFd = (int)update[0]; // kernel wrote assigned index back + return true; + } + + /// + /// Initializes the registered-file table with the kernel. Allocates slot arrays, + /// fills all entries with -1, and calls IORING_REGISTER_FILES. On success, builds + /// the free-slot stack so that slots can be assigned to sockets. + /// Ports the native TryInitializeRegisteredFilesTable from pal_io_uring.c. + /// + [MethodImpl(MethodImplOptions.NoInlining)] + private unsafe bool TryInitializeRegisteredFileTable(int ringFd) + { + uint slotCount = IoUringConstants.QueueEntries * IoUringConstants.RegisteredFileSlotCountFactor; + _registeredFiles = new int[slotCount]; + Array.Fill(_registeredFiles, -1); + + // Register the file table with the kernel. + fixed (int* fdsPtr = _registeredFiles) + { + int result; + Interop.Error err = Interop.Sys.IoUringShimRegister( + ringFd, IoUringConstants.RegisterFiles, fdsPtr, slotCount, &result); + if (err != Interop.Error.SUCCESS) + { + _registeredFiles = null; + return false; + } + } + + // Build free-slot stack (all slots initially free). + // Initialize in reverse order so that popping yields lowest indices first, + // matching native behavior. 
+ _registeredFileFreeSlots = new uint[slotCount]; + for (uint i = 0; i < slotCount; i++) + { + _registeredFileFreeSlots[i] = slotCount - i - 1; + } + _registeredFileFreeSlotCount = slotCount; + _registeredFileHotSocket = -1; + _registeredFileHotIndex = -1; + _usesRegisteredFiles = true; + return true; + } + + /// + /// Updates a single registered-file slot to the given fd value by calling + /// IORING_REGISTER_FILES_UPDATE via the io_uring register syscall. + /// Ports the native UpdateRegisteredFileSlotLocked from pal_io_uring.c. + /// + [MethodImpl(MethodImplOptions.NoInlining)] + private unsafe Interop.Error UpdateRegisteredFileSlot(uint slot, int fd) + { + // io_uring_rsrc_update: { uint32 offset, uint32 resv, uint64 data } + // For IORING_REGISTER_FILES_UPDATE, data is a pointer to the fd value. + int fdValue = fd; + int* fdsPtr = &fdValue; + + uint* update = stackalloc uint[4]; // 16 bytes + update[0] = slot; // offset + update[1] = 0; // resv + *(ulong*)(update + 2) = (ulong)(nuint)fdsPtr; // data = pointer to fd + + int result; + Interop.Error err = Interop.Sys.IoUringShimRegister( + _managedRingFd, IoUringConstants.RegisterFilesUpdate, update, 1u, &result); + + // The shim returns SUCCESS only when the syscall returned >= 0, + // so no separate result check is needed. + return err; + } + + /// + /// Assigns a registered-file slot for a socket registration, enabling IOSQE_FIXED_FILE + /// for faster fd lookup in the kernel. If the registration does not yet have a slot, + /// one is popped from the free-slot stack and the kernel slot is updated. + /// Ports the native TryAssignRegisteredFileForRegistrationLocked from pal_io_uring.c. 
+ /// + private void TryAssignRegisteredFileForRegistration( + SocketEventRegistration registration, out int sqeFd, out byte sqeFlags) + { + sqeFd = registration.Socket; + sqeFlags = 0; + + if (!_usesRegisteredFiles || _managedRingFd < 0) + return; + + if (registration.RegisteredFileIndex < 0) + { + if (_registeredFileFreeSlotCount == 0) + return; // No free slots + + uint slot = _registeredFileFreeSlots![--_registeredFileFreeSlotCount]; + Interop.Error err = UpdateRegisteredFileSlot(slot, registration.Socket); + if (err != Interop.Error.SUCCESS) + { + // Restore free slot on failure. + _registeredFileFreeSlots[_registeredFileFreeSlotCount++] = slot; + return; + } + + _registeredFiles![slot] = registration.Socket; + registration.RegisteredFileIndex = (int)slot; + } + + uint assignedSlot = (uint)registration.RegisteredFileIndex; + if (assignedSlot < (uint)_registeredFiles!.Length && + _registeredFiles[assignedSlot] == registration.Socket) + { + _registeredFileHotSocket = registration.Socket; + _registeredFileHotIndex = registration.RegisteredFileIndex; + sqeFd = registration.RegisteredFileIndex; + sqeFlags = IoUringConstants.SqeFixedFile; + } + } + + /// + /// Unregisters a file slot for a socket registration by updating the kernel slot to -1 + /// and returning the slot to the free list. + /// Ports the native TryUnregisterRegisteredFileForRegistrationLocked from pal_io_uring.c. 
+ /// + private void TryUnregisterRegisteredFileForRegistration(SocketEventRegistration registration) + { + if (!_usesRegisteredFiles || registration.RegisteredFileIndex < 0) + return; + + if (Volatile.Read(ref _ioUringTeardownInitiated) != 0) + { + if (_registeredFileHotSocket == registration.Socket) + { + _registeredFileHotSocket = -1; + _registeredFileHotIndex = -1; + } + registration.RegisteredFileIndex = -1; + return; + } + + uint slot = (uint)registration.RegisteredFileIndex; + if (slot >= (uint)_registeredFiles!.Length) + { + registration.RegisteredFileIndex = -1; + return; + } + + Interop.Error err = UpdateRegisteredFileSlot(slot, -1); + if (err != Interop.Error.SUCCESS) + { + if (NetEventSource.Log.IsEnabled()) + { + NetEventSource.Error(this, $"io_uring registered-file slot unregister failed: slot={slot}, error={err}"); + } + + _registeredFiles[slot] = -1; + if (_registeredFileFreeSlotCount < (uint)_registeredFileFreeSlots!.Length) + { + _registeredFileFreeSlots[_registeredFileFreeSlotCount++] = slot; + } + + if (_registeredFileHotSocket == registration.Socket) + { + _registeredFileHotSocket = -1; + _registeredFileHotIndex = -1; + } + + registration.RegisteredFileIndex = -1; + return; + } + + _registeredFiles[slot] = -1; + if (_registeredFileFreeSlotCount < (uint)_registeredFileFreeSlots!.Length) + { + _registeredFileFreeSlots[_registeredFileFreeSlotCount++] = slot; + } + + if (_registeredFileHotSocket == registration.Socket) + { + _registeredFileHotSocket = -1; + _registeredFileHotIndex = -1; + } + + registration.RegisteredFileIndex = -1; + } + + /// + /// Configures the SQE fd and flags for a socket operation, using the registered-file + /// hot cache for fast lookup, falling back to registration-based assignment. + /// Ports the native ConfigureSocketSqeFdAndFlagsLocked from pal_io_uring.c. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void ConfigureSocketSqeFdAndFlags(int socketFd, out int sqeFd, out byte sqeFlags) + { + sqeFd = socketFd; + sqeFlags = 0; + + if (!_usesRegisteredFiles) + return; + + // Hot cache check + int hotIndex = _registeredFileHotIndex; + if (_registeredFileHotSocket == socketFd && + hotIndex >= 0 && + (uint)hotIndex < (uint)_registeredFiles!.Length && + _registeredFiles[hotIndex] == socketFd) + { + sqeFd = hotIndex; + sqeFlags = IoUringConstants.SqeFixedFile; + return; + } + + // Fallback: look up by socket registration + SocketEventRegistration? registration = FindRegistrationBySocket(socketFd); + if (registration is null) + return; + + TryAssignRegisteredFileForRegistration(registration, out sqeFd, out sqeFlags); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private unsafe void CleanupManagedRings() + { + _managedSqFlagsPtr = null; + if (_managedSqRingPtr != null) + { + // Unmap SQEs first + if (_managedSqesSize > 0) + { + Interop.Sys.IoUringShimMunmap(_ioUringSqRingInfo.SqeBase.ToPointer(), _managedSqesSize); + } + // Unmap CQ ring (only if separate from SQ ring) + if (!_managedUsesSingleMmap && _managedCqRingPtr != null && _managedCqRingPtr != _managedSqRingPtr) + { + Interop.Sys.IoUringShimMunmap(_managedCqRingPtr, _managedCqRingSize); + } + // Unmap SQ ring + Interop.Sys.IoUringShimMunmap(_managedSqRingPtr, _managedSqRingSize); + _managedSqRingPtr = null; + _managedCqRingPtr = null; + } + if (_managedRingFd >= 0) + { + Interop.Sys.IoUringShimCloseFd(_managedRingFd); + _managedRingFd = -1; + } + } + + /// + /// Orchestrates complete managed io_uring initialization: kernel version check, + /// ring setup with flag negotiation, mmap, opcode probe, eventfd creation, + /// ring fd registration, and initial wakeup poll queue. 
+ /// + [MethodImpl(MethodImplOptions.NoInlining)] + private unsafe bool TryInitializeManagedIoUring() + { + if (!IsIoUringKernelVersionSupported()) + return false; + + bool sqPollRequested = IsSqPollRequested(); + if (!TrySetupIoUring(sqPollRequested, out IoUringSetupResult setupResult)) + return false; + + if (!TryMmapRings(ref setupResult)) + return false; + + _sqPollEnabled = setupResult.SqPollNegotiated; + if (NetEventSource.Log.IsEnabled()) + { + if (sqPollRequested && !_sqPollEnabled) + { + NetEventSource.Info( + this, + "SQPOLL requested but not negotiated (kernel support/capabilities may be unavailable)."); + } + else if (_sqPollEnabled) + { + NetEventSource.Info(this, "SQPOLL negotiated and enabled."); + } + } + + // Probe opcode support. + ProbeIoUringOpcodeSupport(setupResult.RingFd); + + // Try to initialize registered file table (optional optimization). + TryInitializeRegisteredFileTable(setupResult.RingFd); + + // Create wakeup eventfd. + int eventFd; + Interop.Error err = Interop.Sys.IoUringShimCreateEventFd(&eventFd); + if (err != Interop.Error.SUCCESS) + { + // Cleanup: unmap and close + CleanupManagedRings(); + return false; + } + _managedWakeupEventFd = eventFd; + + // Try to register the ring fd for faster enter syscalls. + if (TryRegisterRingFd(setupResult.RingFd, out int registeredRingFd)) + { + _ioUringSqRingInfo.RegisteredRingFd = registeredRingFd; + } + + // Queue the initial wakeup POLL_ADD. + // Direct SQE must be enabled for QueueManagedWakeupPollAdd to work. + _ioUringDirectSqeEnabled = true; + if (!QueueManagedWakeupPollAdd()) + { + _ioUringDirectSqeEnabled = false; + Interop.Sys.IoUringShimCloseFd(eventFd); + _managedWakeupEventFd = -1; + CleanupManagedRings(); + return false; + } + + // Respect process-level direct SQE toggle after the required wakeup POLL_ADD is armed. 
+ if (IsIoUringDirectSqeDisabled()) + { + _ioUringDirectSqeEnabled = false; + } + + InitializeIoUringProvidedBufferRingIfSupported(setupResult.RingFd); + RefreshIoUringMultishotRecvSupport(); + _ioUringInitialized = true; + + InitializeDebugTestHooksFromEnvironment(); + + return true; + } + + /// Validates the managed NativeMsghdr layout contract for direct io_uring message operations. + [MethodImpl(MethodImplOptions.NoInlining)] + private bool IsNativeMsghdrLayoutSupportedForIoUring() + { + if (IntPtr.Size == 8 && sizeof(NativeMsghdr) == 56) + { + return true; + } + + if (NetEventSource.Log.IsEnabled()) + { + NetEventSource.Info( + this, + $"io_uring disabled: unsupported NativeMsghdr layout (pointerSize={IntPtr.Size}, sizeof(NativeMsghdr)={sizeof(NativeMsghdr)})"); + } + + return false; + } + + /// Detects io_uring support and initializes the managed submission/completion paths. + partial void LinuxDetectAndInitializeIoUring() + { + if (!IsIoUringEnabled() || !IsNativeMsghdrLayoutSupportedForIoUring() || !TryInitializeManagedIoUring()) + { + _ioUringCapabilities = ResolveLinuxIoUringCapabilities(isIoUringPort: false); + SocketsTelemetry.Log.ReportSocketEngineBackendSelected( + isIoUringPort: false, + isCompletionMode: false, + sqPollEnabled: false); + + if (NetEventSource.Log.IsEnabled()) + { + LogIoUringModeSelection(_ioUringCapabilities); + } + + return; + } + + // Managed init succeeded — set capabilities and initialize managed-side state. 
+ _ioUringCapabilities = new LinuxIoUringCapabilities( + isIoUringPort: true, + mode: LinuxIoUringMode.CompletionMode, + supportsMultishotRecv: _supportsMultishotRecv, + supportsMultishotAccept: _supportsMultishotAccept, + supportsZeroCopySend: _zeroCopySendEnabled, + sqPollEnabled: _sqPollEnabled); + + SocketsTelemetry.Log.ReportSocketEngineBackendSelected( + isIoUringPort: true, + isCompletionMode: true, + sqPollEnabled: _sqPollEnabled); + + if (NetEventSource.Log.IsEnabled()) + { + LogIoUringModeSelection(_ioUringCapabilities); + } + + InitializeLinuxIoUringDiagnosticsState(); + + _ioUringSlotCapacity = (int)Math.Max(_managedCqEntries, IoUringConstants.QueueEntries); + // Slot pool capacity is 2x slot capacity (currently 8192 with default cq sizing). + // Multishot operations retain slots for their full lifetime, so this bounds + // concurrent long-lived multishot receives before backpressure/exhaustion. + _ioUringPrepareQueue = new MpscQueue(); + _ioUringCancelQueue = new MpscQueue(); + _ioUringOperationRegistry = new IoUringOperationRegistry(_ioUringSlotCapacity); + InitializeCompletionSlotPool(_ioUringSlotCapacity * IoUringConstants.CompletionOperationPoolCapacityFactor); + + if (RequiresPollReadiness()) + { + _registrationsBySocket = new Dictionary(); + _registrationsByRequestId = new Dictionary(); + _registrationChangeQueue = new ConcurrentQueue(); + } + + _managedCqDrainEnabled = true; + } + + /// Tears down io_uring state before native resource cleanup. 
+ partial void LinuxBeforeFreeNativeResources(ref bool closeSocketEventPort) + { + if (!_ioUringCapabilities.IsIoUringPort || _port == (IntPtr)(-1)) + { + return; + } + + Volatile.Write(ref _ioUringTeardownInitiated, 1); + DrainQueuedIoUringOperationsForTeardown(); + + Interop.Error closeError = Interop.Sys.CloseSocketEventPort(_port); + if (closeError == Interop.Error.SUCCESS) + { + closeSocketEventPort = false; + Volatile.Write(ref _ioUringPortClosedForTeardown, 1); + } + } + + /// Submits pending SQEs before entering the wait. + partial void LinuxEventLoopBeforeWait() + { + ProcessPendingRegistrationChanges(); + + Interop.Error submitError = SubmitIoUringBatch(); + if (submitError != Interop.Error.SUCCESS) + { + ThrowInternalException(submitError); + } + } + + /// Attempts a managed completion wait using io_uring_enter with timeout. + partial void LinuxEventLoopTryCompletionWait(SocketEventHandler handler, ref int numEvents, ref int numCompletions, ref Interop.Error err, ref bool waitHandled) + { + if (!_ioUringCapabilities.IsCompletionMode) + { + return; + } + + // Managed CQE drain path: read CQEs directly from mmap'd ring. + // First, try a non-blocking drain of any already-available CQEs. + bool hadCqes = DrainCqeRingBatch(handler); + if (hadCqes) + { + numCompletions = 1; + waitHandled = true; + err = Interop.Error.SUCCESS; + return; + } + + // No CQEs available — submit pending SQEs and wait for at least 1 CQE. + uint enterFlags = IoUringConstants.EnterGetevents; + int ringFd = _managedRingFd; + if (_ioUringSqRingInfo.RegisteredRingFd >= 0) + { + enterFlags |= IoUringConstants.EnterRegisteredRing; + ringFd = _ioUringSqRingInfo.RegisteredRingFd; + } + + uint submitCount = _sqPollEnabled ? 0u : _ioUringManagedPendingSubmissions; + if (_sqPollEnabled && + _ioUringManagedPendingSubmissions != 0 && + SqNeedWakeup()) + { + enterFlags |= IoUringConstants.EnterSqWakeup; + } + + if (_managedUsesExtArg) + { + // Bounded wait with 50ms timeout via EXT_ARG. 
+ enterFlags |= IoUringConstants.EnterExtArg; + Interop.Sys.IoUringKernelTimespec timeout = default; + timeout.TvNsec = IoUringConstants.BoundedWaitTimeoutNanos; + Interop.Sys.IoUringGeteventsArg extArg = default; + extArg.Ts = (ulong)(nuint)(&timeout); + + int result; + err = Interop.Sys.IoUringShimEnterExt( + ringFd, submitCount, 1, enterFlags, &extArg, &result); + if (err == Interop.Error.SUCCESS) + { + _ioUringManagedPendingSubmissions = 0; + } + } + else + { + int result; + err = Interop.Sys.IoUringShimEnter( + ringFd, submitCount, 1, enterFlags, &result); + if (err == Interop.Error.SUCCESS) + { + _ioUringManagedPendingSubmissions = 0; + } + } + + // Drain after waking. + hadCqes = DrainCqeRingBatch(handler); + numCompletions = hadCqes ? 1 : 0; + numEvents = 0; + waitHandled = true; + err = Interop.Error.SUCCESS; + } + + /// Polls diagnostics after each event loop iteration. + partial void LinuxEventLoopAfterIteration() => + PollIoUringDiagnosticsIfNeeded(force: false); + + /// Unmaps rings and closes the ring fd. + partial void LinuxFreeIoUringResources() + { + // Managed io_uring teardown: release resources allocated during TryInitializeManagedIoUring. + // This must run BEFORE the common slot/buffer cleanup below because kernel + // unregister operations need the ring fd to still be open. + if (_ioUringInitialized) + { + // 0. Unregister/dispose provided buffer ring while the main ring fd is still open. + FreeIoUringProvidedBufferRing(); + + // 1. Drain pending registration changes — complete them with ECANCELED + // so callers waiting on CompletionEvent are unblocked. + if (_registrationChangeQueue is not null) + { + while (_registrationChangeQueue.TryDequeue(out RegistrationChangeRequest? request)) + { + request.Error = Interop.Error.ECANCELED; + request.Completed = true; + request.CompletionEvent.Set(); + } + _registrationChangeQueue = null; + } + + // 2. Clear registration tracking dictionaries. 
+ // Individual per-fd unregistration is skipped because _ioUringTeardownInitiated + // is already set, so TryUnregisterRegisteredFileForRegistration would early-return. + // The entire registered-file table is bulk-unregistered in step 3 instead. + if (_registrationsBySocket is not null) + { + _registrationsBySocket.Clear(); + _registrationsBySocket = null; + } + if (_registrationsByRequestId is not null) + { + _registrationsByRequestId.Clear(); + _registrationsByRequestId = null; + } + + // 3. Unregister the entire registered-file table from the kernel (bulk). + if (_usesRegisteredFiles && _managedRingFd >= 0) + { + int result; + Interop.Sys.IoUringShimRegister( + _managedRingFd, IoUringConstants.UnregisterFiles, null, 0u, &result); + _usesRegisteredFiles = false; + } + _registeredFiles = null; + _registeredFileFreeSlots = null; + _registeredFileFreeSlotCount = 0; + _registeredFileHotSocket = -1; + _registeredFileHotIndex = -1; + + // 4. The registered ring fd is implicitly released when the ring fd is closed. + // Just mark it as inactive so no subsequent code attempts to use it. + _ioUringSqRingInfo.RegisteredRingFd = -1; + + // 5. Close the wakeup eventfd. + if (_managedWakeupEventFd >= 0) + { + Interop.Sys.IoUringShimCloseFd(_managedWakeupEventFd); + _managedWakeupEventFd = -1; + } + + // 6. Unmap SQ/CQ rings, SQEs and close the ring fd. + // Closing the ring fd also terminates any kernel SQPOLL thread for this ring. + CleanupManagedRings(); + + // 7. Disable managed flags to prevent any late operations. 
+ _ioUringInitialized = false; + _managedCqDrainEnabled = false; + } + + bool portClosedForTeardown = Volatile.Read(ref _ioUringPortClosedForTeardown) != 0; + if (!portClosedForTeardown) + { + PollIoUringDiagnosticsIfNeeded(force: true); + } + + DrainQueuedIoUringOperationsForTeardown(); + + if (_ioUringOperationRegistry is not null) + { + DrainTrackedIoUringOperationsForTeardown(portClosedForTeardown); + Debug.Assert(_ioUringOperationRegistry.IsEmpty, $"Leaked tracked io_uring operations: {_ioUringOperationRegistry.Count}"); + _ioUringOperationRegistry = null; + + // Free any native memory still held by completion slots + if (_completionSlots is not null) + { + for (int i = 0; i < _completionSlots.Length; i++) + { + ReleaseZeroCopyPinHold(i); + ref IoUringCompletionSlot slot = ref _completionSlots[i]; + ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![i]; + if (slot.Kind == IoUringCompletionOperationKind.Message) + { + FreeMessageStorage(i); + } + else if (slot.Kind == IoUringCompletionOperationKind.Accept && slotStorage.NativeSocketAddressLengthPtr != null) + { + NativeMemory.Free(slotStorage.NativeSocketAddressLengthPtr); + slotStorage.NativeSocketAddressLengthPtr = null; + } + } + _completionSlots = null; + _completionSlotStorage = null; + _zeroCopyPinHolds = null; + _completionSlotFreeListHead = -1; + _completionSlotsInUse = 0; + } + + _ioUringSlotCapacity = 0; + _ioUringManagedPendingSubmissions = 0; + _ioUringManagedSqTail = 0; + _ioUringManagedSqTailLoaded = false; + _ioUringSqRingInfo = default; + _ioUringDirectSqeEnabled = false; + _sqPollEnabled = false; + + LogLinuxIoUringTeardownSummaryIfNeeded(); + } + + ResetIoUringPrepareQueueDepthTelemetry(); + + // Final flush of managed io_uring deltas in case teardown modified counters + // after the forced diagnostics poll and no further event-loop iteration runs. 
+ PublishIoUringManagedDiagnosticsDelta(); + } + + /// Publishes prepare queue depth delta to telemetry and resets the counter. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void ResetIoUringPrepareQueueDepthTelemetry() + { + long publishedDepth = Interlocked.Exchange(ref _ioUringPublishedPrepareQueueLength, 0); + if (publishedDepth != 0) + { + SocketsTelemetry.Log.IoUringPrepareQueueDepthDelta(-publishedDepth); + } + } + + /// Queued work item pairing an operation with its prepare sequence number for deferred SQE preparation. + private readonly struct IoUringPrepareWorkItem + { + /// The operation to prepare. + public readonly SocketAsyncContext.AsyncOperation Operation; + /// The sequence number that must match for the preparation to proceed. + public readonly long PrepareSequence; + + /// Creates a work item pairing an operation with its prepare sequence number. + public IoUringPrepareWorkItem(SocketAsyncContext.AsyncOperation operation, long prepareSequence) + { + Operation = operation; + PrepareSequence = prepareSequence; + } + } + + /// Enqueues an operation for deferred SQE preparation on the event loop thread. + internal bool TryEnqueueIoUringPreparation(SocketAsyncContext.AsyncOperation operation, long prepareSequence) + { + if (!_ioUringCapabilities.IsCompletionMode || Volatile.Read(ref _ioUringTeardownInitiated) != 0) + { + return false; + } + + MpscQueue? 
prepareQueue = _ioUringPrepareQueue; + if (prepareQueue is null) + { + return false; + } + + long queueLength = Interlocked.Increment(ref _ioUringPrepareQueueLength); + if (queueLength > s_ioUringPrepareQueueCapacity) + { + Interlocked.Decrement(ref _ioUringPrepareQueueLength); + long overflowCount = Interlocked.Increment(ref _ioUringPrepareQueueOverflowCount); + if ((overflowCount & DiagnosticSampleMask) == 1 && NetEventSource.Log.IsEnabled()) + { + LogIoUringPrepareQueueOverflow(overflowCount, s_ioUringPrepareQueueCapacity); + } + + return false; + } + + prepareQueue.Enqueue(new IoUringPrepareWorkItem(operation, prepareSequence)); + WakeEventLoop(); + return true; + } + + /// Removes an operation from the registry, logging on mismatch. + internal bool TryUntrackIoUringOperation(ulong userData, SocketAsyncContext.AsyncOperation? expectedOperation = null) + { + IoUringOperationRegistry? registry = _ioUringOperationRegistry; + if (registry is null) + { + return true; + } + + IoUringOperationRegistry.RemoveResult removeResult = + registry.TryUntrack(userData, expectedOperation, out _); + if (removeResult == IoUringOperationRegistry.RemoveResult.Mismatch) + { + Debug.Fail("io_uring tracked operation mismatch while untracking user_data."); + long mismatchCount = Interlocked.Increment(ref _ioUringUntrackMismatchCount); + if ((mismatchCount & DiagnosticSampleMask) == 1 && NetEventSource.Log.IsEnabled()) + { + LogIoUringUntrackMismatch(userData, mismatchCount); + } + + return false; + } + + return true; + } + + /// Attempts to replace the currently tracked operation for an existing user_data slot. + internal bool TryReplaceIoUringTrackedOperation(ulong userData, SocketAsyncContext.AsyncOperation newOperation) + { + return _ioUringOperationRegistry?.TryReplace(userData, newOperation) ?? false; + } + + /// Enqueues a user_data for ASYNC_CANCEL on the event loop thread. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool TryEnqueueIoUringCancellation(ulong userData) + { + if (!_ioUringCapabilities.IsCompletionMode || userData == 0 || Volatile.Read(ref _ioUringTeardownInitiated) != 0) + { + return false; + } + + MpscQueue? cancelQueue = _ioUringCancelQueue; + if (cancelQueue is null) + { + return false; + } + + for (int attempt = 0; ; attempt++) + { + long queueLength = Interlocked.Increment(ref _ioUringCancelQueueLength); + if (queueLength <= s_ioUringCancellationQueueCapacity) + { + cancelQueue.Enqueue(userData); + return true; + } + + Interlocked.Decrement(ref _ioUringCancelQueueLength); + if (attempt == 0) + { + // Queue can be transiently full under burst cancellation. + // Nudge the event loop to drain, then retry once before recording overflow. + WakeEventLoop(); + Thread.SpinWait(64); + continue; + } + + long overflowCount = Interlocked.Increment(ref _ioUringCancelQueueOverflowCount); + if ((overflowCount & DiagnosticSampleMask) == 1 && NetEventSource.Log.IsEnabled()) + { + LogIoUringCancellationQueueOverflow(overflowCount, s_ioUringCancellationQueueCapacity); + } + + return false; + } + } + + /// Writes an ASYNC_CANCEL SQE directly if the engine is on the event loop thread. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool TryQueueIoUringAsyncCancel(ulong userData) + { + if (!_ioUringCapabilities.IsIoUringPort || userData == 0) + { + return false; + } + + if (!TryAcquireManagedSqeWithRetry(out IoUringSqe* sqe, out _)) + { + return false; + } + + WriteAsyncCancelSqe(sqe, userData); + return true; + } + + /// Writes to the eventfd to wake the event loop from a blocking wait. + [MethodImpl(MethodImplOptions.NoInlining)] + private Interop.Error ManagedWakeEventLoop() + { + return Interop.Sys.IoUringShimWriteEventFd(_managedWakeupEventFd); + } + + /// Sends a coalesced wake signal to the event loop thread. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void WakeEventLoop() + { + if (!_ioUringCapabilities.IsCompletionMode || Volatile.Read(ref _ioUringTeardownInitiated) != 0) + { + return; + } + + if (Interlocked.Exchange(ref _ioUringWakeupRequested, 1) != 0) + { + return; + } + + Interop.Error error = ManagedWakeEventLoop(); + if (error != Interop.Error.SUCCESS) + { + // Reset flag so the next producer can retry the eventfd write. + // Worst case under sustained wake failure: work is picked up on the next bounded wait cycle. + Volatile.Write(ref _ioUringWakeupRequested, 0); + if (NetEventSource.Log.IsEnabled()) + { + LogIoUringWakeFailure(error); + } + } + } + + /// Enqueues a cancellation request and wakes the event loop. + internal void TryRequestIoUringCancellation(ulong userData) + { + if (!TryEnqueueIoUringCancellation(userData)) + { + return; + } + + WakeEventLoop(); + } + + /// + /// Enqueues a readiness fallback event when io_uring submission is congested. + /// + internal void EnqueueReadinessFallbackEvent( + SocketAsyncContext context, + Interop.Sys.SocketEvents events, + bool countAsPrepareQueueOverflowFallback = false) + { + if (events == Interop.Sys.SocketEvents.None) + { + return; + } + + _eventQueue.Enqueue(new SocketIOEvent(context, events)); + if (countAsPrepareQueueOverflowFallback) + { + RecordIoUringPrepareQueueOverflowFallback(); + } + EnsureWorkerScheduled(); + } + + /// Drains queued cancellation requests into ASYNC_CANCEL SQEs. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool DrainIoUringCancellationQueue() + { + MpscQueue? 
cancelQueue = _ioUringCancelQueue; + if (cancelQueue is null) + { + return false; + } + + bool preparedSqe = false; + for (int drained = 0; drained < MaxIoUringCancelQueueDrainPerSubmit && + cancelQueue.TryDequeue(out ulong userData); drained++) + { + long remainingLength = Interlocked.Decrement(ref _ioUringCancelQueueLength); + Debug.Assert(remainingLength >= 0); + + // Cancellation requests can race with terminal completion/untracking. + // Skip stale requests to avoid issuing known -ENOENT async-cancel SQEs. + if (!IsTrackedIoUringOperation(userData)) + { + continue; + } + + if (TryQueueIoUringAsyncCancel(userData)) + { + preparedSqe = true; + } + } + return preparedSqe; + } + + /// Drains both prepare and cancel queues, then submits all pending SQEs. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private Interop.Error SubmitIoUringBatch() + { + if (!_ioUringCapabilities.IsIoUringPort) + { + return Interop.Error.SUCCESS; + } + + Debug.Assert(IsCurrentThreadEventLoopThread(), + "SubmitIoUringBatch must only be called from the event loop thread (SINGLE_ISSUER contract)."); + bool preparedSqe = false; + if (_ioUringCapabilities.IsCompletionMode) + { + // Clear the coalesced wake flag before draining queues so producers that enqueue + // during this drain window can publish a new wake signal without being suppressed. + Volatile.Write(ref _ioUringWakeupRequested, 0); + + preparedSqe |= DrainIoUringCancellationQueue(); + + MpscQueue? 
prepareQueue = _ioUringPrepareQueue; + if (prepareQueue is null) + { + return Interop.Error.EINVAL; + } + + for (int drained = 0; drained < MaxIoUringPrepareQueueDrainPerSubmit && + prepareQueue.TryDequeue(out IoUringPrepareWorkItem workItem); drained++) + { + long remainingLength = Interlocked.Decrement(ref _ioUringPrepareQueueLength); + Debug.Assert(remainingLength >= 0); + Interop.Error prepareError = TryPrepareAndTrackIoUringOperation( + workItem.Operation, + workItem.PrepareSequence, + out bool preparedOperation); + if (prepareError != Interop.Error.SUCCESS) + { + return prepareError; + } + + preparedSqe |= preparedOperation; + if (!preparedOperation && workItem.Operation.IsInWaitingState()) + { + if (IsPotentialCompletionSlotExhaustion()) + { + int retryCount = workItem.Operation.IncrementIoUringSlotExhaustionRetryCount(); + if (retryCount < MaxSlotExhaustionRetries && + workItem.Operation.TryQueueIoUringPreparation()) + { + continue; + } + } + + workItem.Operation.ResetIoUringSlotExhaustionRetryCount(); + EmitReadinessFallbackForUnpreparedOperation(workItem.Operation); + } + } + + } + + if (!preparedSqe) + { + // Inline re-prepare paths can write SQEs outside queue drains; ensure they are submitted. + if (_ioUringManagedPendingSubmissions != 0) + { + return SubmitIoUringOperationsNormalized(); + } + + if ((_ioUringCancelQueue?.IsEmpty == false) || (_ioUringPrepareQueue?.IsEmpty == false)) + { + WakeEventLoop(); + } + + return Interop.Error.SUCCESS; + } + + return SubmitIoUringOperationsNormalized(); + } + + /// + /// Prepares an operation for io_uring submission and tracks it in the completion registry. + /// On non-prepared paths, clears operation user_data and releases preparation resources. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private Interop.Error TryPrepareAndTrackIoUringOperation( + SocketAsyncContext.AsyncOperation operation, + long prepareSequence, + out bool preparedSqe) + { + preparedSqe = false; + + bool prepared = operation.TryPrepareIoUring(operation.AssociatedContext, prepareSequence); + if (prepared) + { + AssertIoUringLifecycleTransition( + IoUringOperationLifecycleState.Queued, + IoUringOperationLifecycleState.Prepared); + } + + if (prepared && operation.ErrorCode == SocketError.Success) + { + preparedSqe = true; + if (!TryTrackPreparedIoUringOperation(operation)) + { + // Invariant violation: tracking collision after prepare. + // A prepared SQE may now complete without a managed owner; do not attempt best-effort recovery. + // Surface InternalException to terminate the engine path deterministically. + Debug.Fail("io_uring prepared operation could not be tracked by user_data."); + operation.ClearIoUringUserData(); + return Interop.Error.EINVAL; + } + + return Interop.Error.SUCCESS; + } + + if (prepared) + { + AssertIoUringLifecycleTransition( + IoUringOperationLifecycleState.Prepared, + IoUringOperationLifecycleState.Detached); + } + + if (!TryUntrackIoUringOperation(operation.IoUringUserData, operation)) + { + // Mismatch indicates token ownership confusion; avoid releasing + // resources that may still be associated with another tracked op. + return Interop.Error.EINVAL; + } + + operation.ClearIoUringUserData(); + return Interop.Error.SUCCESS; + } + + /// + /// Falls back to readiness notification for an operation that remained waiting after a failed prepare attempt. 
+        /// </summary>
+        private void EmitReadinessFallbackForUnpreparedOperation(SocketAsyncContext.AsyncOperation operation)
+        {
+            operation.ClearIoUringUserData();
+            Interop.Sys.SocketEvents fallbackEvents = operation.GetIoUringFallbackSocketEvents();
+            if (fallbackEvents == Interop.Sys.SocketEvents.None)
+            {
+                return;
+            }
+
+            if (NetEventSource.Log.IsEnabled())
+            {
+                LogIoUringPrepareFallbackToReadiness(fallbackEvents);
+            }
+
+            EnqueueReadinessFallbackEvent(operation.AssociatedContext, fallbackEvents);
+
+            [MethodImpl(MethodImplOptions.NoInlining)]
+            void LogIoUringPrepareFallbackToReadiness(Interop.Sys.SocketEvents events)
+            {
+                NetEventSource.Error(
+                    this,
+                    $"io_uring prepare fallback to readiness notification: events={events}");
+            }
+        }
+
+        /// <summary>Registers a prepared operation in the completion registry.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private bool TryTrackPreparedIoUringOperation(SocketAsyncContext.AsyncOperation operation)
+        {
+            // Registry may be null before engine init or after teardown.
+            IoUringOperationRegistry? registry = _ioUringOperationRegistry;
+            if (registry is null)
+            {
+                return false;
+            }
+
+            if (registry.TryTrack(operation))
+            {
+                return true;
+            }
+
+            // Persistent multishot receive can rebind an existing tracked user_data to a new
+            // managed operation before this call. In that case, tracking is already satisfied.
+            return operation.IoUringUserData != 0 &&
+                registry.TryGet(operation.IoUringUserData, out SocketAsyncContext.AsyncOperation? trackedOperation) &&
+                ReferenceEquals(trackedOperation, operation);
+        }
+
+        /// <summary>Returns whether the given user_data is currently tracked.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private bool IsTrackedIoUringOperation(ulong userData)
+        {
+            IoUringOperationRegistry? registry = _ioUringOperationRegistry;
+            return registry is not null && registry.Contains(userData);
+        }
+
+        /// <summary>Returns whether current completion-slot usage indicates likely slot exhaustion pressure.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private bool IsPotentialCompletionSlotExhaustion()
+        {
+            IoUringCompletionSlot[]? completionSlots = _completionSlots;
+            if (completionSlots is null || completionSlots.Length == 0)
+            {
+                return false;
+            }
+
+            // "Within 16 slots of capacity" counts as exhaustion pressure.
+            int threshold = Math.Max(0, completionSlots.Length - 16);
+            return _completionSlotsInUse >= threshold;
+        }
+
+        /// <summary>Returns whether the calling thread is the event loop thread.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private bool IsCurrentThreadEventLoopThread() =>
+            Volatile.Read(ref _eventLoopManagedThreadId) == Environment.CurrentManagedThreadId;
+
+        /// <summary>Returns whether a submit error indicates an unsupported operation rather than a real failure.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsIgnoredIoUringSubmitError(Interop.Error error) =>
+            error == Interop.Error.ENOSYS || error == Interop.Error.ENOTSUP || error == Interop.Error.EOPNOTSUPP;
+
+        /// <summary>Submits the specified number of pending SQEs via io_uring_enter.</summary>
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        private unsafe Interop.Error ManagedSubmitPendingEntries(uint toSubmit)
+        {
+            if (toSubmit == 0)
+                return Interop.Error.SUCCESS;
+
+            Debug.Assert(IsCurrentThreadEventLoopThread(),
+                "ManagedSubmitPendingEntries must only be called from the event loop thread (SINGLE_ISSUER contract).");
+            if (_sqPollEnabled)
+            {
+                // SQPOLL mode: the kernel polling thread consumes SQEs on its own; only
+                // call enter() (with SQ_WAKEUP) when the poller has gone to sleep.
+                if (!SqNeedWakeup())
+                {
+                    SocketsTelemetry.Log.IoUringSqPollSubmissionSkipped(toSubmit);
+                    return Interop.Error.SUCCESS;
+                }
+
+                uint wakeupFlags = IoUringConstants.EnterSqWakeup;
+                int wakeupRingFd = _managedRingFd;
+                if (_ioUringSqRingInfo.RegisteredRingFd >= 0)
+                {
+                    wakeupFlags |= IoUringConstants.EnterRegisteredRing;
+                    wakeupRingFd = _ioUringSqRingInfo.RegisteredRingFd;
+                }
+
+                if (NetEventSource.Log.IsEnabled())
+                {
+                    LogSqPollWakeup(this, toSubmit);
+                }
+
+                SocketsTelemetry.Log.IoUringSqPollWakeup();
+                int wakeupResult;
+                return Interop.Sys.IoUringShimEnter(wakeupRingFd, 0, 0, wakeupFlags, &wakeupResult);
+            }
+
+            uint enterFlags = 0;
+            int ringFd = _managedRingFd;
+            if (_ioUringSqRingInfo.RegisteredRingFd >= 0)
+            {
+                enterFlags |= IoUringConstants.EnterRegisteredRing;
+                ringFd = _ioUringSqRingInfo.RegisteredRingFd;
+            }
+
+            // enter() may consume fewer SQEs than requested; loop until all are submitted.
+            while (toSubmit > 0)
+            {
+                int result;
+                Interop.Error err = Interop.Sys.IoUringShimEnter(ringFd, toSubmit, 0, enterFlags, &result);
+                if (err != Interop.Error.SUCCESS)
+                    return err;
+
+                // Zero progress with no error: report EAGAIN rather than spinning.
+                if (result <= 0)
+                {
+                    return Interop.Error.EAGAIN;
+                }
+
+                toSubmit -= (uint)result;
+            }
+            return Interop.Error.SUCCESS;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        private static void LogSqPollWakeup(SocketAsyncEngine engine, uint pendingSubmissionCount) =>
+            NetEventSource.Info(engine, $"io_uring SQPOLL wakeup requested for pending SQEs: {pendingSubmissionCount}");
+
+        /// <summary>Computes pending submissions and calls ManagedSubmitPendingEntries.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private Interop.Error SubmitIoUringOperationsNormalized()
+        {
+            Debug.Assert(IsCurrentThreadEventLoopThread(),
+                "SubmitIoUringOperationsNormalized must only be called from the event loop thread (SINGLE_ISSUER contract).");
+            PublishManagedSqeTail();
+            uint managedPending = _ioUringManagedPendingSubmissions;
+            _ioUringManagedPendingSubmissions = 0;
+
+            Interop.Error error = ManagedSubmitPendingEntries(managedPending);
+
+            if (error != Interop.Error.SUCCESS && managedPending != 0)
+            {
+                // NOTE(review): this restores the FULL pending count even though
+                // ManagedSubmitPendingEntries may have submitted part of it before failing —
+                // confirm partially submitted entries cannot be double-counted here.
+                _ioUringManagedPendingSubmissions += managedPending;
+            }
+
+            // Unsupported-operation errors (ENOSYS/ENOTSUP/EOPNOTSUPP) are swallowed.
+            return IsIgnoredIoUringSubmitError(error) ? Interop.Error.SUCCESS : error;
+        }
+
+        /// <summary>Cancels all queued-but-not-submitted operations during teardown.</summary>
+        private void DrainQueuedIoUringOperationsForTeardown()
+        {
+            MpscQueue<IoUringPrepareWorkItem>? prepareQueue = _ioUringPrepareQueue; /* NOTE(review): generic argument reconstructed from garbled extraction — verify */
+            if (prepareQueue is not null)
+            {
+                while (prepareQueue.TryDequeue(out IoUringPrepareWorkItem workItem))
+                {
+                    long remainingLength = Interlocked.Decrement(ref _ioUringPrepareQueueLength);
+                    Debug.Assert(remainingLength >= 0);
+
+                    SocketAsyncContext.AsyncOperation operation = workItem.Operation;
+                    operation.CancelPendingIoUringPreparation(workItem.PrepareSequence);
+                    operation.TryCancelForTeardown();
+                    operation.ClearIoUringUserData();
+                }
+            }
+
+            MpscQueue<ulong>? cancelQueue = _ioUringCancelQueue; /* NOTE(review): element type reconstructed (presumably user_data tokens) — verify */
+            if (cancelQueue is not null)
+            {
+                while (cancelQueue.TryDequeue(out _))
+                {
+                    long remainingLength = Interlocked.Decrement(ref _ioUringCancelQueueLength);
+                    Debug.Assert(remainingLength >= 0);
+                }
+            }
+
+            Volatile.Write(ref _ioUringWakeupRequested, 0);
+        }
+
+        /// <summary>
+        /// Cancels all tracked in-flight operations during teardown.
+        /// This includes any future long-lived operations (for example multishot recv).
+        /// </summary>
+        private void DrainTrackedIoUringOperationsForTeardown(bool portClosedForTeardown)
+        {
+            IoUringOperationRegistry? registry = _ioUringOperationRegistry;
+            if (registry is null || registry.IsEmpty)
+            {
+                return;
+            }
+
+            bool queuedAsyncCancel = false;
+            // Async-cancel SQEs can only be written from the event-loop thread, and only
+            // while the port is still open.
+            bool canPrepareTeardownCancels = !portClosedForTeardown && IsCurrentThreadEventLoopThread();
+            foreach (SocketAsyncContext.AsyncOperation operation in registry.DrainAllTrackedOperations())
+            {
+                ulong userData = operation.IoUringUserData;
+                if (canPrepareTeardownCancels &&
+                    TryQueueIoUringAsyncCancel(userData))
+                {
+                    queuedAsyncCancel = true;
+                }
+
+                // Teardown policy: if the port was already closed, native ownership has been
+                // detached and it is now safe to release operation-owned resources eagerly.
+                // Otherwise, queue best-effort async cancel before releasing resources.
+                operation.TryCancelForTeardown();
+                operation.ClearIoUringUserData();
+            }
+
+            if (canPrepareTeardownCancels && queuedAsyncCancel)
+            {
+                Interop.Error submitError = SubmitIoUringOperationsNormalized();
+                if (submitError != Interop.Error.SUCCESS)
+                {
+                    if (NetEventSource.Log.IsEnabled()) LogIoUringAsyncCancelSubmitFailure(submitError, IoUringCancellationOrigin.Teardown);
+                }
+            }
+        }
+
+        /// <summary>Increments the late-completion counter and samples to the log.</summary>
+        private void RecordBenignLateIoUringCompletion(ulong userData)
+        {
+            RecordIoUringCounterAndMaybeLog(ref _ioUringBenignLateCompletionCount, userData, "io_uring completion arrived after managed untrack");
+        }
+
+        /// <summary>Increments the poll-readiness CQE diagnostic counter.</summary>
+        private static void RecordIoUringPollReadinessCqe()
+        {
+            Interlocked.Increment(ref s_ioUringPollReadinessCqeCount);
+        }
+
+        /// <summary>Increments the diagnostic counter tracking pending completion retries that queued prepare work.</summary>
+        private static void RecordIoUringPendingRetryQueuedToPrepareQueue()
+        {
+            Interlocked.Increment(ref s_ioUringPendingRetryQueuedToPrepareQueueCount);
+        }
+
+        /// <summary>Increments the completion-slot exhaustion counter.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void RecordIoUringCompletionSlotExhaustion()
+        {
+            Interlocked.Increment(ref _ioUringCompletionSlotExhaustionCount);
+        }
+
+        /// <summary>Increments the completion-slot drain-recovery counter.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void RecordIoUringCompletionSlotDrainRecovery()
+        {
+            Interlocked.Increment(ref _ioUringCompletionSlotDrainRecoveryCount);
+        }
+
+        /// <summary>Increments the prepare-queue overflow fallback counter.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void RecordIoUringPrepareQueueOverflowFallback()
+        {
+            Interlocked.Increment(ref _ioUringPrepareQueueOverflowFallbackCount);
+        }
+
+        /// <summary>Increments the requeue-failure counter and samples to the log.</summary>
+        private void RecordIoUringCompletionRequeueFailure(ulong userData)
+        {
+            RecordIoUringCounterAndMaybeLog(ref _ioUringCompletionRequeueFailureCount, userData, "io_uring completion requeue failed; queued readiness fallback");
+        }
+
+        /// <summary>Increments a counter and logs a sample every 64 increments.</summary>
+        private void RecordIoUringCounterAndMaybeLog(ref long counter, ulong userData, string message)
+        {
+            long count = Interlocked.Increment(ref counter);
+            // Sampled logging: fires on the first increment and then once per
+            // (DiagnosticSampleMask + 1) increments — assumes the mask is 63 per the summary.
+            if ((count & DiagnosticSampleMask) == 1 && NetEventSource.Log.IsEnabled())
+            {
+                LogIoUringCounterSample(message, count, userData);
+            }
+        }
+
+        /// <summary>Logs the teardown summary if any late completions were recorded.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void LogLinuxIoUringTeardownSummaryIfNeeded()
+        {
+            long lateCompletionCount = Interlocked.Read(ref _ioUringBenignLateCompletionCount);
+            if (lateCompletionCount > 0 && NetEventSource.Log.IsEnabled())
+            {
+                LogIoUringTeardownSummary(lateCompletionCount);
+            }
+        }
+
+        /// <summary>Periodically polls native counters and publishes deltas to telemetry.</summary>
+        private void PollIoUringDiagnosticsIfNeeded(bool force)
+        {
+            if (!_ioUringCapabilities.IsIoUringPort)
+            {
+                return;
+            }
+
+            // Non-forced calls are rate-limited by a simple countdown; forced calls bypass it.
+            if (!force)
+            {
+                int countdown = _ioUringDiagnosticsPollCountdown - 1;
+                _ioUringDiagnosticsPollCountdown = countdown;
+                if (countdown > 0)
+                {
+                    return;
+                }
+            }
+
+            _ioUringDiagnosticsPollCountdown = IoUringDiagnosticsPollInterval;
+            PublishIoUringManagedDiagnosticsDelta();
+            if (!_ioUringAdvancedFeatureStateLogged && NetEventSource.Log.IsEnabled())
+            {
+                _ioUringAdvancedFeatureStateLogged = true;
+                LogIoUringAdvancedFeatureState();
+            }
+
+            if (!force)
+            {
+                EvaluateProvidedBufferRingResize();
+            }
+        }
+
+        /// <summary>Returns the non-negative delta between two counter snapshots.</summary>
+        private static long ComputeManagedCounterDelta(long previous, long current) =>
+            // If the counter appears to have gone backwards (reset), report the current
+            // value itself as the delta rather than a negative number.
+            current >= previous ? current - previous : current;
+
+        /// <summary>Computes and publishes the global non-pinnable fallback counter delta.</summary>
+        private static long GetIoUringNonPinnablePrepareFallbackDelta()
+        {
+            // This counter is process-wide and shared across all engines. Serialize publication so
+            // concurrent engine loops do not double-publish or observe torn baseline updates.
+            if (Interlocked.CompareExchange(ref s_ioUringPublishingNonPinnablePrepareFallback, 1, 0) != 0)
+            {
+                return 0;
+            }
+
+            try
+            {
+                long current = SocketAsyncContext.GetIoUringNonPinnablePrepareFallbackCount();
+                long previous = Interlocked.Exchange(ref s_ioUringPublishedNonPinnablePrepareFallbackCount, current);
+                return ComputeManagedCounterDelta(previous, current);
+            }
+            finally
+            {
+                Volatile.Write(ref s_ioUringPublishingNonPinnablePrepareFallback, 0);
+            }
+        }
+
+        /// <summary>Publishes all managed diagnostic counter deltas to telemetry.</summary>
+        private void PublishIoUringManagedDiagnosticsDelta()
+        {
+            long requeueFailureCurrent = Interlocked.Read(ref _ioUringCompletionRequeueFailureCount);
+            long requeueFailurePrevious = Volatile.Read(ref _ioUringPublishedCompletionRequeueFailureCount);
+            long requeueFailureDelta = ComputeManagedCounterDelta(requeueFailurePrevious, requeueFailureCurrent);
+            Volatile.Write(ref _ioUringPublishedCompletionRequeueFailureCount, requeueFailureCurrent);
+
+            long nonPinnableFallbackDelta = GetIoUringNonPinnablePrepareFallbackDelta();
+            long prepareQueueOverflowCurrent = Interlocked.Read(ref _ioUringPrepareQueueOverflowCount);
+            long prepareQueueOverflowPrevious = Volatile.Read(ref _ioUringPublishedPrepareQueueOverflowCount);
+            long prepareQueueOverflowDelta = ComputeManagedCounterDelta(prepareQueueOverflowPrevious, prepareQueueOverflowCurrent);
+            Volatile.Write(ref _ioUringPublishedPrepareQueueOverflowCount, prepareQueueOverflowCurrent);
+            long prepareQueueOverflowFallbackCurrent = Interlocked.Read(ref _ioUringPrepareQueueOverflowFallbackCount);
+            long prepareQueueOverflowFallbackPrevious = Volatile.Read(ref _ioUringPublishedPrepareQueueOverflowFallbackCount);
+            long prepareQueueOverflowFallbackDelta = ComputeManagedCounterDelta(prepareQueueOverflowFallbackPrevious, prepareQueueOverflowFallbackCurrent);
+            Volatile.Write(ref _ioUringPublishedPrepareQueueOverflowFallbackCount, prepareQueueOverflowFallbackCurrent);
+            long prepareQueueLengthCurrent = Interlocked.Read(ref _ioUringPrepareQueueLength);
+            long prepareQueueLengthPrevious = Volatile.Read(ref _ioUringPublishedPrepareQueueLength);
+            // Queue depth is a gauge, not a monotonic counter: its delta may legitimately be negative.
+            long prepareQueueDepthDelta = prepareQueueLengthCurrent - prepareQueueLengthPrevious;
+            Volatile.Write(ref _ioUringPublishedPrepareQueueLength, prepareQueueLengthCurrent);
+            long completionSlotExhaustionCurrent = Interlocked.Read(ref _ioUringCompletionSlotExhaustionCount);
+            long completionSlotExhaustionPrevious = Volatile.Read(ref _ioUringPublishedCompletionSlotExhaustionCount);
+            long completionSlotExhaustionDelta = ComputeManagedCounterDelta(completionSlotExhaustionPrevious, completionSlotExhaustionCurrent);
+            Volatile.Write(ref _ioUringPublishedCompletionSlotExhaustionCount, completionSlotExhaustionCurrent);
+            long completionSlotDrainRecoveryCurrent = Interlocked.Read(ref _ioUringCompletionSlotDrainRecoveryCount);
+            long completionSlotDrainRecoveryPrevious = Volatile.Read(ref _ioUringPublishedCompletionSlotDrainRecoveryCount);
+            long completionSlotDrainRecoveryDelta = ComputeManagedCounterDelta(completionSlotDrainRecoveryPrevious, completionSlotDrainRecoveryCurrent);
+            Volatile.Write(ref _ioUringPublishedCompletionSlotDrainRecoveryCount, completionSlotDrainRecoveryCurrent);
+
+            if (requeueFailureDelta != 0)
+            {
+                SocketsTelemetry.Log.IoUringCompletionRequeueFailure(requeueFailureDelta);
+            }
+
+            if (nonPinnableFallbackDelta != 0)
+            {
+                SocketsTelemetry.Log.IoUringPrepareNonPinnableFallback(nonPinnableFallbackDelta);
+            }
+
+            if (prepareQueueOverflowDelta != 0)
+            {
+                SocketsTelemetry.Log.IoUringPrepareQueueOverflow(prepareQueueOverflowDelta);
+            }
+
+            if (prepareQueueOverflowFallbackDelta != 0)
+            {
+                SocketsTelemetry.Log.IoUringPrepareQueueOverflowFallback(prepareQueueOverflowFallbackDelta);
+            }
+
+            if (prepareQueueDepthDelta != 0)
+            {
+                SocketsTelemetry.Log.IoUringPrepareQueueDepthDelta(prepareQueueDepthDelta);
+            }
+
+            if (completionSlotExhaustionDelta != 0)
+            {
+                SocketsTelemetry.Log.IoUringCompletionSlotExhaustion(completionSlotExhaustionDelta);
+            }
+
+            if (completionSlotDrainRecoveryDelta != 0)
+            {
+                SocketsTelemetry.Log.IoUringCompletionSlotDrainRecovery(completionSlotDrainRecoveryDelta);
+            }
+        }
+
+        private readonly partial struct SocketEventHandler
+        {
+            /// <summary>Delivers a completed operation to its owning socket context.</summary>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private void DispatchCompletedIoUringOperation(SocketAsyncContext.AsyncOperation operation, ulong userData)
+            {
+                // A false return means the context no longer owns the operation; count it
+                // as a benign late completion rather than failing.
+                if (!operation.AssociatedContext.TryCompleteIoUringOperation(operation))
+                {
+                    _engine.RecordBenignLateIoUringCompletion(userData);
+                }
+            }
+
+            /// <summary>Completes a deferred SEND_ZC operation when its NOTIF CQE arrives.</summary>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            public void DispatchZeroCopyIoUringNotification(ulong payload)
+            {
+                IoUringOperationRegistry? registry = _engine._ioUringOperationRegistry;
+                if (registry is null)
+                {
+                    return;
+                }
+
+                ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload);
+                if (!registry.TryTake(userData, out SocketAsyncContext.AsyncOperation? operation) || operation is null)
+                {
+                    return;
+                }
+
+                Debug.Assert(
+                    !_engine.IsZeroCopyNotificationPending(userData),
+                    "NOTIF CQE dispatch must occur only after clearing SEND_ZC pending slot state.");
+                Debug.Assert(
+                    operation.IoUringUserData == userData,
+                    "Deferred SEND_ZC operation must still be tracked with its original user_data at NOTIF dispatch.");
+                AssertIoUringLifecycleTransition(
+                    IoUringOperationLifecycleState.Submitted,
+                    IoUringOperationLifecycleState.Completed);
+                operation.ClearIoUringUserData();
+                DispatchCompletedIoUringOperation(operation, userData);
+            }
+
+            /// <summary>Processes a single completion and dispatches it to its owning operation.</summary>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            public void DispatchSingleIoUringCompletion(
+                ulong userData,
+                int result,
+                uint flags,
+                int socketAddressLen,
+                int controlBufferLen,
+                uint auxiliaryData,
+                bool hasFixedRecvBuffer,
+                ushort fixedRecvBufferId,
+                ref bool enqueuedFallbackEvent)
+            {
+                Debug.Assert(_engine.IsCurrentThreadEventLoopThread(),
+                    "DispatchSingleIoUringCompletion must only run on the event-loop thread.");
+                // user_data of 0 means no managed owner; just recycle any attached buffers.
+                if (userData == 0)
+                {
+                    RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId);
+                    return;
+                }
+
+                IoUringOperationRegistry? registry = _engine._ioUringOperationRegistry;
+                if (registry is null)
+                {
+                    RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId);
+                    return;
+                }
+
+                // Benign race: cancellation/abort paths may have already removed this tracked entry.
+                if (!registry.TryTake(userData, out SocketAsyncContext.AsyncOperation? operation))
+                {
+                    RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId);
+                    _engine.RecordBenignLateIoUringCompletion(userData);
+                    return;
+                }
+
+                if (operation is null)
+                {
+                    RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId);
+                    return;
+                }
+
+                SocketAsyncContext receiveContext = operation.AssociatedContext;
+                if (receiveContext.IsPersistentMultishotRecvArmed() &&
+                    receiveContext.PersistentMultishotRecvUserData == userData)
+                {
+                    // Terminal CQE for persistent multishot recv (normal completion, cancel,
+                    // ENOBUFS, EOF, or other error): clear armed-state so the next receive can re-arm.
+                    SocketsTelemetry.Log.IoUringPersistentMultishotRecvTermination();
+                    receiveContext.ClearPersistentMultishotRecvArmed();
+                }
+
+                if (operation is SocketAsyncContext.AcceptOperation acceptOperation &&
+                    acceptOperation.AssociatedContext.MultishotAcceptUserData == userData)
+                {
+                    acceptOperation.AssociatedContext.DisarmMultishotAccept();
+                }
+
+                uint completionAuxiliaryData = auxiliaryData;
+                int completionResultCode = result;
+                if (!TryMaterializeIoUringReceiveCompletion(
+                    operation!,
+                    completionResultCode,
+                    flags,
+                    hasFixedRecvBuffer,
+                    fixedRecvBufferId,
+                    ref completionAuxiliaryData))
+                {
+                    // Provided-buffer payload could not be materialized; surface ENOBUFS.
+                    completionResultCode = -Interop.Sys.ConvertErrorPalToPlatform(Interop.Error.ENOBUFS);
+                    completionAuxiliaryData = 0;
+                }
+
+                // Process completion metadata before processing result to allow message post-processing.
+                operation!.SetIoUringCompletionMessageMetadata(socketAddressLen, controlBufferLen);
+                SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionDispatchResult =
+                    operation.ProcessIoUringCompletionResult(completionResultCode, flags, completionAuxiliaryData);
+
+                if (completionDispatchResult == SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed &&
+                    _engine.IsZeroCopyNotificationPending(userData))
+                {
+                    // SEND_ZC API contract: complete managed operation only once NOTIF confirms
+                    // the kernel/NIC no longer references the caller buffer.
+                    _engine.AssertZeroCopyDeferredCompletionState(userData, operation);
+                    if (!registry.TryReattach(userData, operation))
+                    {
+                        ThrowInternalException(Interop.Error.EINVAL);
+                        return;
+                    }
+
+                    return;
+                }
+
+                DispatchIoUringCompletionResult(
+                    operation,
+                    completionDispatchResult,
+                    userData,
+                    ref enqueuedFallbackEvent);
+            }
+
+            /// <summary>
+            /// Processes a multishot completion by completing the current operation and
+            /// requesting async cancel for non-terminal shots until full item-9 dispatch lands.
+            /// </summary>
+            [MethodImpl(MethodImplOptions.NoInlining)]
+            public void DispatchMultishotIoUringCompletion(
+                ulong userData,
+                int result,
+                uint flags,
+                int socketAddressLen,
+                int controlBufferLen,
+                uint auxiliaryData,
+                bool hasFixedRecvBuffer,
+                ushort fixedRecvBufferId,
+                ref bool enqueuedFallbackEvent)
+            {
+                Debug.Assert(_engine.IsCurrentThreadEventLoopThread(),
+                    "DispatchMultishotIoUringCompletion must only run on the event-loop thread.");
+                _ = enqueuedFallbackEvent; // Transitional path never requeues via readiness fallback.
+                _ = hasFixedRecvBuffer;
+                _ = fixedRecvBufferId;
+                Debug.Assert((flags & IoUringConstants.CqeFMore) != 0,
+                    "Multishot dispatch must only be used for non-terminal CQEs (IORING_CQE_F_MORE).");
+
+                if (userData == 0)
+                {
+                    RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer: false, fixedRecvBufferId: 0);
+                    return;
+                }
+
+                IoUringOperationRegistry? registry = _engine._ioUringOperationRegistry;
+                if (registry is null)
+                {
+                    RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer: false, fixedRecvBufferId: 0);
+                    return;
+                }
+
+                // TryGet (not TryTake): non-terminal shots leave the entry tracked.
+                if (!registry.TryGet(userData, out SocketAsyncContext.AsyncOperation? operation) || operation is null)
+                {
+                    RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer: false, fixedRecvBufferId: 0);
+                    _engine.RecordBenignLateIoUringCompletion(userData);
+                    return;
+                }
+
+                if (operation is SocketAsyncContext.AcceptOperation acceptOperation)
+                {
+                    DispatchMultishotAcceptIoUringCompletion(
+                        acceptOperation,
+                        userData,
+                        result,
+                        flags,
+                        socketAddressLen,
+                        auxiliaryData);
+                    return;
+                }
+
+                if (!operation.IsInWaitingState())
+                {
+                    // No managed receive is waiting: buffer the payload if possible,
+                    // otherwise cancel the multishot request.
+                    if (!TryBufferEarlyPersistentMultishotRecvCompletion(operation.AssociatedContext, result, flags))
+                    {
+                        _engine.TryRequestIoUringCancellation(userData);
+                    }
+
+                    return;
+                }
+
+                uint completionAuxiliaryData = auxiliaryData;
+                int completionResultCode = result;
+                if (!TryMaterializeIoUringReceiveCompletion(
+                    operation,
+                    completionResultCode,
+                    flags,
+                    hasFixedRecvBuffer: false,
+                    fixedRecvBufferId: 0,
+                    ref completionAuxiliaryData))
+                {
+                    completionResultCode = -Interop.Sys.ConvertErrorPalToPlatform(Interop.Error.ENOBUFS);
+                    completionAuxiliaryData = 0;
+                }
+
+                operation.SetIoUringCompletionMessageMetadata(socketAddressLen, controlBufferLen);
+                SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionDispatchResult =
+                    operation.ProcessIoUringCompletionResult(completionResultCode, flags, completionAuxiliaryData);
+
+                SocketAsyncContext context = operation.AssociatedContext;
+                bool isPersistentMultishotRecv =
+                    context.IsPersistentMultishotRecvArmed() &&
+                    context.PersistentMultishotRecvUserData == userData;
+
+                // Transitional multishot model cancels after the first shot.
+                // Persistent multishot receive remains armed and rebinds future operations via TryReplace.
+                if (!isPersistentMultishotRecv)
+                {
+                    _engine.TryRequestIoUringCancellation(userData);
+                }
+
+                switch (completionDispatchResult)
+                {
+                    case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed:
+                        DispatchCompletedIoUringOperation(operation, userData);
+                        break;
+
+                    case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Pending:
+                        // Transitional multishot mode does not requeue intermediate shots.
+                        // Cancellation is already requested above; terminal CQE cleanup path
+                        // remains responsible for tracked-state/resource release.
+                        break;
+
+                    case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Canceled:
+                    case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Ignored:
+                        break;
+
+                    default:
+                        Debug.Fail($"Unexpected io_uring multishot completion result: {completionDispatchResult}");
+                        break;
+                }
+            }
+
+            /// <summary>
+            /// Handles transitional multishot-accept CQEs by completing one waiting operation and
+            /// canceling the multishot request. Extra successful shots are queued for dequeue on
+            /// the accept operation queue when possible.
+            /// </summary>
+            [MethodImpl(MethodImplOptions.NoInlining)]
+            private void DispatchMultishotAcceptIoUringCompletion(
+                SocketAsyncContext.AcceptOperation operation,
+                ulong userData,
+                int result,
+                uint flags,
+                int socketAddressLen,
+                uint auxiliaryData)
+            {
+                Debug.Assert(_engine.IsCurrentThreadEventLoopThread(),
+                    "DispatchMultishotAcceptIoUringCompletion must only run on the event-loop thread.");
+                operation.SetIoUringCompletionMessageMetadata(socketAddressLen, 0);
+                SocketAsyncContext context = operation.AssociatedContext;
+                SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionDispatchResult =
+                    operation.ProcessIoUringCompletionResult(result, flags, auxiliaryData);
+
+                // Transitional multishot-accept model: complete one managed accept and then
+                // issue async-cancel so terminal cleanup runs through single-shot dispatch.
+                _engine.TryRequestIoUringCancellation(userData);
+
+                switch (completionDispatchResult)
+                {
+                    case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed:
+                        DispatchCompletedIoUringOperation(operation, userData);
+                        break;
+
+                    case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Pending:
+                        break;
+
+                    case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Canceled:
+                    case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Ignored:
+                        // A non-negative result is the accepted connection's fd; auxiliaryData
+                        // carries the peer address length (clamped to the buffer size).
+                        if (result >= 0)
+                        {
+                            int addressLength = auxiliaryData > (uint)operation.SocketAddress.Length ?
+                                operation.SocketAddress.Length :
+                                (int)auxiliaryData;
+                            if (context.TryEnqueuePreAcceptedConnection((IntPtr)result, operation.SocketAddress.Span, addressLength))
+                            {
+                                _engine.EnqueueReadinessFallbackEvent(context, Interop.Sys.SocketEvents.Read);
+                            }
+                            else
+                            {
+                                // Could not queue the pre-accepted connection: close the fd to avoid a leak.
+                                Interop.Sys.Close((IntPtr)result);
+                            }
+                        }
+                        break;
+
+                    default:
+                        Debug.Fail($"Unexpected io_uring multishot accept completion result: {completionDispatchResult}");
+                        break;
+                }
+            }
+
+            /// <summary>
+            /// For receive completions that used provided buffers (buffer-select or fixed receive),
+            /// materializes payload bytes into the operation target and recycles checked-out buffers.
+            /// </summary>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private unsafe bool TryMaterializeIoUringReceiveCompletion(
+                SocketAsyncContext.AsyncOperation operation,
+                int result,
+                uint flags,
+                bool hasFixedRecvBuffer,
+                ushort fixedRecvBufferId,
+                ref uint auxiliaryData)
+            {
+                bool hasSelectedBuffer = (flags & IoUringConstants.CqeFBuffer) != 0;
+                if (!hasFixedRecvBuffer && !hasSelectedBuffer)
+                {
+                    // Nothing to materialize: the operation received into its own buffer.
+                    return true;
+                }
+
+                IoUringProvidedBufferRing? providedBufferRing = _engine._ioUringProvidedBufferRing;
+                if (providedBufferRing is null)
+                {
+                    return false;
+                }
+
+                ushort bufferId;
+                bool reportRecycleFailureAsDepletion;
+                byte* providedBuffer = null;
+                int providedBufferLength = 0;
+                if (hasFixedRecvBuffer)
+                {
+                    bufferId = fixedRecvBufferId;
+                    reportRecycleFailureAsDepletion = true;
+
+                    if (result > 0 &&
+                        !providedBufferRing.TryGetCheckedOutBuffer(
+                            bufferId,
+                            out providedBuffer,
+                            out providedBufferLength))
+                    {
+                        SocketsTelemetry.Log.IoUringProvidedBufferDepletion();
+                        return false;
+                    }
+                }
+                else
+                {
+                    // Buffer-select: the selected buffer id travels in the upper CQE flag bits.
+                    bufferId = (ushort)(flags >> IoUringConstants.CqeBufferShift);
+                    reportRecycleFailureAsDepletion = false;
+                    if (!providedBufferRing.TryAcquireBufferForCompletion(
+                        bufferId,
+                        out providedBuffer,
+                        out providedBufferLength))
+                    {
+                        SocketsTelemetry.Log.IoUringProvidedBufferDepletion();
+                        return false;
+                    }
+                }
+
+                bool handled = result <= 0;
+                try
+                {
+                    if (result > 0)
+                    {
+                        handled =
+                            operation.TryProcessIoUringProvidedBufferCompletion(
+                                providedBuffer,
+                                providedBufferLength,
+                                result,
+                                ref auxiliaryData);
+                    }
+
+                    RecordProvidedBufferUtilizationIfEnabled(providedBufferRing, result);
+                }
+                finally
+                {
+                    // Recycle even when processing threw; a failed recycle demotes "handled".
+                    handled &= TryRecycleProvidedBufferFromCheckedOutState(
+                        providedBufferRing,
+                        bufferId,
+                        reportFailureAsDepletion: reportRecycleFailureAsDepletion);
+                }
+
+                return handled;
+            }
+
+            /// <summary>
+            /// For persistent multishot recv, buffers payload bytes that arrive while no
+            /// managed receive operation is in the Waiting state.
+            /// </summary>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private unsafe bool TryBufferEarlyPersistentMultishotRecvCompletion(
+                SocketAsyncContext context,
+                int result,
+                uint flags)
+            {
+                if (result <= 0)
+                {
+                    return true;
+                }
+
+                if ((flags & IoUringConstants.CqeFBuffer) == 0)
+                {
+                    return false;
+                }
+
+                IoUringProvidedBufferRing? providedBufferRing = _engine._ioUringProvidedBufferRing;
+                if (providedBufferRing is null)
+                {
+                    return false;
+                }
+
+                ushort bufferId = (ushort)(flags >> IoUringConstants.CqeBufferShift);
+                if (!providedBufferRing.TryAcquireBufferForCompletion(
+                    bufferId,
+                    out byte* providedBuffer,
+                    out int providedBufferLength))
+                {
+                    SocketsTelemetry.Log.IoUringProvidedBufferDepletion();
+                    return false;
+                }
+
+                bool buffered = false;
+                try
+                {
+                    if ((uint)result <= (uint)providedBufferLength)
+                    {
+                        buffered = context.TryBufferEarlyPersistentMultishotRecvData(
+                            new ReadOnlySpan<byte>(providedBuffer, result)); /* NOTE(review): element type reconstructed from garbled extraction — verify */
+                        if (buffered)
+                        {
+                            RecordProvidedBufferUtilizationIfEnabled(providedBufferRing, result);
+                            SocketsTelemetry.Log.IoUringPersistentMultishotRecvEarlyData();
+                        }
+                    }
+                }
+                finally
+                {
+                    buffered &= TryRecycleProvidedBufferFromCheckedOutState(
+                        providedBufferRing,
+                        bufferId,
+                        reportFailureAsDepletion: false);
+                }
+
+                return buffered;
+            }
+
+            /// <summary>
+            /// Recycles a provided-buffer selection for completions that can no longer be
+            /// dispatched to a tracked operation (e.g., late multishot CQEs after cancel).
+            /// </summary>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private unsafe void RecycleUntrackedReceiveCompletionBuffers(
+                uint flags,
+                bool hasFixedRecvBuffer,
+                ushort fixedRecvBufferId)
+            {
+                IoUringProvidedBufferRing? providedBufferRing = _engine._ioUringProvidedBufferRing;
+                if (providedBufferRing is null)
+                {
+                    return;
+                }
+
+                if ((flags & IoUringConstants.CqeFBuffer) == 0)
+                {
+                    if (hasFixedRecvBuffer)
+                    {
+                        _ = TryRecycleProvidedBufferFromCheckedOutState(
+                            providedBufferRing,
+                            fixedRecvBufferId,
+                            reportFailureAsDepletion: true);
+                    }
+
+                    return;
+                }
+
+                ushort bufferId = (ushort)(flags >> IoUringConstants.CqeBufferShift);
+                if (!providedBufferRing.TryAcquireBufferForCompletion(
+                    bufferId,
+                    out _,
+                    out _))
+                {
+                    SocketsTelemetry.Log.IoUringProvidedBufferDepletion();
+                }
+                else
+                {
+                    _ = TryRecycleProvidedBufferFromCheckedOutState(
+                        providedBufferRing,
+                        bufferId,
+                        reportFailureAsDepletion: false);
+                }
+
+                if (hasFixedRecvBuffer)
+                {
+                    _ = TryRecycleProvidedBufferFromCheckedOutState(
+                        providedBufferRing,
+                        fixedRecvBufferId,
+                        reportFailureAsDepletion: true);
+                }
+            }
+
+            /// <summary>Feeds adaptive buffer sizing when enabled; event-loop thread only.</summary>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private void RecordProvidedBufferUtilizationIfEnabled(
+                IoUringProvidedBufferRing providedBufferRing,
+                int bytesTransferred)
+            {
+                if (bytesTransferred <= 0 || !_engine._adaptiveBufferSizingEnabled)
+                {
+                    return;
+                }
+
+                Debug.Assert(_engine.IsCurrentThreadEventLoopThread(),
+                    "Adaptive provided-buffer utilization tracking must run on the event-loop thread.");
+                providedBufferRing.RecordCompletionUtilization(bytesTransferred);
+            }
+
+            /// <summary>Returns a checked-out provided buffer to the ring, recording telemetry.</summary>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private static bool TryRecycleProvidedBufferFromCheckedOutState(
+                IoUringProvidedBufferRing providedBufferRing,
+                ushort bufferId,
+                bool reportFailureAsDepletion)
+            {
+                bool recycled = providedBufferRing.TryRecycleBufferFromCompletion(bufferId);
+                if (recycled)
+                {
+                    SocketsTelemetry.Log.IoUringProvidedBufferRecycle();
+                }
+                else if (reportFailureAsDepletion)
+                {
+                    SocketsTelemetry.Log.IoUringProvidedBufferDepletion();
+                }
+
+                return recycled;
+            }
+
+            /// <summary>Requeues a pending operation or falls back to readiness notification.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool DispatchPendingIoUringOperation(SocketAsyncContext.AsyncOperation operation, ulong userData) + { + PendingIoUringReprepareResult inlineReprepareResult = TryDispatchPendingIoUringOperationInline(operation); + if (inlineReprepareResult == PendingIoUringReprepareResult.Prepared) + { + return false; + } + + if (inlineReprepareResult == PendingIoUringReprepareResult.NotAttempted && + operation.TryQueueIoUringPreparation()) + { + SocketAsyncEngine.RecordIoUringPendingRetryQueuedToPrepareQueue(); + return false; + } + + Debug.Assert( + inlineReprepareResult == PendingIoUringReprepareResult.Failed || + !_engine._ioUringCapabilities.IsCompletionMode, + "Requeue should not fail in pure io_uring completion mode when inline re-prepare was not attempted."); + + _engine.RecordIoUringCompletionRequeueFailure(userData); + operation.ClearIoUringUserData(); + Interop.Sys.SocketEvents fallbackEvents = operation.GetIoUringFallbackSocketEvents(); + if (fallbackEvents == Interop.Sys.SocketEvents.None) + { + return false; + } + + if (NetEventSource.Log.IsEnabled()) + { + LogUnexpectedCompletionFallback(_engine, fallbackEvents, userData); + } + _eventQueue.Enqueue(new SocketIOEvent(operation.AssociatedContext, fallbackEvents)); + return true; + + [MethodImpl(MethodImplOptions.NoInlining)] + static void LogUnexpectedCompletionFallback(SocketAsyncEngine engine, Interop.Sys.SocketEvents events, ulong completionUserData) + { + NetEventSource.Error( + engine, + $"io_uring completion fallback to readiness notification in unexpected path: events={events}, user_data=0x{completionUserData:x}"); + } + } + + /// + /// Attempts to re-prepare and re-track a pending operation inline on the event loop thread. + /// This avoids an extra prepare-queue round-trip for completion-mode retries. 
+ /// + private enum PendingIoUringReprepareResult : byte + { + NotAttempted = 0, + Prepared = 1, + Failed = 2 + } + + /// + /// Attempts to re-prepare a pending operation inline. + /// Returns whether inline re-prepare was prepared, skipped, or failed without producing an SQE. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private PendingIoUringReprepareResult TryDispatchPendingIoUringOperationInline(SocketAsyncContext.AsyncOperation operation) + { + if (!_engine._ioUringCapabilities.IsCompletionMode || !_engine.IsCurrentThreadEventLoopThread()) + { + return PendingIoUringReprepareResult.NotAttempted; + } + + long prepareSequence = operation.MarkReadyForIoUringPreparation(); + Interop.Error prepareError = _engine.TryPrepareAndTrackIoUringOperation( + operation, + prepareSequence, + out bool preparedSqe); + if (prepareError != Interop.Error.SUCCESS) + { + ThrowInternalException(prepareError); + return PendingIoUringReprepareResult.Failed; + } + + return preparedSqe ? PendingIoUringReprepareResult.Prepared : PendingIoUringReprepareResult.Failed; + } + + /// Routes a CQE completion result to the appropriate dispatch behavior. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void DispatchIoUringCompletionResult( + SocketAsyncContext.AsyncOperation operation, + SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionResult, + ulong userData, + ref bool enqueuedFallbackEvent) + { + switch (completionResult) + { + case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed: + AssertIoUringLifecycleTransition( + IoUringOperationLifecycleState.Submitted, + IoUringOperationLifecycleState.Completed); + operation.ClearIoUringUserData(); + DispatchCompletedIoUringOperation(operation, userData); + break; + + case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Pending: + AssertIoUringLifecycleTransition( + IoUringOperationLifecycleState.Submitted, + IoUringOperationLifecycleState.Queued); + if (operation.ShouldReuseIoUringPreparationResourcesOnPending) + { + operation.MarkIoUringPreparationReusable(); + operation.ResetIoUringUserDataForRequeue(); + } + else + { + operation.ClearIoUringUserData(); + } + + enqueuedFallbackEvent |= DispatchPendingIoUringOperation(operation, userData); + break; + + case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Canceled: + case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Ignored: + AssertIoUringLifecycleTransition( + IoUringOperationLifecycleState.Submitted, + IoUringOperationLifecycleState.Canceled); + operation.ClearIoUringUserData(); + _engine.RecordBenignLateIoUringCompletion(userData); + break; + + default: + Debug.Fail($"Unexpected io_uring completion result: {completionResult}"); + AssertIoUringLifecycleTransition( + IoUringOperationLifecycleState.Submitted, + IoUringOperationLifecycleState.Detached); + operation.ClearIoUringUserData(); + _engine.RecordBenignLateIoUringCompletion(userData); + break; + } + } + } + } +} diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs 
b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs index ae9b6c9095e43f..0a59231138c169 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs @@ -4,20 +4,22 @@ using System.Collections.Concurrent; using System.Collections.Generic; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Threading; namespace System.Net.Sockets { - internal sealed unsafe class SocketAsyncEngine : IThreadPoolWorkItem + internal sealed unsafe partial class SocketAsyncEngine : IThreadPoolWorkItem { - private const int EventBufferCount = + private const int DefaultEventBufferCount = #if DEBUG 32; #else 1024; #endif + private static readonly int s_eventBufferCount = GetEventBufferCount(); // Socket continuations are dispatched to the ThreadPool from the event thread. // This avoids continuations blocking the event handling. @@ -25,9 +27,31 @@ internal sealed unsafe class SocketAsyncEngine : IThreadPoolWorkItem // PreferInlineCompletions defaults to false and can be set to true using the DOTNET_SYSTEM_NET_SOCKETS_INLINE_COMPLETIONS envvar. internal static readonly bool InlineSocketCompletionsEnabled = Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_INLINE_COMPLETIONS") == "1"; + private static int GetEventBufferCount() + { +#if DEBUG + // Test-only knob to make wait-buffer saturation deterministic for io_uring diagnostics coverage. + // Only available in DEBUG builds so production code never reads test env vars. + if (OperatingSystem.IsLinux()) + { + string? 
configuredValue = Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_EVENT_BUFFER_COUNT"); + if (configuredValue is not null && + int.TryParse(configuredValue, out int parsedValue) && + parsedValue >= 1 && + parsedValue <= DefaultEventBufferCount) + { + return parsedValue; + } + } +#endif + + return DefaultEventBufferCount; + } + private static int GetEngineCount() { // The responsibility of SocketAsyncEngine is to get notifications from epoll|kqueue + // (or io_uring on Linux when enabled in the native shim) // and schedule corresponding work items to ThreadPool (socket reads and writes). // // Using TechEmpower benchmarks that generate a LOT of SMALL socket reads and writes under a VERY HIGH load @@ -85,6 +109,7 @@ private static SocketAsyncEngine[] CreateEngines() private readonly IntPtr _port; private readonly Interop.Sys.SocketEvent* _buffer; + private int _eventLoopManagedThreadId; // // Queue of events generated by EventLoop() that would be processed by the thread pool @@ -143,8 +168,20 @@ private bool TryRegisterCore(IntPtr socketHandle, SocketAsyncContext context, ou context.GlobalContextIndex = index; } - error = Interop.Sys.TryChangeSocketEventRegistration(_port, socketHandle, Interop.Sys.SocketEvents.None, - Interop.Sys.SocketEvents.Read | Interop.Sys.SocketEvents.Write, context.GlobalContextIndex); + Interop.Error managedError = default; + bool managedHandled = false; + LinuxTryChangeSocketEventRegistration(socketHandle, Interop.Sys.SocketEvents.None, + Interop.Sys.SocketEvents.Read | Interop.Sys.SocketEvents.Write, + context.GlobalContextIndex, ref managedError, ref managedHandled); + if (managedHandled) + { + error = managedError; + } + else + { + error = Interop.Sys.TryChangeSocketEventRegistration(_port, socketHandle, Interop.Sys.SocketEvents.None, + Interop.Sys.SocketEvents.Read | Interop.Sys.SocketEvents.Write, context.GlobalContextIndex); + } if (error == Interop.Error.SUCCESS) { return true; @@ -182,19 +219,21 @@ 
private SocketAsyncEngine() err = Interop.Sys.CreateSocketEventPort(portPtr); if (err != Interop.Error.SUCCESS) { - throw new InternalException(err); + ThrowInternalException(err); } } fixed (Interop.Sys.SocketEvent** bufferPtr = &_buffer) { - err = Interop.Sys.CreateSocketEventBuffer(EventBufferCount, bufferPtr); + err = Interop.Sys.CreateSocketEventBuffer(s_eventBufferCount, bufferPtr); if (err != Interop.Error.SUCCESS) { - throw new InternalException(err); + ThrowInternalException(err); } } + LinuxDetectAndInitializeIoUring(); + var thread = new Thread(static s => ((SocketAsyncEngine)s!).EventLoop()) { IsBackground = true, @@ -209,32 +248,78 @@ private SocketAsyncEngine() } } + partial void LinuxDetectAndInitializeIoUring(); + partial void LinuxEventLoopBeforeWait(); + partial void LinuxEventLoopTryCompletionWait(SocketEventHandler handler, ref int numEvents, ref int numCompletions, ref Interop.Error err, ref bool waitHandled); + partial void LinuxEventLoopAfterIteration(); + partial void LinuxBeforeFreeNativeResources(ref bool closeSocketEventPort); + partial void LinuxFreeIoUringResources(); + partial void LinuxTryChangeSocketEventRegistration(IntPtr socketHandle, Interop.Sys.SocketEvents currentEvents, Interop.Sys.SocketEvents newEvents, int data, ref Interop.Error error, ref bool handled); + + [DoesNotReturn] + [StackTraceHidden] + private static void ThrowInternalException(Interop.Error error) => + throw new InternalException(error); + + [DoesNotReturn] + [StackTraceHidden] + [MethodImpl(MethodImplOptions.NoInlining)] + private static void FailFastEventLoop(Exception exception) => + Environment.FailFast($"Exception thrown from SocketAsyncEngine event loop: {exception}", exception); + + private void RecordAndAssertEventLoopThreadIdentity() + { + int currentThreadId = Environment.CurrentManagedThreadId; +#if DEBUG + int previousThreadId = Interlocked.CompareExchange(ref _eventLoopManagedThreadId, currentThreadId, 0); + Debug.Assert( + previousThreadId == 0 || 
previousThreadId == currentThreadId, + $"SocketAsyncEngine event loop thread changed: previous={previousThreadId}, current={currentThreadId}"); +#else + Interlocked.CompareExchange(ref _eventLoopManagedThreadId, currentThreadId, 0); +#endif + } + private void EventLoop() { try { + RecordAndAssertEventLoopThreadIdentity(); SocketEventHandler handler = new SocketEventHandler(this); while (true) { - int numEvents = EventBufferCount; - Interop.Error err = Interop.Sys.WaitForSocketEvents(_port, handler.Buffer, &numEvents); + LinuxEventLoopBeforeWait(); + + int numEvents = s_eventBufferCount; + int numCompletions = 0; + Interop.Error err = default; + bool waitHandled = false; + LinuxEventLoopTryCompletionWait(handler, ref numEvents, ref numCompletions, ref err, ref waitHandled); + if (!waitHandled) + { + err = Interop.Sys.WaitForSocketEvents(_port, handler.Buffer, &numEvents); + } + if (err != Interop.Error.SUCCESS) { - throw new InternalException(err); + ThrowInternalException(err); } - // The native shim is responsible for ensuring this condition. - Debug.Assert(numEvents > 0, $"Unexpected numEvents: {numEvents}"); + // io_uring completion-mode wait can return with zero surfaced events/completions + // when woken only to flush managed prepare/cancel queues. 
+ Debug.Assert(waitHandled || numEvents > 0 || numCompletions > 0, $"Unexpected wait result: events={numEvents}, completions={numCompletions}"); - if (handler.HandleSocketEvents(numEvents)) + if (numEvents > 0 && handler.HandleSocketEvents(numEvents)) { EnsureWorkerScheduled(); } + + LinuxEventLoopAfterIteration(); } } catch (Exception e) { - Environment.FailFast("Exception thrown from SocketAsyncEngine event loop: " + e.ToString(), e); + FailFastEventLoop(e); } } @@ -295,11 +380,19 @@ void IThreadPoolWorkItem.Execute() private void FreeNativeResources() { + bool closeSocketEventPort = true; + // Linux io_uring teardown may need to close the port first to ensure native + // ownership is detached before managed operation resources are released. + LinuxBeforeFreeNativeResources(ref closeSocketEventPort); + + LinuxFreeIoUringResources(); + if (_buffer != null) { Interop.Sys.FreeSocketEventBuffer(_buffer); } - if (_port != (IntPtr)(-1)) + + if (closeSocketEventPort && _port != (IntPtr)(-1)) { Interop.Sys.CloseSocketEventPort(_port); } @@ -310,14 +403,16 @@ private void FreeNativeResources() // To avoid this, the event handling logic is delegated to a non-inlined processing method. // See discussion: https://github.com/dotnet/runtime/issues/37064 // SocketEventHandler holds an on-stack cache of SocketAsyncEngine members needed by the handler method. 
- private readonly struct SocketEventHandler + private readonly partial struct SocketEventHandler { public Interop.Sys.SocketEvent* Buffer { get; } private readonly ConcurrentQueue<SocketIOEvent> _eventQueue; + private readonly SocketAsyncEngine _engine; public SocketEventHandler(SocketAsyncEngine engine) { + _engine = engine; Buffer = engine._buffer; _eventQueue = engine._eventQueue; } diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.IoUring.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.IoUring.Linux.cs new file mode 100644 index 00000000000000..38d7ef78334b34 --- /dev/null +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.IoUring.Linux.cs @@ -0,0 +1,12 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +namespace System.Net.Sockets +{ + internal static partial class SocketPal + { + /// Extracts <see cref="IPPacketInformation"/> from a completed io_uring recvmsg message header. + internal static unsafe IPPacketInformation GetIoUringIPPacketInformation(Interop.Sys.MessageHeader* messageHeader, bool isIPv4, bool isIPv6) => + GetIPPacketInformation(messageHeader, isIPv4, isIPv6); + } +} diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs index 1171961a204351..81c19d9b082918 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs @@ -14,6 +14,46 @@ internal sealed partial class SocketsTelemetry : EventSource private const string ConnectActivityName = ActivitySourceName + ".Connect"; private static readonly ActivitySource s_connectActivitySource = new ActivitySource(ActivitySourceName); + internal static class Keywords + { + // Stable operational counters are always published when the source is enabled on Linux. 
+ // Diagnostic counters are opt-in and can evolve without name stability guarantees. + internal const EventKeywords IoUringDiagnostics = (EventKeywords)0x1; + } + + internal static class IoUringCounterNames + { + internal const string PrepareNonPinnableFallbacks = "io-uring-prepare-nonpinnable-fallbacks"; + internal const string SocketEventBufferFull = "io-uring-socket-event-buffer-full"; + internal const string CqOverflow = "io-uring-cq-overflow"; + internal const string PrepareQueueOverflows = "io-uring-prepare-queue-overflows"; + internal const string PrepareQueueOverflowFallbacks = "io-uring-prepare-queue-overflow-fallbacks"; + internal const string CompletionSlotExhaustions = "io-uring-completion-slot-exhaustions"; + internal const string SqPollWakeups = "io-uring-sqpoll-wakeups"; + internal const string SqPollSubmissionsSkipped = "io-uring-sqpoll-submissions-skipped"; + } + + internal static class IoUringDiagnosticCounterNames + { + internal const string AsyncCancelRequestCqes = "io-uring-async-cancel-request-cqes"; + internal const string CompletionRequeueFailures = "io-uring-completion-requeue-failures"; + internal const string PrepareQueueDepth = "io-uring-prepare-queue-depth"; + internal const string CompletionSlotDrainRecoveries = "io-uring-completion-slot-drain-recoveries"; + internal const string ProvidedBufferDepletions = "io-uring-provided-buffer-depletions"; + internal const string ProvidedBufferCurrentSize = "io-uring-provided-buffer-current-size"; + internal const string ProvidedBufferRecycles = "io-uring-provided-buffer-recycles"; + internal const string ProvidedBufferResizes = "io-uring-provided-buffer-resizes"; + internal const string RegisteredBuffersInitialSuccess = "io-uring-registered-buffers-initial-success"; + internal const string RegisteredBuffersInitialFailure = "io-uring-registered-buffers-initial-failure"; + internal const string RegisteredBuffersReregistrationSuccess = "io-uring-registered-buffers-reregistration-success"; + 
internal const string RegisteredBuffersReregistrationFailure = "io-uring-registered-buffers-reregistration-failure"; + internal const string FixedRecvSelected = "io-uring-fixed-recv-selected"; + internal const string FixedRecvFallbacks = "io-uring-fixed-recv-fallbacks"; + internal const string PersistentMultishotRecvReuse = "io-uring-persistent-multishot-recv-reuse"; + internal const string PersistentMultishotRecvTermination = "io-uring-persistent-multishot-recv-termination"; + internal const string PersistentMultishotRecvEarlyData = "io-uring-persistent-multishot-recv-early-data"; + } + public static readonly SocketsTelemetry Log = new SocketsTelemetry(); private PollingCounter? _currentOutgoingConnectAttemptsCounter; @@ -23,6 +63,33 @@ internal sealed partial class SocketsTelemetry : EventSource private PollingCounter? _bytesSentCounter; private PollingCounter? _datagramsReceivedCounter; private PollingCounter? _datagramsSentCounter; + // Keep io_uring counter backing fields always present so EventCounter name contracts remain stable + // across platforms; OnEventCommand only registers these counters on Linux. + private PollingCounter? _ioUringPrepareNonPinnableFallbacksCounter; + private PollingCounter? _ioUringAsyncCancelRequestCqesCounter; + private PollingCounter? _ioUringSocketEventBufferFullCounter; + private PollingCounter? _ioUringCqOverflowCounter; + private PollingCounter? _ioUringCompletionRequeueFailuresCounter; + private PollingCounter? _ioUringPrepareQueueOverflowsCounter; + private PollingCounter? _ioUringPrepareQueueOverflowFallbacksCounter; + private PollingCounter? _ioUringPrepareQueueDepthCounter; + private PollingCounter? _ioUringCompletionSlotExhaustionsCounter; + private PollingCounter? _ioUringCompletionSlotDrainRecoveriesCounter; + private PollingCounter? _ioUringProvidedBufferDepletionsCounter; + private PollingCounter? _ioUringProvidedBufferCurrentSizeCounter; + private PollingCounter? 
_ioUringProvidedBufferRecyclesCounter; + private PollingCounter? _ioUringProvidedBufferResizesCounter; + private PollingCounter? _ioUringRegisteredBuffersInitialSuccessCounter; + private PollingCounter? _ioUringRegisteredBuffersInitialFailureCounter; + private PollingCounter? _ioUringRegisteredBuffersReregistrationSuccessCounter; + private PollingCounter? _ioUringRegisteredBuffersReregistrationFailureCounter; + private PollingCounter? _ioUringFixedRecvSelectedCounter; + private PollingCounter? _ioUringFixedRecvFallbacksCounter; + private PollingCounter? _ioUringSqPollWakeupsCounter; + private PollingCounter? _ioUringSqPollSubmissionsSkippedCounter; + private PollingCounter? _ioUringPersistentMultishotRecvReuseCounter; + private PollingCounter? _ioUringPersistentMultishotRecvTerminationCounter; + private PollingCounter? _ioUringPersistentMultishotRecvEarlyDataCounter; private long _currentOutgoingConnectAttempts; private long _outgoingConnectionsEstablished; @@ -31,6 +98,32 @@ internal sealed partial class SocketsTelemetry : EventSource private long _bytesSent; private long _datagramsReceived; private long _datagramsSent; + // Backing fields stay cross-platform for contract stability; they are only surfaced as counters on Linux. 
+ private long _ioUringPrepareNonPinnableFallbacks; + private long _ioUringAsyncCancelRequestCqes; + private long _ioUringSocketEventBufferFull; + private long _ioUringCqOverflow; + private long _ioUringCompletionRequeueFailures; + private long _ioUringPrepareQueueOverflows; + private long _ioUringPrepareQueueOverflowFallbacks; + private long _ioUringPrepareQueueDepth; + private long _ioUringCompletionSlotExhaustions; + private long _ioUringCompletionSlotDrainRecoveries; + private long _ioUringProvidedBufferDepletions; + private long _ioUringProvidedBufferCurrentSize; + private long _ioUringProvidedBufferRecycles; + private long _ioUringProvidedBufferResizes; + private long _ioUringRegisteredBuffersInitialSuccess; + private long _ioUringRegisteredBuffersInitialFailure; + private long _ioUringRegisteredBuffersReregistrationSuccess; + private long _ioUringRegisteredBuffersReregistrationFailure; + private long _ioUringFixedRecvSelected; + private long _ioUringFixedRecvFallbacks; + private long _ioUringSqPollWakeups; + private long _ioUringSqPollSubmissionsSkipped; + private long _ioUringPersistentMultishotRecvReuse; + private long _ioUringPersistentMultishotRecvTermination; + private long _ioUringPersistentMultishotRecvEarlyData; [Event(1, Level = EventLevel.Informational)] private void ConnectStart(string? address) @@ -80,6 +173,15 @@ private void AcceptFailed(SocketError error, string? exceptionMessage) } } + [Event(7, Level = EventLevel.Informational)] + private void SocketEngineBackendSelected(string backend, int isIoUringPort, int sqPollEnabled) + { + if (IsEnabled(EventLevel.Informational, EventKeywords.All)) + { + WriteEvent(eventId: 7, backend, isIoUringPort, sqPollEnabled); + } + } + [NonEvent] public Activity? 
ConnectStart(SocketAddress address, ProtocolType protocolType, EndPoint endPoint, bool keepActivityCurrent) { @@ -189,6 +291,20 @@ public void AcceptStart(EndPoint address) } } + [NonEvent] + internal void ReportSocketEngineBackendSelected(bool isIoUringPort, bool isCompletionMode, bool sqPollEnabled) + { + if (!IsEnabled(EventLevel.Informational, EventKeywords.All)) + { + return; + } + + SocketEngineBackendSelected( + isCompletionMode ? "io_uring_completion" : "epoll", + isIoUringPort ? 1 : 0, + sqPollEnabled ? 1 : 0); + } + [NonEvent] public void AfterAccept(SocketError error, string? exceptionMessage = null) { @@ -231,6 +347,182 @@ public void DatagramSent() Interlocked.Increment(ref _datagramsSent); } + [NonEvent] + public void IoUringPrepareNonPinnableFallback(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringPrepareNonPinnableFallbacks, count); + } + + [NonEvent] + public void IoUringAsyncCancelRequestCqes(long count) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringAsyncCancelRequestCqes, count); + } + + [NonEvent] + public void IoUringSocketEventBufferFull(long count) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringSocketEventBufferFull, count); + } + + [NonEvent] + public void IoUringCqOverflow(long count) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringCqOverflow, count); + } + + [NonEvent] + public void IoUringCompletionRequeueFailure(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringCompletionRequeueFailures, count); + } + + [NonEvent] + public void IoUringPrepareQueueOverflow(long count) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringPrepareQueueOverflows, count); + } + + [NonEvent] + public void IoUringPrepareQueueOverflowFallback(long count) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringPrepareQueueOverflowFallbacks, count); + } + + [NonEvent] + public void IoUringPrepareQueueDepthDelta(long delta) + { + long value = 
Interlocked.Add(ref _ioUringPrepareQueueDepth, delta); + Debug.Assert(value >= 0, $"io_uring prepare queue depth cannot be negative: {value}"); + } + + [NonEvent] + public void IoUringCompletionSlotExhaustion(long count) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringCompletionSlotExhaustions, count); + } + + [NonEvent] + public void IoUringCompletionSlotDrainRecovery(long count) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringCompletionSlotDrainRecoveries, count); + } + + [NonEvent] + public void IoUringProvidedBufferDepletion(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringProvidedBufferDepletions, count); + } + + [NonEvent] + public void IoUringProvidedBufferCurrentSize(int size) + { + Debug.Assert(size >= 0); + Volatile.Write(ref _ioUringProvidedBufferCurrentSize, size); + } + + [NonEvent] + public void IoUringProvidedBufferRecycle(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringProvidedBufferRecycles, count); + } + + [NonEvent] + public void IoUringProvidedBufferResize(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringProvidedBufferResizes, count); + } + + [NonEvent] + public void IoUringRegisteredBuffersResult(bool success, int bufferCount, int bufferSize) + { + Debug.Assert(bufferCount >= 0); + Debug.Assert(bufferSize >= 0); + + if (success) + { + Interlocked.Increment(ref _ioUringRegisteredBuffersInitialSuccess); + } + else + { + Interlocked.Increment(ref _ioUringRegisteredBuffersInitialFailure); + } + } + + [NonEvent] + public void IoUringRegisteredBuffersReregistration(bool success) + { + if (success) + { + Interlocked.Increment(ref _ioUringRegisteredBuffersReregistrationSuccess); + } + else + { + Interlocked.Increment(ref _ioUringRegisteredBuffersReregistrationFailure); + } + } + + [NonEvent] + public void IoUringFixedRecvSelected(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringFixedRecvSelected, count); + } + 
+ [NonEvent] + public void IoUringFixedRecvFallback(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringFixedRecvFallbacks, count); + } + + [NonEvent] + public void IoUringSqPollWakeup(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringSqPollWakeups, count); + } + + [NonEvent] + public void IoUringSqPollSubmissionSkipped(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringSqPollSubmissionsSkipped, count); + } + + [NonEvent] + public void IoUringPersistentMultishotRecvReuse(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringPersistentMultishotRecvReuse, count); + } + + [NonEvent] + public void IoUringPersistentMultishotRecvTermination(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringPersistentMultishotRecvTermination, count); + } + + [NonEvent] + public void IoUringPersistentMultishotRecvEarlyData(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringPersistentMultishotRecvEarlyData, count); + } + private static string GetErrorType(SocketError socketError) => socketError switch { // Common connect() errors expected to be seen: @@ -291,6 +583,118 @@ protected override void OnEventCommand(EventCommandEventArgs command) { DisplayName = "Datagrams Sent", }; + + if (!OperatingSystem.IsLinux()) + { + return; + } + + _ioUringPrepareNonPinnableFallbacksCounter ??= new PollingCounter(IoUringCounterNames.PrepareNonPinnableFallbacks, this, () => Interlocked.Read(ref _ioUringPrepareNonPinnableFallbacks)) + { + DisplayName = "io_uring Prepare Non-Pinnable Fallbacks", + }; + _ioUringSocketEventBufferFullCounter ??= new PollingCounter(IoUringCounterNames.SocketEventBufferFull, this, () => Interlocked.Read(ref _ioUringSocketEventBufferFull)) + { + DisplayName = "io_uring Socket Event Buffer Full", + }; + _ioUringCqOverflowCounter ??= new PollingCounter(IoUringCounterNames.CqOverflow, this, () => Interlocked.Read(ref 
_ioUringCqOverflow)) + { + DisplayName = "io_uring Completion Queue Overflow", + }; + _ioUringPrepareQueueOverflowsCounter ??= new PollingCounter(IoUringCounterNames.PrepareQueueOverflows, this, () => Interlocked.Read(ref _ioUringPrepareQueueOverflows)) + { + DisplayName = "io_uring Prepare Queue Overflows", + }; + _ioUringPrepareQueueOverflowFallbacksCounter ??= new PollingCounter(IoUringCounterNames.PrepareQueueOverflowFallbacks, this, () => Interlocked.Read(ref _ioUringPrepareQueueOverflowFallbacks)) + { + DisplayName = "io_uring Prepare Queue Overflow Fallbacks", + }; + _ioUringCompletionSlotExhaustionsCounter ??= new PollingCounter(IoUringCounterNames.CompletionSlotExhaustions, this, () => Interlocked.Read(ref _ioUringCompletionSlotExhaustions)) + { + DisplayName = "io_uring Completion Slot Exhaustions", + }; + _ioUringSqPollWakeupsCounter ??= new PollingCounter(IoUringCounterNames.SqPollWakeups, this, () => Interlocked.Read(ref _ioUringSqPollWakeups)) + { + DisplayName = "io_uring SQPOLL Wakeups", + }; + _ioUringSqPollSubmissionsSkippedCounter ??= new PollingCounter(IoUringCounterNames.SqPollSubmissionsSkipped, this, () => Interlocked.Read(ref _ioUringSqPollSubmissionsSkipped)) + { + DisplayName = "io_uring SQPOLL Submissions Skipped", + }; + + if (!IsEnabled(EventLevel.LogAlways, Keywords.IoUringDiagnostics)) + { + return; + } + + _ioUringAsyncCancelRequestCqesCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.AsyncCancelRequestCqes, this, () => Interlocked.Read(ref _ioUringAsyncCancelRequestCqes)) + { + DisplayName = "io_uring Async-Cancel Request CQEs", + }; + _ioUringCompletionRequeueFailuresCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.CompletionRequeueFailures, this, () => Interlocked.Read(ref _ioUringCompletionRequeueFailures)) + { + DisplayName = "io_uring Completion Requeue Failures", + }; + _ioUringPrepareQueueDepthCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.PrepareQueueDepth, this, () => 
Interlocked.Read(ref _ioUringPrepareQueueDepth)) + { + DisplayName = "io_uring Prepare Queue Depth", + }; + _ioUringCompletionSlotDrainRecoveriesCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.CompletionSlotDrainRecoveries, this, () => Interlocked.Read(ref _ioUringCompletionSlotDrainRecoveries)) + { + DisplayName = "io_uring Completion Slot Drain Recoveries", + }; + _ioUringProvidedBufferDepletionsCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.ProvidedBufferDepletions, this, () => Interlocked.Read(ref _ioUringProvidedBufferDepletions)) + { + DisplayName = "io_uring Provided Buffer Depletions", + }; + _ioUringProvidedBufferCurrentSizeCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.ProvidedBufferCurrentSize, this, () => Volatile.Read(ref _ioUringProvidedBufferCurrentSize)) + { + DisplayName = "io_uring Provided Buffer Current Size", + }; + _ioUringProvidedBufferRecyclesCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.ProvidedBufferRecycles, this, () => Interlocked.Read(ref _ioUringProvidedBufferRecycles)) + { + DisplayName = "io_uring Provided Buffer Recycles", + }; + _ioUringProvidedBufferResizesCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.ProvidedBufferResizes, this, () => Interlocked.Read(ref _ioUringProvidedBufferResizes)) + { + DisplayName = "io_uring Provided Buffer Resizes", + }; + _ioUringRegisteredBuffersInitialSuccessCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.RegisteredBuffersInitialSuccess, this, () => Interlocked.Read(ref _ioUringRegisteredBuffersInitialSuccess)) + { + DisplayName = "io_uring Registered Buffers Initial Success", + }; + _ioUringRegisteredBuffersInitialFailureCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.RegisteredBuffersInitialFailure, this, () => Interlocked.Read(ref _ioUringRegisteredBuffersInitialFailure)) + { + DisplayName = "io_uring Registered Buffers Initial Failure", + }; + _ioUringRegisteredBuffersReregistrationSuccessCounter ??= 
new PollingCounter(IoUringDiagnosticCounterNames.RegisteredBuffersReregistrationSuccess, this, () => Interlocked.Read(ref _ioUringRegisteredBuffersReregistrationSuccess)) + { + DisplayName = "io_uring Registered Buffers Re-Registration Success", + }; + _ioUringRegisteredBuffersReregistrationFailureCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.RegisteredBuffersReregistrationFailure, this, () => Interlocked.Read(ref _ioUringRegisteredBuffersReregistrationFailure)) + { + DisplayName = "io_uring Registered Buffers Re-Registration Failure", + }; + _ioUringFixedRecvSelectedCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.FixedRecvSelected, this, () => Interlocked.Read(ref _ioUringFixedRecvSelected)) + { + DisplayName = "io_uring Fixed Recv Selected", + }; + _ioUringFixedRecvFallbacksCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.FixedRecvFallbacks, this, () => Interlocked.Read(ref _ioUringFixedRecvFallbacks)) + { + DisplayName = "io_uring Fixed Recv Fallbacks", + }; + _ioUringPersistentMultishotRecvReuseCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.PersistentMultishotRecvReuse, this, () => Interlocked.Read(ref _ioUringPersistentMultishotRecvReuse)) + { + DisplayName = "io_uring Persistent Multishot Recv Reuse", + }; + _ioUringPersistentMultishotRecvTerminationCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.PersistentMultishotRecvTermination, this, () => Interlocked.Read(ref _ioUringPersistentMultishotRecvTermination)) + { + DisplayName = "io_uring Persistent Multishot Recv Terminations", + }; + _ioUringPersistentMultishotRecvEarlyDataCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.PersistentMultishotRecvEarlyData, this, () => Interlocked.Read(ref _ioUringPersistentMultishotRecvEarlyData)) + { + DisplayName = "io_uring Persistent Multishot Recv Early Data", + }; } } } diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs 
b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs new file mode 100644 index 00000000000000..c058336c247162 --- /dev/null +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs @@ -0,0 +1,6366 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Buffers; +using System.Collections.Generic; +using System.Net; +using System.Reflection; +using System.Reflection.Emit; +using System.Runtime.InteropServices; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.DotNet.RemoteExecutor; +using Xunit; + +namespace System.Net.Sockets.Tests +{ + public class IoUring + { + private static class IoUringEnvironmentVariables + { + public const string Enabled = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING"; + public const string ProvidedBufferSize = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PROVIDED_BUFFER_SIZE"; + public const string AdaptiveBufferSizing = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ADAPTIVE_BUFFER_SIZING"; + public const string RegisterBuffers = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_REGISTER_BUFFERS"; + public const string SqPoll = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_SQPOLL"; + public const string ZeroCopySend = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ZERO_COPY_SEND"; + public const string DirectSqe = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_DIRECT_SQE"; + public const string ForceEagainOnceMask = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_EAGAIN_ONCE_MASK"; + public const string ForceEcanceledOnceMask = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_ECANCELED_ONCE_MASK"; + public const string TestEventBufferCount = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_EVENT_BUFFER_COUNT"; + public const string PrepareQueueCapacity = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PREPARE_QUEUE_CAPACITY"; + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + 
[PlatformSpecific(TestPlatforms.Linux)] // Uses Linux-only io_uring publication internals. + public static async Task IoUringNonPinnableFallbackPublication_ConcurrentPublishers_EmitSingleDelta() + { + await RemoteExecutor.Invoke(static () => + { + const BindingFlags StaticNonPublic = BindingFlags.Static | BindingFlags.NonPublic; + + Type engineType = typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!; + MethodInfo getDeltaMethod = engineType.GetMethod("GetIoUringNonPinnablePrepareFallbackDelta", StaticNonPublic)!; + FieldInfo publishedCountField = engineType.GetField("s_ioUringPublishedNonPinnablePrepareFallbackCount", StaticNonPublic)!; + FieldInfo publishingGateField = engineType.GetField("s_ioUringPublishingNonPinnablePrepareFallback", StaticNonPublic)!; + + Type contextType = typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncContext", throwOnError: true)!; + FieldInfo fallbackCountField = contextType.GetField("s_ioUringNonPinnablePrepareFallbackCount", StaticNonPublic)!; + + long originalPublished = (long)publishedCountField.GetValue(null)!; + int originalPublishingGate = (int)publishingGateField.GetValue(null)!; + long originalFallback = (long)fallbackCountField.GetValue(null)!; + + try + { + const long firstFallbackCount = 17; + const int publisherCount = 16; + long[] deltas = new long[publisherCount]; + using var start = new ManualResetEventSlim(initialState: false); + var tasks = new Task[publisherCount]; + + publishedCountField.SetValue(null, 0L); + publishingGateField.SetValue(null, 0); + fallbackCountField.SetValue(null, firstFallbackCount); + + for (int i = 0; i < publisherCount; i++) + { + int capturedIndex = i; + tasks[i] = Task.Run(() => + { + start.Wait(); + deltas[capturedIndex] = (long)getDeltaMethod.Invoke(null, null)!; + }); + } + + start.Set(); + Task.WaitAll(tasks); + + long deltaTotal = 0; + int nonZeroCount = 0; + long nonZeroValue = 0; + foreach (long delta in deltas) + { + deltaTotal 
+= delta; + if (delta != 0) + { + nonZeroCount++; + nonZeroValue = delta; + } + } + + Assert.Equal(firstFallbackCount, deltaTotal); + Assert.Equal(1, nonZeroCount); + Assert.Equal(firstFallbackCount, nonZeroValue); + + const long secondFallbackCount = 23; + fallbackCountField.SetValue(null, secondFallbackCount); + Assert.Equal(secondFallbackCount - firstFallbackCount, (long)getDeltaMethod.Invoke(null, null)!); + Assert.Equal(0, (long)getDeltaMethod.Invoke(null, null)!); + } + finally + { + fallbackCountField.SetValue(null, originalFallback); + publishedCountField.SetValue(null, originalPublished); + publishingGateField.SetValue(null, originalPublishingGate); + } + }).DisposeAsync(); + } + + private static RemoteInvokeOptions CreateSocketEngineOptions( + string? ioUringValue = "1", + string? forceEagainOnceMask = null, + string? forceEcanceledOnceMask = null, + int? testEventBufferCount = null, + string? testEventBufferCountRaw = null, + int? prepareQueueCapacity = null, + int? providedBufferSize = null, + bool? adaptiveBufferSizingEnabled = null, + bool? registerBuffersEnabled = null, + bool? sqPollEnabled = null, + bool? directSqeEnabled = null, + bool? zeroCopySendEnabled = null) + { + static void SetOrRemoveEnvironmentVariable(RemoteInvokeOptions options, string name, string? value) + { + if (value is null) + { + options.StartInfo.EnvironmentVariables.Remove(name); + } + else + { + options.StartInfo.EnvironmentVariables[name] = value; + } + } + + static void ValidateSocketEngineOptionCombination(int? configuredEventBufferCount, string? 
configuredEventBufferCountRaw) + { + if (configuredEventBufferCount.HasValue && configuredEventBufferCountRaw is not null) + { + throw new ArgumentException( + "Specify either testEventBufferCount or testEventBufferCountRaw, not both.", + nameof(configuredEventBufferCountRaw)); + } + } + + ValidateSocketEngineOptionCombination(testEventBufferCount, testEventBufferCountRaw); + + RemoteInvokeOptions options = new RemoteInvokeOptions(); + string? configuredEventBufferCount = + testEventBufferCountRaw ?? (testEventBufferCount.HasValue ? testEventBufferCount.Value.ToString() : null); + (string Name, string? Value)[] ioUringEnvironmentAssignments = + { + (IoUringEnvironmentVariables.Enabled, ioUringValue), + (IoUringEnvironmentVariables.ProvidedBufferSize, providedBufferSize?.ToString()), + (IoUringEnvironmentVariables.AdaptiveBufferSizing, adaptiveBufferSizingEnabled.HasValue ? (adaptiveBufferSizingEnabled.Value ? "1" : "0") : null), + (IoUringEnvironmentVariables.RegisterBuffers, registerBuffersEnabled.HasValue ? (registerBuffersEnabled.Value ? "1" : "0") : null), + (IoUringEnvironmentVariables.SqPoll, sqPollEnabled.HasValue ? (sqPollEnabled.Value ? "1" : "0") : null), + (IoUringEnvironmentVariables.DirectSqe, directSqeEnabled.HasValue ? (directSqeEnabled.Value ? "1" : "0") : null), + (IoUringEnvironmentVariables.ZeroCopySend, zeroCopySendEnabled.HasValue ? (zeroCopySendEnabled.Value ? "1" : "0") : null), + (IoUringEnvironmentVariables.ForceEagainOnceMask, string.IsNullOrEmpty(forceEagainOnceMask) ? null : forceEagainOnceMask), + (IoUringEnvironmentVariables.ForceEcanceledOnceMask, string.IsNullOrEmpty(forceEcanceledOnceMask) ? null : forceEcanceledOnceMask), + (IoUringEnvironmentVariables.TestEventBufferCount, configuredEventBufferCount), + (IoUringEnvironmentVariables.PrepareQueueCapacity, prepareQueueCapacity?.ToString()), + }; + + foreach ((string Name, string? 
Value) assignment in ioUringEnvironmentAssignments) + { + SetOrRemoveEnvironmentVariable(options, assignment.Name, assignment.Value); + } + + options.TimeOut = (int)TimeSpan.FromMinutes(10).TotalMilliseconds; + return options; + } + + private static Task ToTask(Task task) => task; + private static Task ToTask(ValueTask task) => task.AsTask(); + + private static async Task AwaitWithTimeoutAsync(Task task, string operationName) + { + Task completed = await Task.WhenAny(task, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.True(ReferenceEquals(task, completed), $"Timed out waiting for {operationName}"); + return await task; + } + + private static void AssertCanceledOrInterrupted(Exception? ex) + { + Assert.NotNull(ex); + Assert.True( + ex is OperationCanceledException || + ex is SocketException socketException && + (socketException.SocketErrorCode == SocketError.OperationAborted || + socketException.SocketErrorCode == SocketError.Interrupted), + $"Unexpected exception: {ex}"); + } + + private static void AssertCanceledDisposedOrInterrupted(Exception? 
ex) + { + if (ex is null) + { + return; + } + + Assert.True( + ex is ObjectDisposedException || + ex is OperationCanceledException || + ex is SocketException socketException && + (socketException.SocketErrorCode == SocketError.OperationAborted || + socketException.SocketErrorCode == SocketError.Interrupted), + $"Unexpected exception: {ex}"); + } + + private readonly struct IoUringNativeDiagnosticsSnapshot + { + public IoUringNativeDiagnosticsSnapshot( + bool hasIoUringPort, + ulong asyncCancelRequestCqeCount, + ulong asyncCancelRequestCqeEnoentCount, + ulong asyncCancelRequestCqeEalreadyCount, + ulong asyncCancelRequestCqeOtherCount, + ulong socketEventBufferFullCount, + ulong unsupportedOpcodePrepareCount, + ulong cqOverflowCount) + { + HasIoUringPort = hasIoUringPort; + AsyncCancelRequestCqeCount = asyncCancelRequestCqeCount; + AsyncCancelRequestCqeEnoentCount = asyncCancelRequestCqeEnoentCount; + AsyncCancelRequestCqeEalreadyCount = asyncCancelRequestCqeEalreadyCount; + AsyncCancelRequestCqeOtherCount = asyncCancelRequestCqeOtherCount; + SocketEventBufferFullCount = socketEventBufferFullCount; + UnsupportedOpcodePrepareCount = unsupportedOpcodePrepareCount; + CqOverflowCount = cqOverflowCount; + } + + public bool HasIoUringPort { get; } + public ulong AsyncCancelRequestCqeCount { get; } + public ulong AsyncCancelRequestCqeEnoentCount { get; } + public ulong AsyncCancelRequestCqeEalreadyCount { get; } + public ulong AsyncCancelRequestCqeOtherCount { get; } + public ulong SocketEventBufferFullCount { get; } + public ulong UnsupportedOpcodePrepareCount { get; } + public ulong CqOverflowCount { get; } + } + + private readonly struct IoUringProvidedBufferSnapshot + { + public IoUringProvidedBufferSnapshot( + bool hasIoUringPort, + bool supportsProvidedBufferRings, + bool hasProvidedBufferRing, + bool hasRegisteredBuffers, + bool adaptiveBufferSizingEnabled, + int availableCount, + int inUseCount, + int totalBufferCount, + int bufferSize, + int recommendedBufferSize, 
+ long recycledCount, + long allocationFailureCount) + { + HasIoUringPort = hasIoUringPort; + SupportsProvidedBufferRings = supportsProvidedBufferRings; + HasProvidedBufferRing = hasProvidedBufferRing; + HasRegisteredBuffers = hasRegisteredBuffers; + AdaptiveBufferSizingEnabled = adaptiveBufferSizingEnabled; + AvailableCount = availableCount; + InUseCount = inUseCount; + TotalBufferCount = totalBufferCount; + BufferSize = bufferSize; + RecommendedBufferSize = recommendedBufferSize; + RecycledCount = recycledCount; + AllocationFailureCount = allocationFailureCount; + } + + public bool HasIoUringPort { get; } + public bool SupportsProvidedBufferRings { get; } + public bool HasProvidedBufferRing { get; } + public bool HasRegisteredBuffers { get; } + public bool AdaptiveBufferSizingEnabled { get; } + public int AvailableCount { get; } + public int InUseCount { get; } + public int TotalBufferCount { get; } + public int BufferSize { get; } + public int RecommendedBufferSize { get; } + public long RecycledCount { get; } + public long AllocationFailureCount { get; } + public bool IsUsable => + HasIoUringPort && + SupportsProvidedBufferRings && + HasProvidedBufferRing && + TotalBufferCount > 0; + public bool IsAdaptiveSizingUsable => IsUsable && AdaptiveBufferSizingEnabled; + } + + private readonly struct IoUringZeroCopySendSnapshot + { + public IoUringZeroCopySendSnapshot( + bool hasIoUringPort, + bool supportsSendZc, + bool supportsSendMsgZc, + bool zeroCopySendEnabled) + { + HasIoUringPort = hasIoUringPort; + SupportsSendZc = supportsSendZc; + SupportsSendMsgZc = supportsSendMsgZc; + ZeroCopySendEnabled = zeroCopySendEnabled; + } + + public bool HasIoUringPort { get; } + public bool SupportsSendZc { get; } + public bool SupportsSendMsgZc { get; } + public bool ZeroCopySendEnabled { get; } + } + + private readonly struct IoUringFixedRecvSnapshot + { + public IoUringFixedRecvSnapshot( + bool hasIoUringPort, + bool supportsReadFixed, + bool hasRegisteredBuffers) + { + 
HasIoUringPort = hasIoUringPort; + SupportsReadFixed = supportsReadFixed; + HasRegisteredBuffers = hasRegisteredBuffers; + } + + public bool HasIoUringPort { get; } + public bool SupportsReadFixed { get; } + public bool HasRegisteredBuffers { get; } + public bool FixedRecvEnabled => SupportsReadFixed && HasRegisteredBuffers; + } + + private readonly struct IoUringSqPollSnapshot + { + public IoUringSqPollSnapshot(bool hasIoUringPort, bool sqPollEnabled) + { + HasIoUringPort = hasIoUringPort; + SqPollEnabled = sqPollEnabled; + } + + public bool HasIoUringPort { get; } + public bool SqPollEnabled { get; } + public bool IsActive => HasIoUringPort && SqPollEnabled; + } + + private readonly struct IoUringZeroCopyPinHoldSnapshot + { + public IoUringZeroCopyPinHoldSnapshot(bool hasIoUringPort, int activePinHolds, int pendingNotificationCount) + { + HasIoUringPort = hasIoUringPort; + ActivePinHolds = activePinHolds; + PendingNotificationCount = pendingNotificationCount; + } + + public bool HasIoUringPort { get; } + public int ActivePinHolds { get; } + public int PendingNotificationCount { get; } + } + + private sealed class NonPinnableMemoryManager : MemoryManager + { + private readonly byte[] _buffer; + + public NonPinnableMemoryManager(byte[] buffer) + { + _buffer = buffer; + } + + public override Span GetSpan() => _buffer; + + public override MemoryHandle Pin(int elementIndex = 0) + { + _ = elementIndex; + throw new NotSupportedException("Non-pinnable test memory."); + } + + public override void Unpin() + { + } + + protected override void Dispose(bool disposing) + { + } + } + + private sealed unsafe class TrackingPinnableMemoryManager : MemoryManager + { + private readonly byte[] _buffer; + private int _pinCount; + private int _unpinCount; + + public TrackingPinnableMemoryManager(byte[] buffer) + { + _buffer = buffer; + } + + public int PinCount => Volatile.Read(ref _pinCount); + public int UnpinCount => Volatile.Read(ref _unpinCount); + + public override Span GetSpan() 
=> _buffer; + + public override MemoryHandle Pin(int elementIndex = 0) + { + if ((uint)elementIndex > (uint)_buffer.Length) + { + throw new ArgumentOutOfRangeException(nameof(elementIndex)); + } + + Interlocked.Increment(ref _pinCount); + GCHandle handle = GCHandle.Alloc(_buffer, GCHandleType.Pinned); + byte* pointer = (byte*)handle.AddrOfPinnedObject() + elementIndex; + return new MemoryHandle(pointer, handle, this); + } + + public override void Unpin() + { + Interlocked.Increment(ref _unpinCount); + } + + protected override void Dispose(bool disposing) + { + } + } + + private static long GetIoUringPrepareNonPinnableFallbackCounterValue() + { + Type telemetryType = typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketsTelemetry", throwOnError: true)!; + FieldInfo logField = telemetryType.GetField("Log", BindingFlags.Public | BindingFlags.Static)!; + object telemetry = logField.GetValue(null)!; + FieldInfo fallbackCounterField = telemetryType.GetField("_ioUringPrepareNonPinnableFallbacks", BindingFlags.NonPublic | BindingFlags.Instance)!; + return Convert.ToInt64(fallbackCounterField.GetValue(telemetry)); + } + + private static long GetIoUringCompletionRequeueFailureCounterValue() + { + Type telemetryType = typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketsTelemetry", throwOnError: true)!; + FieldInfo logField = telemetryType.GetField("Log", BindingFlags.Public | BindingFlags.Static)!; + object telemetry = logField.GetValue(null)!; + FieldInfo counterField = telemetryType.GetField("_ioUringCompletionRequeueFailures", BindingFlags.NonPublic | BindingFlags.Instance)!; + return Convert.ToInt64(counterField.GetValue(telemetry)); + } + + private static Type GetSocketAsyncEngineType() => + typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!; + + private static bool InvokeSocketAsyncEngineBoolMethod(string methodName) + { + Type engineType = GetSocketAsyncEngineType(); + MethodInfo method = 
engineType.GetMethod(methodName, BindingFlags.NonPublic | BindingFlags.Static)!; + return (bool)method.Invoke(null, null)!; + } + + private static int InvokeSocketAsyncEngineIntMethod(string methodName) + { + Type engineType = GetSocketAsyncEngineType(); + MethodInfo method = engineType.GetMethod(methodName, BindingFlags.NonPublic | BindingFlags.Static)!; + return (int)method.Invoke(null, null)!; + } + + private static void AssertBooleanAppContextSwitch( + string switchName, + string methodName, + bool expectedWhenSwitchTrue, + bool expectedWhenSwitchFalse) + { + AppContext.SetSwitch(switchName, true); + Assert.Equal(expectedWhenSwitchTrue, InvokeSocketAsyncEngineBoolMethod(methodName)); + + AppContext.SetSwitch(switchName, false); + Assert.Equal(expectedWhenSwitchFalse, InvokeSocketAsyncEngineBoolMethod(methodName)); + } + + private static ulong GetIoUringTelemetryCounterValue(string fieldName) + { + Type telemetryType = typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketsTelemetry", throwOnError: true)!; + FieldInfo? logField = telemetryType.GetField("Log", BindingFlags.Public | BindingFlags.Static); + if (logField?.GetValue(null) is not object telemetry) + { + return 0; + } + + FieldInfo? counterField = telemetryType.GetField(fieldName, BindingFlags.NonPublic | BindingFlags.Instance); + return counterField?.GetValue(telemetry) is object value ? 
Convert.ToUInt64(value) : 0UL; + } + + private static readonly OpCode[] s_singleByteOpCodes = BuildSingleByteOpCodeTable(); + private static readonly OpCode[] s_multiByteOpCodes = BuildMultiByteOpCodeTable(); + + private static OpCode[] BuildSingleByteOpCodeTable() + { + OpCode[] table = new OpCode[256]; + foreach (FieldInfo field in typeof(OpCodes).GetFields(BindingFlags.Public | BindingFlags.Static)) + { + if (field.GetValue(null) is not OpCode opCode) + { + continue; + } + + ushort value = unchecked((ushort)opCode.Value); + if ((value & 0xFF00) == 0) + { + table[value & 0xFF] = opCode; + } + } + + return table; + } + + private static OpCode[] BuildMultiByteOpCodeTable() + { + OpCode[] table = new OpCode[256]; + foreach (FieldInfo field in typeof(OpCodes).GetFields(BindingFlags.Public | BindingFlags.Static)) + { + if (field.GetValue(null) is not OpCode opCode) + { + continue; + } + + ushort value = unchecked((ushort)opCode.Value); + if ((value & 0xFF00) == 0xFE00) + { + table[value & 0xFF] = opCode; + } + } + + return table; + } + + private static int FindCallInstructionOffset(ReadOnlySpan il, int targetMetadataToken) + { + int offset = 0; + while (offset < il.Length) + { + int instructionOffset = offset; + OpCode opCode; + byte first = il[offset++]; + if (first == 0xFE) + { + if (offset >= il.Length) + { + break; + } + + opCode = s_multiByteOpCodes[il[offset++]]; + } + else + { + opCode = s_singleByteOpCodes[first]; + } + + int operandSize = GetIlOperandSize(opCode.OperandType, il, offset); + if (operandSize < 0 || offset + operandSize > il.Length) + { + break; + } + + if ((opCode == OpCodes.Call || opCode == OpCodes.Callvirt) && + opCode.OperandType == OperandType.InlineMethod && + operandSize == 4) + { + int metadataToken = BitConverter.ToInt32(il.Slice(offset, 4)); + if (metadataToken == targetMetadataToken) + { + return instructionOffset; + } + } + + offset += operandSize; + } + + return -1; + } + + private static int GetIlOperandSize(OperandType 
operandType, ReadOnlySpan il, int operandOffset) + { + return operandType switch + { + OperandType.InlineNone => 0, + OperandType.ShortInlineBrTarget => 1, + OperandType.ShortInlineI => 1, + OperandType.ShortInlineVar => 1, + OperandType.InlineVar => 2, + OperandType.InlineI => 4, + OperandType.InlineBrTarget => 4, + OperandType.InlineField => 4, + OperandType.InlineMethod => 4, + OperandType.InlineSig => 4, + OperandType.InlineString => 4, + OperandType.InlineTok => 4, + OperandType.InlineType => 4, + OperandType.ShortInlineR => 4, + OperandType.InlineI8 => 8, + OperandType.InlineR => 8, + OperandType.InlineSwitch => operandOffset + 4 <= il.Length + ? 4 + (BitConverter.ToInt32(il.Slice(operandOffset, 4)) * 4) + : -1, + _ => -1, + }; + } + + private static long GetIoUringPollReadinessCqeCount() + { + Type engineType = typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!; + MethodInfo? countMethod = engineType.GetMethod( + "GetIoUringPollReadinessCqeCount", + BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Static); + if (countMethod is null) + { + return 0; + } + + return Convert.ToInt64(countMethod.Invoke(null, null)); + } + + private static long GetIoUringPendingRetryQueuedToPrepareQueueCount() + { + Type engineType = typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!; + MethodInfo? 
countMethod = engineType.GetMethod( + "GetIoUringPendingRetryQueuedToPrepareQueueCount", + BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Static); + if (countMethod is null) + { + return 0; + } + + return Convert.ToInt64(countMethod.Invoke(null, null)); + } + + private static bool IsIoUringMultishotRecvSupported() + { + Type engineType = typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!; + FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!; + PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo supportsMultishotRecvField = engineType.GetField("_supportsMultishotRecv", BindingFlags.NonPublic | BindingFlags.Instance)!; + + foreach (object? engine in (Array)enginesField.GetValue(null)!) + { + if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)!) + { + continue; + } + + if ((bool)supportsMultishotRecvField.GetValue(engine)!) + { + return true; + } + } + + return false; + } + + private static bool IsIoUringMultishotAcceptSupported() + { + Type engineType = typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!; + FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!; + PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!; + PropertyInfo? supportsMultishotAcceptProperty = engineType.GetProperty("SupportsMultishotAccept", BindingFlags.NonPublic | BindingFlags.Instance); + FieldInfo? supportsMultishotAcceptField = engineType.GetField("_supportsMultishotAccept", BindingFlags.NonPublic | BindingFlags.Instance); + + foreach (object? engine in (Array)enginesField.GetValue(null)!) + { + if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)!) 
+ { + continue; + } + + if (supportsMultishotAcceptProperty is not null && (bool)supportsMultishotAcceptProperty.GetValue(engine)!) + { + return true; + } + + if (supportsMultishotAcceptField is not null && (bool)supportsMultishotAcceptField.GetValue(engine)!) + { + return true; + } + } + + return false; + } + + private static bool TryGetSocketAsyncContextForTest(Socket socket, out object asyncContext) + { + asyncContext = null!; + + try + { + var socketHandle = socket.SafeHandle; + Type safeSocketHandleType = socketHandle.GetType(); + PropertyInfo? asyncContextProperty = safeSocketHandleType.GetProperty("AsyncContext", BindingFlags.NonPublic | BindingFlags.Instance); + if (asyncContextProperty?.GetValue(socketHandle) is not object context) + { + return false; + } + + asyncContext = context; + return true; + } + catch (ObjectDisposedException) + { + return false; + } + } + + private static bool IsListenerMultishotAcceptArmed(Socket listener) + { + if (!TryGetSocketAsyncContextForTest(listener, out object asyncContext)) + { + return false; + } + + Type asyncContextType = asyncContext.GetType(); + PropertyInfo? armedProperty = asyncContextType.GetProperty( + "IsMultishotAcceptArmed", + BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance); + if (armedProperty is not null) + { + object? value = armedProperty.GetValue(asyncContext); + return value is bool armed && armed; + } + + FieldInfo? armedField = asyncContextType.GetField("_multishotAcceptArmed", BindingFlags.NonPublic | BindingFlags.Instance); + return armedField?.GetValue(asyncContext) is int armedState && armedState != 0; + } + + private static int GetListenerMultishotAcceptQueueCount(Socket listener) + { + if (!TryGetSocketAsyncContextForTest(listener, out object asyncContext)) + { + return 0; + } + + FieldInfo? queueField = asyncContext.GetType().GetField("_multishotAcceptQueue", BindingFlags.NonPublic | BindingFlags.Instance); + if (queueField is null) + { + return 0; + } + + object? 
queue = queueField.GetValue(asyncContext); + if (queue is null) + { + return 0; + } + + PropertyInfo? countProperty = queue.GetType().GetProperty("Count", BindingFlags.Public | BindingFlags.Instance); + return countProperty?.GetValue(queue) is int count ? count : 0; + } + + private static async Task WaitForMultishotAcceptArmedStateAsync(Socket listener, bool expectedArmed, int timeoutMilliseconds = 5000) + { + DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds); + while (DateTime.UtcNow < deadline) + { + if (IsListenerMultishotAcceptArmed(listener) == expectedArmed) + { + return true; + } + + await Task.Delay(20); + } + + return IsListenerMultishotAcceptArmed(listener) == expectedArmed; + } + + private static bool IsPersistentMultishotRecvArmed(Socket socket) + { + if (!TryGetSocketAsyncContextForTest(socket, out object asyncContext)) + { + return false; + } + + MethodInfo? armedMethod = asyncContext.GetType().GetMethod( + "IsPersistentMultishotRecvArmed", + BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance); + if (armedMethod is null) + { + return false; + } + + return (bool)armedMethod.Invoke(asyncContext, null)!; + } + + private static async Task WaitForPersistentMultishotRecvArmedStateAsync(Socket socket, bool expectedArmed, int timeoutMilliseconds = 5000) + { + DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds); + while (DateTime.UtcNow < deadline) + { + if (IsPersistentMultishotRecvArmed(socket) == expectedArmed) + { + return true; + } + + await Task.Delay(20); + } + + return IsPersistentMultishotRecvArmed(socket) == expectedArmed; + } + + private static async Task WaitForZeroCopyPinHoldSnapshotAsync( + Func predicate, + int timeoutMilliseconds = 5000) + { + DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds); + IoUringZeroCopyPinHoldSnapshot snapshot = GetIoUringZeroCopyPinHoldSnapshot(); + while (DateTime.UtcNow < deadline) + { + if 
(predicate(snapshot)) + { + return snapshot; + } + + await Task.Delay(20); + snapshot = GetIoUringZeroCopyPinHoldSnapshot(); + } + + return snapshot; + } + + private static async Task AssertConnectedPairRoundTripAsync(Socket client, Socket server, byte marker) + { + byte[] payload = new byte[] { marker }; + byte[] receiveBuffer = new byte[1]; + Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None)); + Assert.Equal(1, await server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + Assert.Equal(marker, receiveBuffer[0]); + } + + private static async Task AssertPinsReleasedAsync(TrackingPinnableMemoryManager manager) + { + DateTime start = DateTime.UtcNow; + while (manager.PinCount != manager.UnpinCount) + { + if (DateTime.UtcNow - start > TimeSpan.FromSeconds(10)) + { + break; + } + + await Task.Delay(20); + } + + Assert.True(manager.PinCount > 0, "Expected at least one pin."); + Assert.Equal(manager.PinCount, manager.UnpinCount); + } + + private static IoUringNativeDiagnosticsSnapshot GetIoUringNativeDiagnosticsSnapshot() + { + Assembly socketsAssembly = typeof(Socket).Assembly; + Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!; + Type? interopSysType = socketsAssembly.GetType("Interop+Sys", throwOnError: false); + + FieldInfo? enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static); + PropertyInfo? isIoUringEnabledProp = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance); + FieldInfo? portField = engineType.GetField("_port", BindingFlags.NonPublic | BindingFlags.Instance); + if (enginesField is null || isIoUringEnabledProp is null || portField is null) + { + return new IoUringNativeDiagnosticsSnapshot(false, 0, 0, 0, 0, 0, 0, 0); + } + + MethodInfo? 
tryGetDiagnosticsMethod = interopSysType?.GetMethod( + "TryGetIoUringSocketEventPortDiagnostics", + BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Static); + + Type? diagnosticsType = null; + if (tryGetDiagnosticsMethod is not null) + { + ParameterInfo[] parameters = tryGetDiagnosticsMethod.GetParameters(); + if (parameters.Length >= 2) + { + diagnosticsType = parameters[1].ParameterType.GetElementType(); + } + } + + FieldInfo? asyncCancelRequestCqeCountField = diagnosticsType?.GetField("AsyncCancelRequestCqeCount", BindingFlags.Public | BindingFlags.Instance); + FieldInfo? asyncCancelRequestCqeEnoentCountField = diagnosticsType?.GetField("AsyncCancelRequestCqeEnoentCount", BindingFlags.Public | BindingFlags.Instance); + FieldInfo? asyncCancelRequestCqeEalreadyCountField = diagnosticsType?.GetField("AsyncCancelRequestCqeEalreadyCount", BindingFlags.Public | BindingFlags.Instance); + FieldInfo? asyncCancelRequestCqeOtherCountField = diagnosticsType?.GetField("AsyncCancelRequestCqeOtherCount", BindingFlags.Public | BindingFlags.Instance); + FieldInfo? socketEventBufferFullCountField = diagnosticsType?.GetField("SocketEventBufferFullCount", BindingFlags.Public | BindingFlags.Instance); + FieldInfo? unsupportedOpcodePrepareCountField = diagnosticsType?.GetField("UnsupportedOpcodePrepareCount", BindingFlags.Public | BindingFlags.Instance); + FieldInfo? 
cqOverflowCountField = diagnosticsType?.GetField("CqOverflowCount", BindingFlags.Public | BindingFlags.Instance); + + bool hasIoUringPort = false; + ulong asyncCancelRequestCqeCount = 0; + ulong asyncCancelRequestCqeEnoentCount = 0; + ulong asyncCancelRequestCqeEalreadyCount = 0; + ulong asyncCancelRequestCqeOtherCount = 0; + ulong socketEventBufferFullCount = 0; + ulong unsupportedOpcodePrepareCount = 0; + ulong cqOverflowCount = 0; + + bool hasNativeDiagnosticsInterop = + tryGetDiagnosticsMethod is not null && + asyncCancelRequestCqeCountField is not null && + asyncCancelRequestCqeEnoentCountField is not null && + asyncCancelRequestCqeEalreadyCountField is not null && + asyncCancelRequestCqeOtherCountField is not null && + socketEventBufferFullCountField is not null && + unsupportedOpcodePrepareCountField is not null && + cqOverflowCountField is not null; + + if (enginesField.GetValue(null) is not Array engines) + { + return new IoUringNativeDiagnosticsSnapshot(false, 0, 0, 0, 0, 0, 0, 0); + } + + foreach (object? engine in engines) + { + if (engine is null || + isIoUringEnabledProp.GetValue(engine) is not bool isIoUringEnabled || + !isIoUringEnabled) + { + continue; + } + + hasIoUringPort = true; + if (!hasNativeDiagnosticsInterop) + { + continue; + } + + if (portField.GetValue(engine) is not IntPtr port) + { + continue; + } + + object?[] args = new object?[] { port, null }; + object? 
error = tryGetDiagnosticsMethod!.Invoke(null, args); + if (error is null || Convert.ToInt32(error) != 0 || args[1] is null) + { + continue; + } + + object diagnostics = args[1]!; + asyncCancelRequestCqeCount += Convert.ToUInt64(asyncCancelRequestCqeCountField!.GetValue(diagnostics)); + asyncCancelRequestCqeEnoentCount += Convert.ToUInt64(asyncCancelRequestCqeEnoentCountField!.GetValue(diagnostics)); + asyncCancelRequestCqeEalreadyCount += Convert.ToUInt64(asyncCancelRequestCqeEalreadyCountField!.GetValue(diagnostics)); + asyncCancelRequestCqeOtherCount += Convert.ToUInt64(asyncCancelRequestCqeOtherCountField!.GetValue(diagnostics)); + socketEventBufferFullCount += Convert.ToUInt64(socketEventBufferFullCountField!.GetValue(diagnostics)); + unsupportedOpcodePrepareCount += Convert.ToUInt64(unsupportedOpcodePrepareCountField!.GetValue(diagnostics)); + cqOverflowCount += Convert.ToUInt64(cqOverflowCountField!.GetValue(diagnostics)); + } + + if (hasIoUringPort && !hasNativeDiagnosticsInterop) + { + // Native diagnostics interop is not available in the managed io_uring path. + // Fall back to managed telemetry counters for the subset of metrics still exposed. 
+ asyncCancelRequestCqeCount = GetIoUringTelemetryCounterValue("_ioUringAsyncCancelRequestCqes"); + socketEventBufferFullCount = GetIoUringTelemetryCounterValue("_ioUringSocketEventBufferFull"); + cqOverflowCount = GetIoUringTelemetryCounterValue("_ioUringCqOverflow"); + } + + return new IoUringNativeDiagnosticsSnapshot( + hasIoUringPort, + asyncCancelRequestCqeCount, + asyncCancelRequestCqeEnoentCount, + asyncCancelRequestCqeEalreadyCount, + asyncCancelRequestCqeOtherCount, + socketEventBufferFullCount, + unsupportedOpcodePrepareCount, + cqOverflowCount); + } + + private static IoUringProvidedBufferSnapshot GetIoUringProvidedBufferSnapshot() + { + Assembly socketsAssembly = typeof(Socket).Assembly; + Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!; + + FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!; + PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo supportsProvidedBufferRingsField = engineType.GetField("_supportsProvidedBufferRings", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo providedBufferRingField = engineType.GetField("_ioUringProvidedBufferRing", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo registeredBuffersField = engineType.GetField("_ioUringBuffersRegistered", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo adaptiveSizingEnabledField = engineType.GetField("_adaptiveBufferSizingEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!; + + bool hasIoUringPort = false; + bool supportsProvidedBufferRings = false; + bool hasProvidedBufferRing = false; + bool hasRegisteredBuffers = false; + bool adaptiveBufferSizingEnabled = false; + int availableCount = 0; + int inUseCount = 0; + int totalBufferCount = 0; + int bufferSize = 0; + int recommendedBufferSize = 0; + long recycledCount = 0; + long 
allocationFailureCount = 0; + + foreach (object? engine in (Array)enginesField.GetValue(null)!) + { + if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)!) + { + continue; + } + + hasIoUringPort = true; + if (!(bool)supportsProvidedBufferRingsField.GetValue(engine)!) + { + continue; + } + + supportsProvidedBufferRings = true; + if ((bool)adaptiveSizingEnabledField.GetValue(engine)!) + { + adaptiveBufferSizingEnabled = true; + } + if ((bool)registeredBuffersField.GetValue(engine)!) + { + hasRegisteredBuffers = true; + } + object? providedBufferRing = providedBufferRingField.GetValue(engine); + if (providedBufferRing is null) + { + continue; + } + + hasProvidedBufferRing = true; + Type providedBufferRingType = providedBufferRing.GetType(); + PropertyInfo? availableCountProperty = providedBufferRingType.GetProperty("AvailableCount", BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance); + PropertyInfo? inUseCountProperty = providedBufferRingType.GetProperty("InUseCount", BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance); + PropertyInfo? recycledCountProperty = providedBufferRingType.GetProperty("RecycledCount", BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance); + PropertyInfo? allocationFailureCountProperty = providedBufferRingType.GetProperty("AllocationFailureCount", BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance); + PropertyInfo? bufferSizeProperty = providedBufferRingType.GetProperty("BufferSize", BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance); + PropertyInfo? recommendedBufferSizeProperty = providedBufferRingType.GetProperty("RecommendedBufferSize", BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance); + FieldInfo? 
bufferStatesField = providedBufferRingType.GetField("_bufferStates", BindingFlags.NonPublic | BindingFlags.Instance); + + if (availableCountProperty is null || + inUseCountProperty is null || + recycledCountProperty is null || + allocationFailureCountProperty is null || + bufferSizeProperty is null || + recommendedBufferSizeProperty is null || + bufferStatesField is null) + { + continue; + } + + availableCount += Convert.ToInt32(availableCountProperty.GetValue(providedBufferRing)); + inUseCount += Convert.ToInt32(inUseCountProperty.GetValue(providedBufferRing)); + recycledCount += Convert.ToInt64(recycledCountProperty.GetValue(providedBufferRing)); + allocationFailureCount += Convert.ToInt64(allocationFailureCountProperty.GetValue(providedBufferRing)); + bufferSize = Math.Max(bufferSize, Convert.ToInt32(bufferSizeProperty.GetValue(providedBufferRing))); + recommendedBufferSize = Math.Max(recommendedBufferSize, Convert.ToInt32(recommendedBufferSizeProperty.GetValue(providedBufferRing))); + + byte[] bufferStates = (byte[])bufferStatesField.GetValue(providedBufferRing)!; + totalBufferCount += bufferStates.Length; + } + + return new IoUringProvidedBufferSnapshot( + hasIoUringPort, + supportsProvidedBufferRings, + hasProvidedBufferRing, + hasRegisteredBuffers, + adaptiveBufferSizingEnabled, + availableCount, + inUseCount, + totalBufferCount, + bufferSize, + recommendedBufferSize, + recycledCount, + allocationFailureCount); + } + + private static IoUringZeroCopySendSnapshot GetIoUringZeroCopySendSnapshot() + { + Assembly socketsAssembly = typeof(Socket).Assembly; + Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!; + + FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!; + PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo supportsSendZcField = 
engineType.GetField("_supportsOpSendZc", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo supportsSendMsgZcField = engineType.GetField("_supportsOpSendMsgZc", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo zeroCopySendEnabledField = engineType.GetField("_zeroCopySendEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!; + + bool hasIoUringPort = false; + bool supportsSendZc = false; + bool supportsSendMsgZc = false; + bool zeroCopySendEnabled = false; + + foreach (object? engine in (Array)enginesField.GetValue(null)!) + { + if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)!) + { + continue; + } + + hasIoUringPort = true; + if ((bool)supportsSendZcField.GetValue(engine)!) + { + supportsSendZc = true; + } + + if ((bool)supportsSendMsgZcField.GetValue(engine)!) + { + supportsSendMsgZc = true; + } + + if ((bool)zeroCopySendEnabledField.GetValue(engine)!) + { + zeroCopySendEnabled = true; + } + } + + return new IoUringZeroCopySendSnapshot( + hasIoUringPort, + supportsSendZc, + supportsSendMsgZc, + zeroCopySendEnabled); + } + + private static IoUringFixedRecvSnapshot GetIoUringFixedRecvSnapshot() + { + Assembly socketsAssembly = typeof(Socket).Assembly; + Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!; + + FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!; + PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo supportsReadFixedField = engineType.GetField("_supportsOpReadFixed", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo registeredBuffersField = engineType.GetField("_ioUringBuffersRegistered", BindingFlags.NonPublic | BindingFlags.Instance)!; + + bool hasIoUringPort = false; + bool supportsReadFixed = false; + bool hasRegisteredBuffers = false; + + foreach (object? 
engine in (Array)enginesField.GetValue(null)!) + { + if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)!) + { + continue; + } + + hasIoUringPort = true; + supportsReadFixed |= (bool)supportsReadFixedField.GetValue(engine)!; + hasRegisteredBuffers |= (bool)registeredBuffersField.GetValue(engine)!; + } + + return new IoUringFixedRecvSnapshot( + hasIoUringPort, + supportsReadFixed, + hasRegisteredBuffers); + } + + private static IoUringSqPollSnapshot GetIoUringSqPollSnapshot() + { + Assembly socketsAssembly = typeof(Socket).Assembly; + Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!; + + FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!; + PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo sqPollEnabledField = engineType.GetField("_sqPollEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!; + + bool hasIoUringPort = false; + bool sqPollEnabled = false; + foreach (object? engine in (Array)enginesField.GetValue(null)!) + { + if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)!) 
+ { + continue; + } + + hasIoUringPort = true; + sqPollEnabled |= (bool)sqPollEnabledField.GetValue(engine)!; + } + + return new IoUringSqPollSnapshot(hasIoUringPort, sqPollEnabled); + } + + private static bool IsAnyIoUringSqPollEngineNeedingWakeup() + { + Assembly socketsAssembly = typeof(Socket).Assembly; + Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!; + + FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!; + PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo sqPollEnabledField = engineType.GetField("_sqPollEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!; + MethodInfo sqNeedWakeupMethod = engineType.GetMethod("SqNeedWakeup", BindingFlags.NonPublic | BindingFlags.Instance)!; + + foreach (object? engine in (Array)enginesField.GetValue(null)!) + { + if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)! || !(bool)sqPollEnabledField.GetValue(engine)!) 
+ { + continue; + } + + if (sqNeedWakeupMethod.Invoke(engine, null) is bool needsWakeup && needsWakeup) + { + return true; + } + } + + return false; + } + + private static bool ValidateSqNeedWakeupMatchesRawSqFlagBit() + { + Assembly socketsAssembly = typeof(Socket).Assembly; + Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!; + + FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!; + PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo sqPollEnabledField = engineType.GetField("_sqPollEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo sqFlagsPtrField = engineType.GetField("_managedSqFlagsPtr", BindingFlags.NonPublic | BindingFlags.Instance)!; + MethodInfo sqNeedWakeupMethod = engineType.GetMethod("SqNeedWakeup", BindingFlags.NonPublic | BindingFlags.Instance)!; + + foreach (object? engine in (Array)enginesField.GetValue(null)!) + { + if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)! || !(bool)sqPollEnabledField.GetValue(engine)!) + { + continue; + } + + bool methodValue = (bool)sqNeedWakeupMethod.Invoke(engine, null)!; + object? 
pointerBoxed = sqFlagsPtrField.GetValue(engine); + if (pointerBoxed is null) + { + Assert.True(methodValue, "SqNeedWakeup should return true when SQ flags pointer is unavailable."); + return true; + } + + unsafe + { + uint* ptr = (uint*)Pointer.Unbox(pointerBoxed); + bool rawValue = ptr == null || (Volatile.Read(ref *ptr) & 0x1u) != 0; + Assert.Equal(rawValue, methodValue); + } + + return true; + } + + return false; + } + + private static void EnableSqPollAppContextOptIn() => + AppContext.SetSwitch("System.Net.Sockets.IoUring.EnableSqPoll", true); + + private static IoUringZeroCopyPinHoldSnapshot GetIoUringZeroCopyPinHoldSnapshot() + { + Assembly socketsAssembly = typeof(Socket).Assembly; + Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!; + + FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!; + PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo zeroCopyPinHoldsField = engineType.GetField("_zeroCopyPinHolds", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo completionSlotsField = engineType.GetField("_completionSlots", BindingFlags.NonPublic | BindingFlags.Instance)!; + + bool hasIoUringPort = false; + int activePinHolds = 0; + int pendingNotificationCount = 0; + + foreach (object? engine in (Array)enginesField.GetValue(null)!) + { + if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)!) + { + continue; + } + + hasIoUringPort = true; + + if (zeroCopyPinHoldsField.GetValue(engine) is MemoryHandle[] pinHolds) + { + foreach (MemoryHandle pinHold in pinHolds) + { + if (!pinHold.Equals(default(MemoryHandle))) + { + activePinHolds++; + } + } + } + + if (completionSlotsField.GetValue(engine) is Array completionSlots) + { + Type? completionSlotType = completionSlots.GetType().GetElementType(); + FieldInfo? 
zeroCopyPendingField = completionSlotType?.GetField("ZeroCopyNotificationPending", BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance); + if (zeroCopyPendingField is null) + { + continue; + } + + foreach (object? slot in completionSlots) + { + if (slot is not null && (bool)zeroCopyPendingField.GetValue(slot)!) + { + pendingNotificationCount++; + } + } + } + } + + return new IoUringZeroCopyPinHoldSnapshot( + hasIoUringPort, + activePinHolds, + pendingNotificationCount); + } + + private static bool TryForceIoUringProvidedBufferRingExhaustionForTest(out int forcedBufferCount) + { + Assembly socketsAssembly = typeof(Socket).Assembly; + Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!; + + FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!; + PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo supportsProvidedBufferRingsField = engineType.GetField("_supportsProvidedBufferRings", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo providedBufferRingField = engineType.GetField("_ioUringProvidedBufferRing", BindingFlags.NonPublic | BindingFlags.Instance)!; + + foreach (object? engine in (Array)enginesField.GetValue(null)!) + { + if (engine is null || + !(bool)isIoUringEnabledProperty.GetValue(engine)! || + !(bool)supportsProvidedBufferRingsField.GetValue(engine)!) + { + continue; + } + + object? providedBufferRing = providedBufferRingField.GetValue(engine); + if (providedBufferRing is null) + { + continue; + } + + Type providedBufferRingType = providedBufferRing.GetType(); + FieldInfo? bufferStatesField = providedBufferRingType.GetField("_bufferStates", BindingFlags.NonPublic | BindingFlags.Instance); + FieldInfo? 
availableCountField = providedBufferRingType.GetField("_availableCount", BindingFlags.NonPublic | BindingFlags.Instance); + FieldInfo? inUseCountField = providedBufferRingType.GetField("_inUseCount", BindingFlags.NonPublic | BindingFlags.Instance); + + if (bufferStatesField is null || availableCountField is null || inUseCountField is null) + { + continue; + } + + byte[] bufferStates = (byte[])bufferStatesField.GetValue(providedBufferRing)!; + for (int i = 0; i < bufferStates.Length; i++) + { + bufferStates[i] = 2; // BufferStateCheckedOut + } + + availableCountField.SetValue(providedBufferRing, 0); + inUseCountField.SetValue(providedBufferRing, bufferStates.Length); + forcedBufferCount = bufferStates.Length; + return true; + } + + forcedBufferCount = 0; + return false; + } + + private static bool TryRecycleForcedIoUringProvidedBufferRingForTest(out int recycledBufferCount) + { + Assembly socketsAssembly = typeof(Socket).Assembly; + Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!; + + FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!; + PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo supportsProvidedBufferRingsField = engineType.GetField("_supportsProvidedBufferRings", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo providedBufferRingField = engineType.GetField("_ioUringProvidedBufferRing", BindingFlags.NonPublic | BindingFlags.Instance)!; + + foreach (object? engine in (Array)enginesField.GetValue(null)!) + { + if (engine is null || + !(bool)isIoUringEnabledProperty.GetValue(engine)! || + !(bool)supportsProvidedBufferRingsField.GetValue(engine)!) + { + continue; + } + + object? providedBufferRing = providedBufferRingField.GetValue(engine); + if (providedBufferRing is null) + { + continue; + } + + MethodInfo? 
recycleMethod = providedBufferRing.GetType().GetMethod( + "RecycleCheckedOutBuffersForTeardown", + BindingFlags.NonPublic | BindingFlags.Instance); + if (recycleMethod is null) + { + continue; + } + + recycledBufferCount = Convert.ToInt32(recycleMethod.Invoke(providedBufferRing, null)); + return true; + } + + recycledBufferCount = 0; + return false; + } + + private static ulong CounterDelta(ulong before, ulong after) => + after >= before ? after - before : after; + + private static async Task WithIoUringNativeDiagnosticsSnapshotDeltaAsync( + Func scenario, + Action validateDelta, + int settleDelayMilliseconds = 0, + bool skipScenarioWhenIoUringUnavailable = false) + { + IoUringNativeDiagnosticsSnapshot diagnosticsBefore = GetIoUringNativeDiagnosticsSnapshot(); + if (skipScenarioWhenIoUringUnavailable && !diagnosticsBefore.HasIoUringPort) + { + return; + } + + await scenario(); + + if (settleDelayMilliseconds > 0) + { + await Task.Delay(settleDelayMilliseconds); + } + + IoUringNativeDiagnosticsSnapshot diagnosticsAfter = GetIoUringNativeDiagnosticsSnapshot(); + if (!diagnosticsBefore.HasIoUringPort && !diagnosticsAfter.HasIoUringPort) + { + return; + } + + validateDelta(diagnosticsBefore, diagnosticsAfter); + } + + private static Task StartReceiveMessageFromAsync(Socket socket, SocketAsyncEventArgs eventArgs) + => StartSocketAsyncEventArgsOperation(socket, eventArgs, static (s, args) => s.ReceiveMessageFromAsync(args)); + + private static Task StartSocketAsyncEventArgsOperation( + Socket socket, + SocketAsyncEventArgs eventArgs, + Func startOperation) + { + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + EventHandler handler = null!; + handler = (_, completedArgs) => + { + eventArgs.Completed -= handler; + tcs.TrySetResult(completedArgs); + }; + + eventArgs.Completed += handler; + if (!startOperation(socket, eventArgs)) + { + eventArgs.Completed -= handler; + tcs.TrySetResult(eventArgs); + } + + return tcs.Task; + } + + 
private static async Task<(Socket Listener, Socket Client, Socket Server)> CreateConnectedTcpSocketTrioAsync(int listenBacklog = 1) + { + Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + try + { + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(listenBacklog); + + Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + try + { + Task acceptTask = listener.AcceptAsync(); + await client.ConnectAsync((IPEndPoint)listener.LocalEndPoint!); + Socket server = await acceptTask; + return (listener, client, server); + } + catch + { + client.Dispose(); + throw; + } + } + catch + { + listener.Dispose(); + throw; + } + } + + private static async Task<(Socket Client, Socket Server)> AcceptConnectedTcpPairAsync(Socket listener, IPEndPoint endpoint) + { + Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + try + { + Task acceptTask = listener.AcceptAsync(); + await client.ConnectAsync(endpoint); + Socket server = await acceptTask; + return (client, server); + } + catch + { + client.Dispose(); + throw; + } + } + + private static async Task RunTcpRoundTripAsync(int iterations) + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + byte[] sendBuffer = new byte[] { 1 }; + byte[] receiveBuffer = new byte[1]; + + for (int i = 0; i < iterations; i++) + { + var serverReceiveTask = server.ReceiveAsync(receiveBuffer, SocketFlags.None); + await Task.Yield(); + + int clientSent = await client.SendAsync(sendBuffer, SocketFlags.None); + Assert.Equal(1, clientSent); + + int serverReceived = await serverReceiveTask; + Assert.Equal(1, serverReceived); + Assert.Equal(sendBuffer[0], receiveBuffer[0]); + + var clientReceiveTask = client.ReceiveAsync(receiveBuffer, SocketFlags.None); + await Task.Yield(); + + int serverSent = await 
server.SendAsync(sendBuffer, SocketFlags.None); + Assert.Equal(1, serverSent); + + int clientReceived = await clientReceiveTask; + Assert.Equal(1, clientReceived); + Assert.Equal(sendBuffer[0], receiveBuffer[0]); + + unchecked + { + sendBuffer[0]++; + } + } + } + + private static async Task RunBufferListSendRoundTripAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + byte[] payload = new byte[] { 0x11, 0x22, 0x33, 0x44, 0x55 }; + var sendBuffers = new List> + { + new ArraySegment(payload, 0, 2), + new ArraySegment(payload, 2, 1), + new ArraySegment(payload, 3, 2) + }; + + byte[] receiveBuffer = new byte[payload.Length]; + Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + + int sent = await client.SendAsync(sendBuffers, SocketFlags.None); + Assert.Equal(payload.Length, sent); + Assert.Equal(payload.Length, await receiveTask); + Assert.Equal(payload, receiveBuffer); + } + + private static async Task RunReceiveMessageFromRoundTripAsync() + { + using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + + receiver.SetSocketOption(SocketOptionLevel.IP, SocketOptionName.PacketInformation, true); + receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + sender.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + + byte[] payload = new byte[] { 0x91, 0x92, 0x93 }; + byte[] receiveBuffer = new byte[payload.Length]; + EndPoint remoteEndPoint = new IPEndPoint(IPAddress.Any, 0); + + var receiveTask = receiver.ReceiveMessageFromAsync(receiveBuffer, SocketFlags.None, remoteEndPoint); + await Task.Yield(); + + int sent = await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!); + Assert.Equal(payload.Length, sent); + + 
SocketReceiveMessageFromResult result = await receiveTask; + Assert.Equal(payload.Length, result.ReceivedBytes); + Assert.Equal(payload, receiveBuffer); + Assert.Equal(sender.LocalEndPoint, result.RemoteEndPoint); + } + + private static async Task RunReceiveMessageFromPacketInformationRoundTripAsync(bool useIpv6) + { + if (useIpv6 && !Socket.OSSupportsIPv6) + { + return; + } + + AddressFamily addressFamily = useIpv6 ? AddressFamily.InterNetworkV6 : AddressFamily.InterNetwork; + SocketOptionLevel optionLevel = useIpv6 ? SocketOptionLevel.IPv6 : SocketOptionLevel.IP; + IPAddress loopbackAddress = useIpv6 ? IPAddress.IPv6Loopback : IPAddress.Loopback; + IPAddress anyAddress = useIpv6 ? IPAddress.IPv6Any : IPAddress.Any; + + using Socket receiver = new Socket(addressFamily, SocketType.Dgram, ProtocolType.Udp); + using Socket sender = new Socket(addressFamily, SocketType.Dgram, ProtocolType.Udp); + + receiver.SetSocketOption(optionLevel, SocketOptionName.PacketInformation, true); + receiver.Bind(new IPEndPoint(loopbackAddress, 0)); + sender.Bind(new IPEndPoint(loopbackAddress, 0)); + + byte[] payload = useIpv6 ? 
+ new byte[] { 0xA1, 0xA2, 0xA3 } : + new byte[] { 0x90, 0x91, 0x92, 0x93 }; + byte[] receiveBuffer = new byte[payload.Length]; + EndPoint remoteEndPoint = new IPEndPoint(anyAddress, 0); + + Task receiveTask = + ToTask(receiver.ReceiveMessageFromAsync(receiveBuffer, SocketFlags.None, remoteEndPoint)); + await Task.Yield(); + + int sent = await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!); + Assert.Equal(payload.Length, sent); + + SocketReceiveMessageFromResult result = await receiveTask; + Assert.Equal(payload.Length, result.ReceivedBytes); + Assert.Equal(payload, receiveBuffer); + Assert.Equal(sender.LocalEndPoint, result.RemoteEndPoint); + Assert.Equal(((IPEndPoint)sender.LocalEndPoint!).Address, result.PacketInformation.Address); + } + + private static async Task RunNonPinnableMemorySendFallbackScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + byte[] payload = new byte[] { 0x71, 0x72, 0x73, 0x74 }; + using var nonPinnableMemory = new NonPinnableMemoryManager(payload); + byte[] receiveBuffer = new byte[payload.Length]; + + Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + int sent = await client.SendAsync(nonPinnableMemory.Memory, SocketFlags.None); + Assert.Equal(payload.Length, sent); + Assert.Equal(payload.Length, await receiveTask); + Assert.Equal(payload, receiveBuffer); + } + + private static async Task RunNonPinnableMemoryReceiveFallbackScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + byte[] receiveBuffer = new byte[4]; + using var nonPinnableMemory = new NonPinnableMemoryManager(receiveBuffer); + byte[] payload = new byte[] { 0x81, 0x82, 0x83, 0x84 }; + + Task receiveTask = 
ToTask(server.ReceiveAsync(nonPinnableMemory.Memory, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(payload.Length, await client.SendAsync(payload, SocketFlags.None)); + Assert.Equal(payload.Length, await receiveTask); + Assert.Equal(payload, receiveBuffer); + } + + private static Task RunNonPinnableMemoryFallbackScenarioAsync(bool receivePath) => + receivePath ? RunNonPinnableMemoryReceiveFallbackScenarioAsync() : RunNonPinnableMemorySendFallbackScenarioAsync(); + + private static async Task RunNonPinnableFallbackTelemetryScenarioAsync() + { + long before = 0; + long after = 0; + + await WithIoUringNativeDiagnosticsSnapshotDeltaAsync( + async () => + { + before = GetIoUringPrepareNonPinnableFallbackCounterValue(); + await RunNonPinnableMemorySendFallbackScenarioAsync(); + await RunNonPinnableMemoryReceiveFallbackScenarioAsync(); + after = GetIoUringPrepareNonPinnableFallbackCounterValue(); + }, + (_, _) => + { + Assert.True( + after > before, + $"Expected io_uring non-pinnable fallback telemetry to increase. before={before}, after={after}"); + }, + skipScenarioWhenIoUringUnavailable: true); + } + + private static async Task RunPinnableMemoryPinReleaseLifecycleScenarioAsync() + { + await WithIoUringNativeDiagnosticsSnapshotDeltaAsync( + async () => + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + // Completion path: receive completes with data and must release pin. 
byte[] completionPayload = new byte[] { 0x91 };
                using var completionMemory = new TrackingPinnableMemoryManager(new byte[completionPayload.Length]);
                Task<int> completionReceive = ToTask(server.ReceiveAsync(completionMemory.Memory, SocketFlags.None));
                await Task.Yield();
                Assert.Equal(1, await client.SendAsync(completionPayload, SocketFlags.None));
                Assert.Equal(1, await completionReceive);
                Assert.Equal(completionPayload, completionMemory.GetSpan().ToArray());
                await AssertPinsReleasedAsync(completionMemory);

                // Cancellation path: pending receive canceled by token must release pin.
                using var cancellationMemory = new TrackingPinnableMemoryManager(new byte[16]);
                using (var cts = new CancellationTokenSource())
                {
                    Task<int> canceledReceive = ToTask(server.ReceiveAsync(cancellationMemory.Memory, SocketFlags.None, cts.Token));
                    await Task.Delay(20);
                    cts.Cancel();

                    Exception? canceledException = await Record.ExceptionAsync(async () => await canceledReceive);
                    AssertCanceledOrInterrupted(canceledException);
                }

                await AssertPinsReleasedAsync(cancellationMemory);

                // Teardown/abort path: pending receive interrupted by close must release pin.
                using var teardownMemory = new TrackingPinnableMemoryManager(new byte[16]);
                Task<int> teardownReceive = ToTask(server.ReceiveAsync(teardownMemory.Memory, SocketFlags.None));
                await Task.Yield();
                client.Dispose();
                server.Dispose();

                Exception? teardownException = await Record.ExceptionAsync(async () => await teardownReceive);
                AssertCanceledDisposedOrInterrupted(teardownException);
                await AssertPinsReleasedAsync(teardownMemory);
            },
            static (_, _) => { },
            skipScenarioWhenIoUringUnavailable: true);
    }

    /// <summary>
    /// Verifies provided-buffer accounting returns to a fully-available, zero-in-use state
    /// after both a completed receive and a token-canceled receive.
    /// </summary>
    private static async Task RunProvidedBufferRegistrationLifecycleScenarioAsync()
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        byte[] receiveBuffer = new byte[1];
        Task<int> initialReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
        await Task.Yield();
        Assert.Equal(1, await client.SendAsync(new byte[] { 0xA1 }, SocketFlags.None));
        Assert.Equal(1, await initialReceive);

        IoUringProvidedBufferSnapshot initialSnapshot = GetIoUringProvidedBufferSnapshot();
        if (!initialSnapshot.IsUsable)
        {
            return;
        }

        Assert.Equal(initialSnapshot.TotalBufferCount, initialSnapshot.AvailableCount + initialSnapshot.InUseCount);
        Assert.Equal(0, initialSnapshot.InUseCount);

        using (var cts = new CancellationTokenSource())
        {
            Task<int> canceledReceive = ToTask(server.ReceiveAsync(new byte[1], SocketFlags.None, cts.Token));
            await Task.Yield();
            cts.Cancel();

            Exception? canceledException = await Record.ExceptionAsync(async () => await canceledReceive);
            AssertCanceledOrInterrupted(canceledException);
        }

        // Give the engine a moment to retire the canceled operation before sampling counters.
        await Task.Delay(50);
        IoUringProvidedBufferSnapshot postCancellationSnapshot = GetIoUringProvidedBufferSnapshot();
        Assert.Equal(initialSnapshot.TotalBufferCount, postCancellationSnapshot.TotalBufferCount);
        Assert.Equal(postCancellationSnapshot.TotalBufferCount, postCancellationSnapshot.AvailableCount + postCancellationSnapshot.InUseCount);
        Assert.Equal(0, postCancellationSnapshot.InUseCount);
    }

    /// <summary>
    /// Verifies a single provided-buffer receive recycles its buffer (recycle counter increases,
    /// depletion counter does not) and leaves no buffers marked in-use.
    /// </summary>
    private static async Task RunProvidedBufferSelectReceiveScenarioAsync()
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
        if (!beforeSnapshot.IsUsable)
        {
            return;
        }

        ulong recycleBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles");
        ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");

        byte[] receiveBuffer = new byte[1];
        Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
        await Task.Yield();

        Assert.Equal(1, await client.SendAsync(new byte[] { 0xB2 }, SocketFlags.None));
        Assert.Equal(1, await receiveTask);
        Assert.Equal(0xB2, receiveBuffer[0]);

        IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot();
        ulong recycleAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles");
        ulong depletionAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");

        Assert.True(recycleAfter > recycleBefore, "Expected provided-buffer recycle counter to increase after a completion.");
        Assert.Equal(depletionBefore, depletionAfter);
        Assert.Equal(afterSnapshot.TotalBufferCount, afterSnapshot.AvailableCount + afterSnapshot.InUseCount);
        Assert.Equal(0, afterSnapshot.InUseCount);
    }

    /// <summary>
    /// Drives more receives than the ring has buffers to prove buffers are recycled and reused
    /// without depletion, allocation failures, or ring growth.
    /// </summary>
    private static async Task RunProvidedBufferRecycleReuseScenarioAsync()
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
        if (!beforeSnapshot.IsUsable)
        {
            return;
        }

        ulong recycleBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles");
        ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");
        long allocationFailuresBefore = beforeSnapshot.AllocationFailureCount;

        // More iterations than available buffers guarantees at least one full recycle cycle.
        int iterations = Math.Max(beforeSnapshot.TotalBufferCount + 64, 512);
        byte[] receiveBuffer = new byte[1];
        byte[] payload = new byte[1];

        for (int i = 0; i < iterations; i++)
        {
            Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
            await Task.Yield();

            payload[0] = unchecked((byte)i);
            Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
            Assert.Equal(1, await receiveTask);
            Assert.Equal(payload[0], receiveBuffer[0]);
        }

        IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot();
        ulong recycleAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles");
        ulong depletionAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");

        Assert.True(
            recycleAfter >= recycleBefore + (ulong)iterations,
            $"Expected at least {iterations} provided-buffer recycle increments. before={recycleBefore}, after={recycleAfter}");
        Assert.Equal(depletionBefore, depletionAfter);
        Assert.Equal(allocationFailuresBefore, afterSnapshot.AllocationFailureCount);
        Assert.Equal(beforeSnapshot.TotalBufferCount, afterSnapshot.TotalBufferCount);
        Assert.Equal(0, afterSnapshot.InUseCount);
        Assert.Equal(afterSnapshot.TotalBufferCount, afterSnapshot.AvailableCount);
    }

    /// <summary>
    /// Forces the provided-buffer ring empty and verifies a receive fails with
    /// NoBufferSpaceAvailable while the depletion counter increases.
    /// </summary>
    private static async Task RunProvidedBufferExhaustionScenarioAsync()
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        byte[] warmupBuffer = new byte[1];
        Task<int> warmupReceive = ToTask(server.ReceiveAsync(warmupBuffer, SocketFlags.None));
        await Task.Yield();
        Assert.Equal(1, await client.SendAsync(new byte[] { 0xC1 }, SocketFlags.None));
        Assert.Equal(1, await warmupReceive);

        IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
        if (!snapshot.IsUsable)
        {
            return;
        }

        ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");
        Assert.True(TryForceIoUringProvidedBufferRingExhaustionForTest(out int forcedBufferCount));
        Assert.True(forcedBufferCount > 0);

        byte[] receiveBuffer = new byte[1];
        Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
        await Task.Yield();

        Assert.Equal(1, await client.SendAsync(new byte[] { 0xC2 }, SocketFlags.None));
        Task completed = await Task.WhenAny(receiveTask, Task.Delay(TimeSpan.FromSeconds(15)));
        Assert.Same(receiveTask, completed);

        Exception? receiveException = await Record.ExceptionAsync(async () => await receiveTask);
        SocketException socketException = Assert.IsType<SocketException>(receiveException);
        Assert.Equal(SocketError.NoBufferSpaceAvailable, socketException.SocketErrorCode);
        Assert.True(
            GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions") > depletionBefore,
            "Expected provided-buffer depletion counter to increase when ring buffers are forced unavailable.");
    }

    /// <summary>
    /// Runs concurrent TCP and UDP receives to verify provided-buffer accounting stays balanced
    /// across mixed socket types.
    /// </summary>
    private static async Task RunProvidedBufferMixedWorkloadScenarioAsync()
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
        if (!beforeSnapshot.IsUsable)
        {
            return;
        }

        using Socket udpReceiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
        using Socket udpSender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
        udpReceiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
        udpSender.Bind(new IPEndPoint(IPAddress.Loopback, 0));

        ulong recycleBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles");
        ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");

        byte[] tcpReceiveBuffer = new byte[1];
        byte[] udpReceiveBuffer = new byte[2];

        Task<int> tcpReceive = ToTask(server.ReceiveAsync(tcpReceiveBuffer, SocketFlags.None));
        Task<SocketReceiveFromResult> udpReceive = ToTask(
            udpReceiver.ReceiveFromAsync(
                udpReceiveBuffer,
                SocketFlags.None,
                new IPEndPoint(IPAddress.Any, 0)));
        await Task.Yield();

        Assert.Equal(1, await client.SendAsync(new byte[] { 0xD1 }, SocketFlags.None));
        Assert.Equal(2, await udpSender.SendToAsync(new byte[] { 0xE1, 0xE2 }, SocketFlags.None, udpReceiver.LocalEndPoint!));

        Assert.Equal(1, await tcpReceive);
        Assert.Equal(0xD1, tcpReceiveBuffer[0]);

        SocketReceiveFromResult udpResult = await udpReceive;
        Assert.Equal(2, udpResult.ReceivedBytes);
        Assert.Equal(0xE1, udpReceiveBuffer[0]);
        Assert.Equal(0xE2, udpReceiveBuffer[1]);

        IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot();
        ulong recycleAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles");
        ulong depletionAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");

        Assert.True(recycleAfter > recycleBefore, "Expected provided-buffer recycle counter to increase in mixed workload.");
        Assert.Equal(depletionBefore, depletionAfter);
        Assert.Equal(afterSnapshot.TotalBufferCount, afterSnapshot.AvailableCount + afterSnapshot.InUseCount);
        Assert.Equal(0, afterSnapshot.InUseCount);
    }

    /// <summary>Sends the entire buffer, looping over partial sends.</summary>
    private static async Task SendExactlyAsync(Socket socket, ReadOnlyMemory<byte> buffer)
    {
        int totalSent = 0;
        while (totalSent < buffer.Length)
        {
            int sent = await socket.SendAsync(buffer.Slice(totalSent), SocketFlags.None);
            Assert.True(sent > 0, "Socket.SendAsync returned 0 before sending all bytes.");
            totalSent += sent;
        }
    }

    /// <summary>Receives until the buffer is full, looping over partial receives.</summary>
    private static async Task ReceiveExactlyAsync(Socket socket, Memory<byte> buffer)
    {
        int totalReceived = 0;
        while (totalReceived < buffer.Length)
        {
            int received = await socket.ReceiveAsync(buffer.Slice(totalReceived), SocketFlags.None);
            Assert.True(received > 0, "Socket.ReceiveAsync returned 0 before receiving all expected bytes.");
            totalReceived += received;
        }
    }

    /// <summary>
    /// Polls the provided-buffer snapshot until <paramref name="predicate"/> matches or the
    /// timeout elapses; returns the last snapshot observed either way.
    /// </summary>
    private static async Task<IoUringProvidedBufferSnapshot> WaitForProvidedBufferSnapshotAsync(
        Func<IoUringProvidedBufferSnapshot, bool> predicate,
        int timeoutMilliseconds = 10000)
    {
        DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
        IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
        while (DateTime.UtcNow < deadline)
        {
            if (predicate(snapshot))
            {
                return snapshot;
            }

            await Task.Delay(50);
            snapshot = GetIoUringProvidedBufferSnapshot();
        }

        return snapshot;
    }

    /// <summary>
    /// Sustained small messages should drive the adaptive buffer-size recommendation downward.
    /// </summary>
    private static async Task RunAdaptiveProvidedBufferSmallMessageShrinkScenarioAsync()
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
        if (!beforeSnapshot.IsAdaptiveSizingUsable)
        {
            return;
        }

        int initialBufferSize = beforeSnapshot.BufferSize;
        Assert.True(initialBufferSize > 0);

        const int payloadSize = 64;
        byte[] sendBuffer = new byte[payloadSize];
        byte[] receiveBuffer = new byte[payloadSize];

        for (int i = 0; i < 320; i++)
        {
            sendBuffer.AsSpan().Fill(unchecked((byte)i));
            Task receiveTask = ReceiveExactlyAsync(server, receiveBuffer);
            await SendExactlyAsync(client, sendBuffer);
            await receiveTask;
            Assert.Equal(sendBuffer, receiveBuffer);
        }

        IoUringProvidedBufferSnapshot afterSnapshot = await WaitForProvidedBufferSnapshotAsync(
            snapshot => snapshot.IsAdaptiveSizingUsable &&
                (snapshot.RecommendedBufferSize < initialBufferSize || snapshot.BufferSize < initialBufferSize));

        Assert.True(
            afterSnapshot.RecommendedBufferSize < initialBufferSize || afterSnapshot.BufferSize < initialBufferSize,
            $"Expected adaptive recommendation to shrink from {initialBufferSize}. " +
            $"actual buffer={afterSnapshot.BufferSize}, recommended={afterSnapshot.RecommendedBufferSize}");
    }

    /// <summary>
    /// Sustained buffer-sized messages should drive the adaptive buffer-size recommendation upward.
    /// </summary>
    private static async Task RunAdaptiveProvidedBufferLargeMessageGrowScenarioAsync()
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
        if (!beforeSnapshot.IsAdaptiveSizingUsable)
        {
            return;
        }

        int initialBufferSize = beforeSnapshot.BufferSize;
        Assert.True(initialBufferSize > 0);

        int payloadSize = initialBufferSize;
        byte[] sendBuffer = new byte[payloadSize];
        byte[] receiveBuffer = new byte[payloadSize];
        sendBuffer.AsSpan().Fill(0x5A);

        for (int i = 0; i < 320; i++)
        {
            Task receiveTask = ReceiveExactlyAsync(server, receiveBuffer);
            await SendExactlyAsync(client, sendBuffer);
            await receiveTask;
            Assert.Equal(sendBuffer, receiveBuffer);
        }

        IoUringProvidedBufferSnapshot afterSnapshot = await WaitForProvidedBufferSnapshotAsync(
            snapshot => snapshot.IsAdaptiveSizingUsable &&
                (snapshot.RecommendedBufferSize > initialBufferSize || snapshot.BufferSize > initialBufferSize));

        Assert.True(
            afterSnapshot.RecommendedBufferSize > initialBufferSize || afterSnapshot.BufferSize > initialBufferSize,
            $"Expected adaptive recommendation to grow from {initialBufferSize}. " +
            $"actual buffer={afterSnapshot.BufferSize}, recommended={afterSnapshot.RecommendedBufferSize}");
    }

    /// <summary>
    /// A balanced small/large workload should leave the adaptive recommendation at its initial size.
    /// </summary>
    private static async Task RunAdaptiveProvidedBufferMixedWorkloadStableScenarioAsync()
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
        if (!beforeSnapshot.IsAdaptiveSizingUsable)
        {
            return;
        }

        int initialBufferSize = beforeSnapshot.BufferSize;
        Assert.True(initialBufferSize > 0);

        byte[] smallSend = new byte[64];
        byte[] smallReceive = new byte[64];
        byte[] largeSend = new byte[initialBufferSize];
        byte[] largeReceive = new byte[initialBufferSize];
        smallSend.AsSpan().Fill(0x11);
        largeSend.AsSpan().Fill(0x77);

        for (int i = 0; i < 320; i++)
        {
            bool useLarge = (i & 1) == 1;
            byte[] send = useLarge ? largeSend : smallSend;
            byte[] receive = useLarge ? largeReceive : smallReceive;

            Task receiveTask = ReceiveExactlyAsync(server, receive);
            await SendExactlyAsync(client, send);
            await receiveTask;
            Assert.Equal(send, receive);
        }

        await Task.Delay(250);
        IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot();
        Assert.True(afterSnapshot.IsAdaptiveSizingUsable);
        Assert.Equal(initialBufferSize, afterSnapshot.RecommendedBufferSize);
    }

    /// <summary>
    /// Drives enough small traffic to trigger an adaptive ring resize swap while validating every
    /// payload round-trips intact (no data loss across the swap).
    /// </summary>
    private static async Task RunAdaptiveProvidedBufferResizeSwapNoDataLossScenarioAsync()
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
        if (!beforeSnapshot.IsAdaptiveSizingUsable)
        {
            return;
        }

        int initialBufferSize = beforeSnapshot.BufferSize;
        Assert.True(initialBufferSize > 0);

        const int payloadSize = 64;
        byte[] sendBuffer = new byte[payloadSize];
        byte[] receiveBuffer = new byte[payloadSize];
        for (int i = 0; i < 384; i++)
        {
            sendBuffer.AsSpan().Fill(unchecked((byte)i));
            Task receiveTask = ReceiveExactlyAsync(server, receiveBuffer);
            await SendExactlyAsync(client, sendBuffer);
            await receiveTask;
            Assert.Equal(sendBuffer, receiveBuffer);
        }

        IoUringProvidedBufferSnapshot afterSnapshot = await WaitForProvidedBufferSnapshotAsync(
            snapshot => snapshot.IsAdaptiveSizingUsable && snapshot.BufferSize < initialBufferSize,
            timeoutMilliseconds: 15000);

        Assert.True(
            afterSnapshot.BufferSize < initialBufferSize,
            $"Expected adaptive resize swap to shrink active ring. initial={initialBufferSize}, current={afterSnapshot.BufferSize}");
    }

    /// <summary>
    /// With adaptive sizing disabled, buffer size and recommendation must not move under load.
    /// </summary>
    private static async Task RunAdaptiveProvidedBufferDisabledScenarioAsync()
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
        if (!beforeSnapshot.IsUsable)
        {
            return;
        }

        Assert.False(beforeSnapshot.AdaptiveBufferSizingEnabled);

        int initialBufferSize = beforeSnapshot.BufferSize;
        int initialRecommendedSize = beforeSnapshot.RecommendedBufferSize;

        const int payloadSize = 64;
        byte[] sendBuffer = new byte[payloadSize];
        byte[] receiveBuffer = new byte[payloadSize];
        sendBuffer.AsSpan().Fill(0xA5);

        for (int i = 0; i < 320; i++)
        {
            Task receiveTask = ReceiveExactlyAsync(server, receiveBuffer);
            await SendExactlyAsync(client, sendBuffer);
            await receiveTask;
            Assert.Equal(sendBuffer, receiveBuffer);
        }

        await Task.Delay(250);
        IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot();
        Assert.True(afterSnapshot.IsUsable);
        Assert.False(afterSnapshot.AdaptiveBufferSizingEnabled);
        Assert.Equal(initialBufferSize, afterSnapshot.BufferSize);
        Assert.Equal(initialRecommendedSize, afterSnapshot.RecommendedBufferSize);
    }

    /// <summary>
    /// Confirms the adaptive-sizing flag in the snapshot matches the expected configuration state.
    /// </summary>
    private static async Task RunAdaptiveProvidedBufferSizingStateScenarioAsync(bool expectedEnabled)
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        // Warm up receive path so io_uring provided-buffer ring state is initialized.
        byte[] receiveBuffer = new byte[1];
        Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
        await Task.Yield();
        Assert.Equal(1, await client.SendAsync(new byte[] { 0x42 }, SocketFlags.None));
        Assert.Equal(1, await receiveTask);

        IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
        if (!snapshot.IsUsable)
        {
            return;
        }

        Assert.Equal(expectedEnabled, snapshot.AdaptiveBufferSizingEnabled);
    }

    /// <summary>
    /// With kernel buffer registration disabled, the snapshot must report no registered buffers.
    /// </summary>
    private static async Task RunProvidedBufferKernelRegistrationDisabledScenarioAsync()
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        // Warm up receive path so io_uring provided-buffer ring state is initialized.
        byte[] receiveBuffer = new byte[1];
        Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
        await Task.Yield();
        Assert.Equal(1, await client.SendAsync(new byte[] { 0x42 }, SocketFlags.None));
        Assert.Equal(1, await receiveTask);

        IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
        if (!snapshot.IsUsable)
        {
            return;
        }

        Assert.False(snapshot.HasRegisteredBuffers);
    }

    /// <summary>
    /// When registered buffers are active, success telemetry must have been recorded; either way
    /// at least one initialization attempt must be visible.
    /// </summary>
    private static async Task RunProvidedBufferKernelRegistrationSuccessScenarioAsync()
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        // Warm up receive path so io_uring provided-buffer ring state and telemetry are initialized.
        byte[] receiveBuffer = new byte[1];
        Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
        await Task.Yield();
        Assert.Equal(1, await client.SendAsync(new byte[] { 0x42 }, SocketFlags.None));
        Assert.Equal(1, await receiveTask);

        IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
        if (!snapshot.IsUsable)
        {
            return;
        }

        ulong successCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialSuccess");
        ulong failureCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialFailure");
        Assert.True(
            successCount + failureCount > 0,
            "Expected at least one registered-buffer initialization attempt.");

        // Best-effort success-path assertion: only enforce when registration succeeded on this machine.
        if (!snapshot.HasRegisteredBuffers)
        {
            return;
        }

        Assert.True(successCount > 0, "Expected success telemetry when registered buffers are active.");
    }

    /// <summary>
    /// A failed kernel buffer registration must be non-fatal: the provided-buffer receive path
    /// continues to work and failure telemetry is recorded.
    /// </summary>
    private static async Task RunProvidedBufferKernelRegistrationFailureNonFatalScenarioAsync()
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        // Warm up receive path so io_uring provided-buffer ring state and telemetry are initialized.
        byte[] receiveBuffer = new byte[1];
        Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
        await Task.Yield();
        Assert.Equal(1, await client.SendAsync(new byte[] { 0x42 }, SocketFlags.None));
        Assert.Equal(1, await receiveTask);

        IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
        if (!snapshot.IsUsable || snapshot.HasRegisteredBuffers)
        {
            // No observed registration failure in this environment.
            return;
        }

        // Registration is not active: verify provided-buffer receive path still works.
        byte[] payload = new byte[4096];
        byte[] received = new byte[payload.Length];
        for (int i = 0; i < payload.Length; i++)
        {
            payload[i] = unchecked((byte)(i + 31));
        }

        Task receiveAllTask = ReceiveExactlyAsync(server, received);
        await SendExactlyAsync(client, payload);
        await receiveAllTask;
        Assert.Equal(payload, received);

        ulong failureCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialFailure");
        Assert.True(failureCount > 0, "Expected failure telemetry when registered buffers are inactive.");
    }

    /// <summary>
    /// An adaptive resize swap must trigger at least one registered-buffer re-registration attempt.
    /// </summary>
    private static async Task RunProvidedBufferKernelReregistrationOnResizeScenarioAsync()
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
        if (!beforeSnapshot.IsAdaptiveSizingUsable)
        {
            return;
        }

        ulong reregSuccessBefore = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersReregistrationSuccess");
        ulong reregFailureBefore = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersReregistrationFailure");

        int initialBufferSize = beforeSnapshot.BufferSize;
        Assert.True(initialBufferSize > 0);

        const int payloadSize = 64;
        byte[] sendBuffer = new byte[payloadSize];
        byte[] receiveBuffer = new byte[payloadSize];
        for (int i = 0; i < 384; i++)
        {
            sendBuffer.AsSpan().Fill(unchecked((byte)(i + 1)));
            Task receivePayloadTask = ReceiveExactlyAsync(server, receiveBuffer);
            await SendExactlyAsync(client, sendBuffer);
            await receivePayloadTask;
            Assert.Equal(sendBuffer, receiveBuffer);
        }

        IoUringProvidedBufferSnapshot afterSnapshot = await WaitForProvidedBufferSnapshotAsync(
            snapshot => snapshot.IsAdaptiveSizingUsable && snapshot.BufferSize < initialBufferSize,
            timeoutMilliseconds: 15000);

        Assert.True(afterSnapshot.BufferSize < initialBufferSize);

        ulong reregSuccessAfter = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersReregistrationSuccess");
        ulong reregFailureAfter = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersReregistrationFailure");
        Assert.True(
            (reregSuccessAfter + reregFailureAfter) > (reregSuccessBefore + reregFailureBefore),
            "Expected at least one registered-buffer re-registration attempt after adaptive resize.");
    }

    /// <summary>
    /// Validates payload correctness under a mixed small/large workload while kernel-registered
    /// buffers are active.
    /// </summary>
    private static async Task RunProvidedBufferRegisteredBuffersDataCorrectnessScenarioAsync()
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
        if (!snapshot.IsUsable || !snapshot.HasRegisteredBuffers)
        {
            return;
        }

        // Reuse the mixed workload profile to validate payload correctness with registered buffers active.
        byte[] smallSend = new byte[64];
        byte[] largeSend = new byte[Math.Max(snapshot.BufferSize, 4096)];
        byte[] smallReceive = new byte[smallSend.Length];
        byte[] largeReceive = new byte[largeSend.Length];

        for (int i = 0; i < 64; i++)
        {
            smallSend.AsSpan().Fill(unchecked((byte)(i + 5)));
            largeSend.AsSpan().Fill(unchecked((byte)(i + 11)));

            Task smallReceiveTask = ReceiveExactlyAsync(server, smallReceive);
            await SendExactlyAsync(client, smallSend);
            await smallReceiveTask;
            Assert.Equal(smallSend, smallReceive);

            Task largeReceiveTask = ReceiveExactlyAsync(server, largeReceive);
            await SendExactlyAsync(client, largeSend);
            await largeReceiveTask;
            Assert.Equal(largeSend, largeReceive);
        }
    }

    /// <summary>
    /// Under memory pressure, registration may fall back; whichever way it resolved, the matching
    /// telemetry counter must reflect it and data must still round-trip.
    /// </summary>
    private static async Task RunProvidedBufferRegistrationMemoryPressureScenarioAsync()
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
        if (!snapshot.IsUsable)
        {
            return;
        }

        int payloadSize = Math.Min(snapshot.BufferSize, 16 * 1024);
        payloadSize = Math.Max(payloadSize, 1024);
        byte[] payload = new byte[payloadSize];
        byte[] received = new byte[payloadSize];
        for (int i = 0; i < payload.Length; i++)
        {
            payload[i] = unchecked((byte)(i + 41));
        }

        Task receiveTask = ReceiveExactlyAsync(server, received);
        await SendExactlyAsync(client, payload);
        await receiveTask;
        Assert.Equal(payload, received);

        ulong successCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialSuccess");
        ulong failureCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialFailure");
        if (snapshot.HasRegisteredBuffers)
        {
            Assert.True(successCount > 0, "Expected successful registration telemetry when buffers are registered.");
        }
        else
        {
            Assert.True(failureCount > 0, "Expected failure telemetry when registration falls back under pressure.");
        }
    }

    /// <summary>
    /// Inspects the IL of the engine teardown method to assert provided buffers are released
    /// before the rings are unmapped/closed (ordering contract, checked via call-site offsets).
    /// </summary>
    private static Task RunProvidedBufferTeardownOrderingContractScenarioAsync()
    {
        Type engineType = typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!;
        MethodInfo teardownMethod = engineType.GetMethod("LinuxFreeIoUringResources", BindingFlags.NonPublic | BindingFlags.Instance)!;
        MethodInfo freeProvidedBufferRingMethod = engineType.GetMethod("FreeIoUringProvidedBufferRing", BindingFlags.NonPublic | BindingFlags.Instance)!;
        MethodInfo cleanupManagedRingsMethod = engineType.GetMethod("CleanupManagedRings", BindingFlags.NonPublic | BindingFlags.Instance)!;

        byte[] ilBytes = teardownMethod.GetMethodBody()?.GetILAsByteArray() ?? Array.Empty<byte>();
        Assert.NotEmpty(ilBytes);
        ReadOnlySpan<byte> il = ilBytes;

        int freeProvidedBufferRingOffset = FindCallInstructionOffset(il, freeProvidedBufferRingMethod.MetadataToken);
        int cleanupManagedRingsOffset = FindCallInstructionOffset(il, cleanupManagedRingsMethod.MetadataToken);

        Assert.True(freeProvidedBufferRingOffset >= 0, "Expected teardown method to call FreeIoUringProvidedBufferRing.");
        Assert.True(cleanupManagedRingsOffset >= 0, "Expected teardown method to call CleanupManagedRings.");
        Assert.True(
            freeProvidedBufferRingOffset < cleanupManagedRingsOffset,
            "Expected teardown to unregister/dispose provided buffers before ring unmap/close.");

        return Task.CompletedTask;
    }

    /// <summary>
    /// Verifies the zero-copy send enabled flag matches expectation when the kernel supports SEND_ZC.
    /// </summary>
    private static async Task RunZeroCopySendStateScenarioAsync(bool expectedEnabledWhenSupported)
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        byte[] sendBuffer = new byte[64];
        byte[] receiveBuffer = new byte[sendBuffer.Length];
        Assert.Equal(sendBuffer.Length, await client.SendAsync(sendBuffer, SocketFlags.None));
        await ReceiveExactlyAsync(server, receiveBuffer);

        IoUringZeroCopySendSnapshot snapshot = GetIoUringZeroCopySendSnapshot();
        if (!snapshot.HasIoUringPort)
        {
            return;
        }

        if (!snapshot.SupportsSendZc)
        {
            Assert.False(snapshot.ZeroCopySendEnabled);
            return;
        }

        Assert.Equal(expectedEnabledWhenSupported, snapshot.ZeroCopySendEnabled);
    }

    /// <summary>
    /// Verifies the fixed-receive enabled flag matches the expected configuration state.
    /// </summary>
    private static async Task RunFixedRecvStateScenarioAsync(bool expectedEnabled)
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        byte[] sendBuffer = new byte[64];
        byte[] receiveBuffer = new byte[sendBuffer.Length];
        Assert.Equal(sendBuffer.Length, await client.SendAsync(sendBuffer, SocketFlags.None));
        await ReceiveExactlyAsync(server, receiveBuffer);

        IoUringFixedRecvSnapshot snapshot = GetIoUringFixedRecvSnapshot();
        if (!snapshot.HasIoUringPort)
        {
            return;
        }

        Assert.Equal(expectedEnabled, snapshot.FixedRecvEnabled);
    }

    /// <summary>
    /// Fixed receive must be active exactly when READ_FIXED is supported and buffers are registered.
    /// </summary>
    private static async Task RunFixedRecvActivationFollowsRuntimeCapabilitiesScenarioAsync()
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        byte[] sendBuffer = new byte[64];
        byte[] receiveBuffer = new byte[sendBuffer.Length];
        Assert.Equal(sendBuffer.Length, await client.SendAsync(sendBuffer, SocketFlags.None));
        await ReceiveExactlyAsync(server, receiveBuffer);

        IoUringFixedRecvSnapshot snapshot = GetIoUringFixedRecvSnapshot();
        if (!snapshot.HasIoUringPort)
        {
            return;
        }

        Assert.Equal(snapshot.SupportsReadFixed && snapshot.HasRegisteredBuffers, snapshot.FixedRecvEnabled);
    }

    /// <summary>
    /// Validates a large payload round-trips intact while the fixed-receive path is active.
    /// </summary>
    private static async Task RunFixedRecvDataCorrectnessScenarioAsync()
    {
        IoUringFixedRecvSnapshot snapshot = GetIoUringFixedRecvSnapshot();
        if (!snapshot.HasIoUringPort || !snapshot.FixedRecvEnabled || !snapshot.SupportsReadFixed || !snapshot.HasRegisteredBuffers)
        {
            return;
        }

        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket listener = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;
        _ = listener;

        byte[] payload = new byte[32 * 1024];
        for (int i = 0; i < payload.Length; i++)
        {
            payload[i] = unchecked((byte)(i * 13));
        }

        byte[] received = new byte[payload.Length];
        Task receiveTask = ReceiveExactlyAsync(server, received);
        Assert.Equal(payload.Length, await client.SendAsync(payload, SocketFlags.None));
        await receiveTask;
        Assert.Equal(payload, received);
    }

    /// <summary>Basic TCP round trips succeed with SQPOLL requested/active.</summary>
    private static async Task RunSqPollBasicSendReceiveScenarioAsync()
    {
        EnableSqPollAppContextOptIn();
        await RunTcpRoundTripAsync(8);

        IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
        if (!snapshot.IsActive)
        {
            return;
        }

        await RunTcpRoundTripAsync(16);
    }

    /// <summary>SQPOLL opt-in must never break socket operations even when it is not granted.</summary>
    private static async Task RunSqPollRequestedScenarioAsync()
    {
        EnableSqPollAppContextOptIn();
        await RunTcpRoundTripAsync(8);

        IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
        // Some Helix legs can run without an active io_uring port (kernel/config/runtime gating).
        // In that case this SQPOLL-request scenario is not applicable.
        if (!snapshot.HasIoUringPort)
        {
            return;
        }

        if (!snapshot.SqPollEnabled)
        {
            // SQPOLL wasn't active on this leg, but socket operations must continue to succeed.
            await RunTcpRoundTripAsync(16);
        }
    }

    /// <summary>
    /// After the kernel SQPOLL thread idles (SQ_NEED_WAKEUP set), the next submission must wake it
    /// and increment the wakeup telemetry counter.
    /// </summary>
    private static async Task RunSqPollWakeupAfterIdleScenarioAsync()
    {
        EnableSqPollAppContextOptIn();
        await RunTcpRoundTripAsync(4);

        IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
        if (!snapshot.IsActive)
        {
            return;
        }

        ulong wakeupsBefore = GetIoUringTelemetryCounterValue("_ioUringSqPollWakeups");

        // Let the kernel SQPOLL thread go idle and set SQ_NEED_WAKEUP.
        bool observedNeedWakeup = false;
        for (int i = 0; i < 25; i++)
        {
            await Task.Delay(100);
            if (IsAnyIoUringSqPollEngineNeedingWakeup())
            {
                observedNeedWakeup = true;
                break;
            }
        }

        if (!observedNeedWakeup)
        {
            return;
        }

        await RunTcpRoundTripAsync(2);

        ulong wakeupsAfter = GetIoUringTelemetryCounterValue("_ioUringSqPollWakeups");
        Assert.True(
            wakeupsAfter > wakeupsBefore,
            $"Expected SQPOLL wakeups to increase after idle wake path. before={wakeupsBefore}, after={wakeupsAfter}");
    }

    /// <summary>Multishot receive works while SQPOLL is active.</summary>
    private static async Task RunSqPollMultishotRecvScenarioAsync()
    {
        EnableSqPollAppContextOptIn();
        await RunTcpRoundTripAsync(4);

        IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
        if (!snapshot.IsActive)
        {
            return;
        }

        await RunMultishotRecvBasicScenarioAsync(iterations: 32);
    }

    /// <summary>Zero-copy send works while SQPOLL is active.</summary>
    private static async Task RunSqPollZeroCopySendScenarioAsync()
    {
        EnableSqPollAppContextOptIn();
        await RunTcpRoundTripAsync(4);

        IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
        if (!snapshot.IsActive)
        {
            return;
        }

        await RunZeroCopySendLargeBufferRoundTripScenarioAsync();
    }

    /// <summary>
    /// SQPOLL telemetry: submissions-skipped counter grows under a burst; wakeup counter is
    /// readable and non-decreasing after an idle period.
    /// </summary>
    private static async Task RunSqPollTelemetryCountersScenarioAsync()
    {
        EnableSqPollAppContextOptIn();
        await RunTcpRoundTripAsync(4);

        IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
        if (!snapshot.IsActive)
        {
            return;
        }

        ulong skippedBefore = GetIoUringTelemetryCounterValue("_ioUringSqPollSubmissionsSkipped");
        ulong wakeupsBefore = GetIoUringTelemetryCounterValue("_ioUringSqPollWakeups");

        await RunTcpRoundTripAsync(32);
        ulong skippedAfterBurst = GetIoUringTelemetryCounterValue("_ioUringSqPollSubmissionsSkipped");
        Assert.True(
            skippedAfterBurst > skippedBefore,
            $"Expected SQPOLL submission-skipped counter to increase. before={skippedBefore}, after={skippedAfterBurst}");

        await Task.Delay(1500);
        await RunTcpRoundTripAsync(2);

        ulong wakeupsAfter = GetIoUringTelemetryCounterValue("_ioUringSqPollWakeups");
        Assert.True(
            wakeupsAfter >= wakeupsBefore,
            $"Expected SQPOLL wakeup counter to be readable/nondecreasing. before={wakeupsBefore}, after={wakeupsAfter}");
    }

    /// <summary>
    /// The managed SqNeedWakeup view must agree with the raw SQ flags bit for at least one engine.
    /// </summary>
    private static async Task RunSqPollNeedWakeupContractScenarioAsync()
    {
        EnableSqPollAppContextOptIn();
        await RunTcpRoundTripAsync(4);

        IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
        if (!snapshot.IsActive)
        {
            return;
        }

        Assert.True(
            ValidateSqNeedWakeupMatchesRawSqFlagBit(),
            "Expected at least one active SQPOLL io_uring engine for SqNeedWakeup contract validation.");
    }

    /// <summary>True when SEND_ZC is supported and zero-copy send is enabled on this engine.</summary>
    private static bool IsZeroCopySendEnabledAndSupported(out IoUringZeroCopySendSnapshot snapshot)
    {
        snapshot = GetIoUringZeroCopySendSnapshot();
        return snapshot.HasIoUringPort && snapshot.SupportsSendZc && snapshot.ZeroCopySendEnabled;
    }

    /// <summary>True when SENDMSG_ZC is supported and zero-copy send is enabled on this engine.</summary>
    private static bool IsZeroCopySendMessageEnabledAndSupported(out IoUringZeroCopySendSnapshot snapshot)
    {
        snapshot = GetIoUringZeroCopySendSnapshot();
        return snapshot.HasIoUringPort && snapshot.SupportsSendMsgZc && snapshot.ZeroCopySendEnabled;
    }

    /// <summary>A 64 KiB payload round-trips intact through the zero-copy send path.</summary>
    private static async Task RunZeroCopySendLargeBufferRoundTripScenarioAsync()
    {
        if (!IsZeroCopySendEnabledAndSupported(out _))
        {
            return;
        }

        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket listener = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;
        _ = listener;

        byte[] payload = new byte[64 * 1024];
        byte[] received = new byte[payload.Length];
        for (int i = 0; i < payload.Length; i++)
        {
            payload[i] = unchecked((byte)i);
        }

        Task receiveTask = ReceiveExactlyAsync(server, received);
        int sent = await client.SendAsync(payload, SocketFlags.None);
        Assert.Equal(payload.Length, sent);
        await receiveTask;
        Assert.Equal(payload, received);
    }

    /// <summary>
    /// Small sends should bypass zero-copy and use the regular send path; a follow-up round trip
    /// confirms the connection remains usable.
    /// </summary>
    private static async Task RunZeroCopySendSmallBufferUsesRegularSendScenarioAsync()
    {
        if (!IsZeroCopySendEnabledAndSupported(out _))
        {
            return;
        }

        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket listener = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;
        _ = listener;

        // NOTE(review): the original expects the small send to observe a cancel/interrupt
        // condition (AssertCanceledOrInterrupted) — presumably the scenario harness induces it;
        // confirm against the harness before relying on this assertion.
        byte[] smallPayload = new byte[1024];
        Exception? sendException = await Record.ExceptionAsync(async () => await client.SendAsync(smallPayload, SocketFlags.None));
        AssertCanceledOrInterrupted(sendException);

        byte[] verificationPayload = new byte[] { 0x5A };
        byte[] verificationReceive = new byte[1];
        Task<int> verificationReceiveTask = ToTask(server.ReceiveAsync(verificationReceive, SocketFlags.None));
        await Task.Yield();
        Assert.Equal(1, await client.SendAsync(verificationPayload, SocketFlags.None));
        Assert.Equal(1, await verificationReceiveTask);
        Assert.Equal(verificationPayload[0], verificationReceive[0]);
    }

    /// <summary>
    /// Repeated large zero-copy sends must eventually drain all pin holds and pending
    /// notification CQEs (NOTIF completions release the pinned memory).
    /// </summary>
    private static async Task RunZeroCopySendNotifCqeReleasesPinHoldsScenarioAsync()
    {
        if (!IsZeroCopySendEnabledAndSupported(out _))
        {
            return;
        }

        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket listener = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;
        _ = listener;

        byte[] payload = new byte[128 * 1024];
        byte[] received = new byte[payload.Length];
        for (int i = 0; i < payload.Length; i++)
        {
            payload[i] = unchecked((byte)(i + 1));
        }

        const int iterations = 8;
        for (int i = 0; i < iterations; i++)
        {
            Task receiveTask = ReceiveExactlyAsync(server, received);
            int sent = await client.SendAsync(payload, SocketFlags.None);
            Assert.Equal(payload.Length, sent);
            await receiveTask;
            Assert.Equal(payload, received);
        }

        IoUringZeroCopyPinHoldSnapshot releasedSnapshot = await WaitForZeroCopyPinHoldSnapshotAsync(
            static snapshot => !snapshot.HasIoUringPort || (snapshot.ActivePinHolds == 0 && snapshot.PendingNotificationCount == 0));
        if (!releasedSnapshot.HasIoUringPort)
        {
            return;
        }

        Assert.Equal(0, releasedSnapshot.ActivePinHolds);
        Assert.Equal(0, releasedSnapshot.PendingNotificationCount);
    }

    private static async Task
RunZeroCopySendPartialSendResubmissionScenarioAsync() + { + if (!IsZeroCopySendEnabledAndSupported(out _)) + { + return; + } + + await RunLargeSendWithBackpressureAsync(useBufferListSend: false); + } + + private static async Task RunZeroCopySendCompletionPinLifetimeScenarioAsync() + { + if (!IsZeroCopySendEnabledAndSupported(out _)) + { + return; + } + + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket listener = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + _ = listener; + + byte[] payload = new byte[96 * 1024]; + for (int i = 0; i < payload.Length; i++) + { + payload[i] = unchecked((byte)(i + 3)); + } + + using var trackingMemory = new TrackingPinnableMemoryManager(payload); + byte[] received = new byte[payload.Length]; + Task receiveTask = ReceiveExactlyAsync(server, received); + int sent = await client.SendAsync(trackingMemory.Memory, SocketFlags.None); + Assert.Equal(payload.Length, sent); + await receiveTask; + await AssertPinsReleasedAsync(trackingMemory); + Assert.Equal(payload, received); + } + + private static async Task RunZeroCopySendUnsupportedOpcodeFallbackScenarioAsync() + { + Assembly socketsAssembly = typeof(Socket).Assembly; + Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!; + FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!; + PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo supportsSendZcField = engineType.GetField("_supportsOpSendZc", BindingFlags.NonPublic | BindingFlags.Instance)!; + FieldInfo zeroCopySendEnabledField = engineType.GetField("_zeroCopySendEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!; + + var overrides = new List<(object Engine, bool SupportsSendZc, bool ZeroCopyEnabled)>(); + bool hasIoUringPort = false; + foreach (object? 
engine in (Array)enginesField.GetValue(null)!) + { + if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)!) + { + continue; + } + + hasIoUringPort = true; + bool supports = (bool)supportsSendZcField.GetValue(engine)!; + bool enabled = (bool)zeroCopySendEnabledField.GetValue(engine)!; + overrides.Add((engine, supports, enabled)); + supportsSendZcField.SetValue(engine, false); + zeroCopySendEnabledField.SetValue(engine, false); + } + + if (!hasIoUringPort) + { + return; + } + + try + { + IoUringZeroCopySendSnapshot snapshot = GetIoUringZeroCopySendSnapshot(); + Assert.False(snapshot.SupportsSendZc); + Assert.False(snapshot.ZeroCopySendEnabled); + + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + byte[] payload = new byte[64 * 1024]; + byte[] received = new byte[payload.Length]; + Task receiveTask = ReceiveExactlyAsync(server, received); + int sent = await client.SendAsync(payload, SocketFlags.None); + Assert.Equal(payload.Length, sent); + await receiveTask; + Assert.Equal(payload, received); + } + finally + { + foreach ((object engine, bool supports, bool enabled) in overrides) + { + supportsSendZcField.SetValue(engine, supports); + zeroCopySendEnabledField.SetValue(engine, enabled); + } + } + } + + private static async Task RunZeroCopySendBufferListSegmentThresholdScenarioAsync() + { + if (!IsZeroCopySendMessageEnabledAndSupported(out _)) + { + return; + } + + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket listener = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + _ = listener; + + const int segmentCount = 8; + const int segmentSize = 4 * 1024; + int payloadLength = segmentCount * segmentSize; + byte[] payload = new byte[payloadLength]; + for (int i = 0; i < payload.Length; i++) + { + payload[i] = unchecked((byte)(i + 17)); + } + + var sendBuffers = new 
List>(segmentCount); + for (int i = 0; i < segmentCount; i++) + { + sendBuffers.Add(new ArraySegment(payload, i * segmentSize, segmentSize)); + } + + byte[] received = new byte[payload.Length]; + Task receiveTask = ReceiveExactlyAsync(server, received); + int sent = await client.SendAsync(sendBuffers, SocketFlags.None); + Assert.Equal(payload.Length, sent); + await receiveTask; + Assert.Equal(payload, received); + } + + private static async Task RunZeroCopySendToAboveThresholdScenarioAsync() + { + if (!IsZeroCopySendMessageEnabledAndSupported(out _)) + { + return; + } + + using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + + using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + sender.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + + byte[] payload = new byte[20 * 1024]; + for (int i = 0; i < payload.Length; i++) + { + payload[i] = unchecked((byte)(i + 23)); + } + + byte[] receiveBuffer = new byte[payload.Length]; + Task receiveTask = + ToTask(receiver.ReceiveFromAsync(receiveBuffer, SocketFlags.None, new IPEndPoint(IPAddress.Any, 0))); + await Task.Yield(); + + int sent = await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!); + Assert.Equal(payload.Length, sent); + + SocketReceiveFromResult receiveResult = await receiveTask; + Assert.Equal(payload.Length, receiveResult.ReceivedBytes); + Assert.Equal(payload, receiveBuffer); + Assert.Equal(sender.LocalEndPoint, receiveResult.RemoteEndPoint); + } + + private static async Task RunMultishotRecvBasicScenarioAsync(int iterations) + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + if (!IsIoUringMultishotRecvSupported()) + { + return; + } + + ulong reuseBefore = 
GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvReuse"); + ulong asyncCancelBefore = GetIoUringTelemetryCounterValue("_ioUringAsyncCancelRequestCqes"); + byte[] receiveBuffer = new byte[1]; + byte[] payload = new byte[1]; + for (int i = 0; i < iterations; i++) + { + Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + + payload[0] = unchecked((byte)(i + 1)); + Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None)); + Assert.Equal(1, await receiveTask); + Assert.Equal(payload[0], receiveBuffer[0]); + } + + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: true), + "Expected persistent multishot recv to remain armed after repeated ReceiveAsync calls."); + Assert.True( + GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvReuse") > reuseBefore, + "Expected ReceiveAsync calls to reuse an armed multishot recv (TryReplace path)."); + Assert.Equal( + asyncCancelBefore, + GetIoUringTelemetryCounterValue("_ioUringAsyncCancelRequestCqes")); + } + + private static async Task RunMultishotRecvCancellationScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket listener = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + _ = listener; + _ = client; + + if (!IsIoUringMultishotRecvSupported()) + { + return; + } + + ulong terminationBefore = GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination"); + byte[] receiveBuffer = new byte[16]; + using var cts = new CancellationTokenSource(); + Task pendingReceive = ToTask(server.ReceiveAsync(receiveBuffer.AsMemory(), SocketFlags.None, cts.Token)); + await Task.Yield(); + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: true), + "Expected persistent multishot recv to arm before cancellation."); + + cts.Cancel(); + Task completed = await Task.WhenAny(pendingReceive, 
Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(pendingReceive, completed); + Exception? ex = await Record.ExceptionAsync(async () => await pendingReceive); + AssertCanceledOrInterrupted(ex); + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: false), + "Expected persistent multishot recv to disarm after cancellation."); + Assert.True( + GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination") > terminationBefore, + "Expected cancellation to produce a terminal persistent multishot recv completion."); + } + + private static async Task RunMultishotRecvPeerCloseScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + if (!IsIoUringMultishotRecvSupported()) + { + return; + } + + ulong terminationBefore = GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination"); + byte[] receiveBuffer = new byte[8]; + Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + + client.Shutdown(SocketShutdown.Both); + client.Dispose(); + + Task completed = await Task.WhenAny(receiveTask, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(receiveTask, completed); + + Exception? 
ex = await Record.ExceptionAsync(async () => await receiveTask); + if (ex is null) + { + Assert.Equal(0, await receiveTask); + } + else + { + SocketException socketException = Assert.IsType(ex); + Assert.True( + socketException.SocketErrorCode == SocketError.ConnectionReset || + socketException.SocketErrorCode == SocketError.OperationAborted || + socketException.SocketErrorCode == SocketError.Interrupted, + $"Unexpected socket error after multishot peer close: {socketException.SocketErrorCode}"); + } + + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: false), + "Expected persistent multishot recv to disarm after terminal peer-close completion."); + Assert.True( + GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination") > terminationBefore, + "Expected terminal completion to increment persistent multishot recv termination telemetry."); + } + + private static async Task RunPersistentMultishotRecvProvidedBufferExhaustionScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + if (!IsIoUringMultishotRecvSupported()) + { + return; + } + + byte[] receiveBuffer = new byte[1]; + Task armReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await client.SendAsync(new byte[] { 0xC3 }, SocketFlags.None)); + Assert.Equal(1, await armReceive); + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: true), + "Expected persistent multishot recv to arm before forced provided-buffer exhaustion."); + + ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions"); + ulong terminationBefore = GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination"); + + Assert.True(TryForceIoUringProvidedBufferRingExhaustionForTest(out int forcedBufferCount)); + 
Assert.True(forcedBufferCount > 0); + + Task exhaustedReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await client.SendAsync(new byte[] { 0xC4 }, SocketFlags.None)); + Task exhaustedCompleted = await Task.WhenAny(exhaustedReceive, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(exhaustedReceive, exhaustedCompleted); + + Exception? exhaustedException = await Record.ExceptionAsync(async () => await exhaustedReceive); + SocketException exhaustedSocketException = Assert.IsType(exhaustedException); + Assert.Equal(SocketError.NoBufferSpaceAvailable, exhaustedSocketException.SocketErrorCode); + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: false), + "Expected persistent multishot recv to disarm after ENOBUFS terminal completion."); + Assert.True( + GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions") > depletionBefore, + "Expected provided-buffer depletion counter to increase after forced exhaustion."); + Assert.True( + GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination") > terminationBefore, + "Expected persistent multishot recv termination counter to increase after ENOBUFS."); + + Assert.True(TryRecycleForcedIoUringProvidedBufferRingForTest(out int recycledBufferCount)); + Assert.True(recycledBufferCount > 0, "Expected forced checked-out provided buffers to be recycled for recovery."); + + Task recoveredReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await client.SendAsync(new byte[] { 0xC5 }, SocketFlags.None)); + Assert.Equal(1, await recoveredReceive); + Assert.Equal(0xC5, receiveBuffer[0]); + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: true), + "Expected persistent multishot recv to re-arm after provided buffers were recycled."); + } + + private static async Task 
RunPersistentMultishotRecvShapeChangeScenarioAsync() + { + if (!IsIoUringMultishotRecvSupported()) + { + return; + } + + using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + sender.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + receiver.Connect(sender.LocalEndPoint!); + sender.Connect(receiver.LocalEndPoint!); + + byte[] receiveBuffer = new byte[1]; + Task armReceive = ToTask(receiver.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await sender.SendAsync(new byte[] { 0xD1 }, SocketFlags.None)); + Assert.Equal(1, await armReceive); + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(receiver, expectedArmed: true), + "Expected persistent multishot recv to arm before shape-change scenario."); + + ulong terminationBefore = GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination"); + + byte[] receiveFromBuffer = new byte[1]; + Task receiveFromTask = ToTask( + receiver.ReceiveFromAsync(receiveFromBuffer, SocketFlags.None, new IPEndPoint(IPAddress.Any, 0))); + await Task.Yield(); + Assert.Equal(1, await sender.SendAsync(new byte[] { 0xD2 }, SocketFlags.None)); + SocketReceiveFromResult receiveFromResult = await receiveFromTask; + Assert.Equal(1, receiveFromResult.ReceivedBytes); + Assert.Equal(0xD2, receiveFromBuffer[0]); + + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(receiver, expectedArmed: false), + "Expected persistent multishot recv to disarm when receive shape switches to ReceiveFromAsync."); + Assert.True( + GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination") > terminationBefore, + "Expected shape-change cancellation to increment persistent multishot recv terminations."); + + Task rearmReceive = ToTask(receiver.ReceiveAsync(receiveBuffer, 
SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await sender.SendAsync(new byte[] { 0xD3 }, SocketFlags.None)); + Assert.Equal(1, await rearmReceive); + Assert.Equal(0xD3, receiveBuffer[0]); + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(receiver, expectedArmed: true), + "Expected persistent multishot recv to re-arm after shape-change operation completed."); + } + + private static async Task RunPersistentMultishotRecvConcurrentCloseRaceScenarioAsync(int iterations) + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(Math.Max(4, iterations)); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + if (!IsIoUringMultishotRecvSupported()) + { + return; + } + + for (int i = 0; i < iterations; i++) + { + var pair = await AcceptConnectedTcpPairAsync(listener, endpoint); + using Socket client = pair.Client; + using Socket server = pair.Server; + + byte[] armBuffer = new byte[1]; + Task armReceive = ToTask(server.ReceiveAsync(armBuffer, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await client.SendAsync(new byte[] { 0xE1 }, SocketFlags.None)); + Assert.Equal(1, await armReceive); + + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: true), + "Expected persistent multishot recv to arm before concurrent close race."); + + Task pendingReceive = ToTask(server.ReceiveAsync(new byte[1], SocketFlags.None)); + await Task.Yield(); + + _ = Task.Run(() => + { + server.Dispose(); + client.Dispose(); + }); + + Task completed = await Task.WhenAny(pendingReceive, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(pendingReceive, completed); + + Exception? 
ex = await Record.ExceptionAsync(async () => await pendingReceive); + if (ex is SocketException socketException) + { + Assert.True( + socketException.SocketErrorCode == SocketError.ConnectionReset || + socketException.SocketErrorCode == SocketError.OperationAborted || + socketException.SocketErrorCode == SocketError.Interrupted, + $"Unexpected socket error from persistent multishot recv close race: {socketException.SocketErrorCode}"); + } + else if (ex is not ObjectDisposedException and not null) + { + throw ex; + } + } + } + + private static async Task RunNetworkStreamReadAsyncCancellationTokenScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + using var networkStream = new NetworkStream(server, ownsSocket: false); + + byte[] readBuffer = new byte[1]; + using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(15)); + ValueTask readTask = networkStream.ReadAsync(readBuffer, cts.Token); + await Task.Yield(); + + Assert.Equal(1, await client.SendAsync(new byte[] { 0xF1 }, SocketFlags.None)); + Assert.Equal(1, await readTask); + Assert.Equal(0xF1, readBuffer[0]); + } + + private static async Task RunReceiveAsyncSocketAsyncEventArgsBufferListScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + byte[] receiveBuffer = new byte[1]; + using var receiveEventArgs = new SocketAsyncEventArgs + { + BufferList = new List> + { + new ArraySegment(receiveBuffer) + } + }; + + Task receiveTask = StartSocketAsyncEventArgsOperation( + server, + receiveEventArgs, + static (s, args) => s.ReceiveAsync(args)); + await Task.Yield(); + + Assert.Equal(1, await client.SendAsync(new byte[] { 0xF2 }, SocketFlags.None)); + SocketAsyncEventArgs completedReceive = await receiveTask; + Assert.Equal(SocketError.Success, 
completedReceive.SocketError); + Assert.Equal(1, completedReceive.BytesTransferred); + Assert.Equal(0xF2, receiveBuffer[0]); + Assert.False( + IsPersistentMultishotRecvArmed(server), + "SAEA BufferList receive path should not arm persistent multishot recv state."); + } + + private static async Task RunMultishotAcceptBasicScenarioAsync(int connectionCount) + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(connectionCount); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + if (!IsIoUringMultishotAcceptSupported()) + { + return; + } + + Task firstAcceptTask = listener.AcceptAsync(); + Assert.True( + await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: true), + "Multishot accept was not armed while first accept was pending."); + + using (Socket firstClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp)) + { + await firstClient.ConnectAsync(endpoint); + using Socket firstServer = await AwaitWithTimeoutAsync(firstAcceptTask, "first multishot accept"); + await AssertConnectedPairRoundTripAsync(firstClient, firstServer, 0x41); + } + + for (int i = 1; i < connectionCount; i++) + { + (Socket clientSocket, Socket serverSocket) = await AcceptConnectedTcpPairAsync(listener, endpoint); + using Socket client = clientSocket; + using Socket server = serverSocket; + await AssertConnectedPairRoundTripAsync(client, server, unchecked((byte)(0x41 + i))); + } + } + + private static async Task RunMultishotAcceptPrequeueScenarioAsync(int prequeuedConnectionCount) + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(prequeuedConnectionCount + 2); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + if (!IsIoUringMultishotAcceptSupported()) + { + return; + } + + // 
Arm multishot accept once, then connect a burst of clients before issuing + // subsequent AcceptAsync calls to create a pre-queue opportunity. + Task armingAcceptTask = listener.AcceptAsync(); + Assert.True( + await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: true), + "Multishot accept was not armed while arming accept was pending."); + + var connectedClients = new List(prequeuedConnectionCount + 1); + try + { + var connectTasks = new List(prequeuedConnectionCount + 1); + for (int i = 0; i < prequeuedConnectionCount + 1; i++) + { + var client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + connectedClients.Add(client); + connectTasks.Add(client.ConnectAsync(endpoint)); + } + + await Task.WhenAll(connectTasks); + using Socket armingServer = await AwaitWithTimeoutAsync(armingAcceptTask, "arming multishot accept"); + + DateTime deadline = DateTime.UtcNow + TimeSpan.FromSeconds(5); + int queueCount = 0; + while (DateTime.UtcNow < deadline) + { + queueCount = GetListenerMultishotAcceptQueueCount(listener); + if (queueCount > 0) + { + break; + } + + await Task.Delay(25); + } + + Assert.True(queueCount > 0, "Expected at least one pre-accepted connection to be queued."); + + for (int i = 0; i < prequeuedConnectionCount; i++) + { + using Socket _ = await AwaitWithTimeoutAsync(listener.AcceptAsync(), "prequeued accept completion"); + } + } + finally + { + foreach (Socket client in connectedClients) + { + client.Dispose(); + } + } + } + + private static async Task RunMultishotAcceptListenerCloseScenarioAsync() + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(4); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + if (!IsIoUringMultishotAcceptSupported()) + { + return; + } + + Task firstAcceptTask = listener.AcceptAsync(); + Assert.True( + await 
WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: true), + "Multishot accept was not armed while first accept was pending."); + + using (Socket firstClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp)) + { + await firstClient.ConnectAsync(endpoint); + using Socket firstServer = await AwaitWithTimeoutAsync(firstAcceptTask, "first accept before listener close"); + await AssertConnectedPairRoundTripAsync(firstClient, firstServer, 0x71); + } + + Task pendingAccept = listener.AcceptAsync(); + await Task.Yield(); + listener.Dispose(); + + Task completed = await Task.WhenAny(pendingAccept, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(pendingAccept, completed); + + Exception? acceptException = await Record.ExceptionAsync(async () => await pendingAccept); + Assert.NotNull(acceptException); + Assert.True( + acceptException is ObjectDisposedException || + acceptException is SocketException, + $"Unexpected pending-accept exception after listener close: {acceptException}"); + + Assert.Equal(0, GetListenerMultishotAcceptQueueCount(listener)); + Assert.False(IsListenerMultishotAcceptArmed(listener)); + } + + private static async Task RunMultishotAcceptDisposeDuringArmingRaceScenarioAsync(int iterations) + { + if (!IsIoUringMultishotAcceptSupported()) + { + return; + } + + for (int i = 0; i < iterations; i++) + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(1); + + Task pendingAccept = listener.AcceptAsync(); + Task disposeTask = Task.Run(listener.Dispose); + + Task completed = await Task.WhenAny(pendingAccept, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(pendingAccept, completed); + await disposeTask; + + Exception? 
acceptException = await Record.ExceptionAsync(async () => await pendingAccept); + Assert.NotNull(acceptException); + Assert.True( + acceptException is ObjectDisposedException || acceptException is SocketException, + $"Unexpected accept exception during dispose/arm race at iteration {i}: {acceptException}"); + } + } + + private static async Task RunMultishotAcceptUnavailableOneShotScenarioAsync() + { + if (IsIoUringMultishotAcceptSupported()) + { + return; + } + + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(2); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + Task acceptTask = listener.AcceptAsync(); + await Task.Yield(); + Assert.False(IsListenerMultishotAcceptArmed(listener), "Listener should remain in one-shot accept mode."); + + using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + await client.ConnectAsync(endpoint); + using Socket server = await AwaitWithTimeoutAsync(acceptTask, "one-shot accept fallback"); + await AssertConnectedPairRoundTripAsync(client, server, 0x7A); + } + + private static async Task RunMultishotAcceptRearmAfterTerminalCqeScenarioAsync() + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(4); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + if (!IsIoUringMultishotAcceptSupported()) + { + return; + } + + Task firstAcceptTask = listener.AcceptAsync(); + Assert.True( + await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: true), + "Multishot accept was not armed before forced terminal CQE."); + + using (Socket firstClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp)) + { + await firstClient.ConnectAsync(endpoint); + Exception? 
firstAcceptException = await Record.ExceptionAsync(async () => await firstAcceptTask); + Assert.NotNull(firstAcceptException); + Assert.True( + firstAcceptException is SocketException || + firstAcceptException is ObjectDisposedException, + $"Unexpected forced-accept exception type: {firstAcceptException}"); + } + + Assert.True( + await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: false), + "Expected multishot accept to disarm after terminal CQE."); + + Task secondAcceptTask = listener.AcceptAsync(); + Assert.True( + await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: true), + "Expected multishot accept to re-arm on subsequent accept."); + + using Socket secondClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + await secondClient.ConnectAsync(endpoint); + using Socket secondServer = await AwaitWithTimeoutAsync(secondAcceptTask, "re-armed multishot accept"); + await AssertConnectedPairRoundTripAsync(secondClient, secondServer, 0x33); + } + + private static async Task RunMultishotAcceptHighConnectionRateScenarioAsync(int connectionCount) + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(connectionCount); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + if (!IsIoUringMultishotAcceptSupported()) + { + return; + } + + var acceptTasks = new Task[connectionCount]; + var clients = new Socket?[connectionCount]; + var connectTasks = new Task[connectionCount]; + + for (int i = 0; i < connectionCount; i++) + { + acceptTasks[i] = listener.AcceptAsync(); + } + + try + { + for (int i = 0; i < connectionCount; i++) + { + clients[i] = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + connectTasks[i] = clients[i].ConnectAsync(endpoint); + } + + await Task.WhenAll(connectTasks); + Socket[] servers = await Task.WhenAll(acceptTasks); + + try + 
{ + var verificationTasks = new List(connectionCount); + for (int i = 0; i < connectionCount; i++) + { + Socket client = Assert.IsType(clients[i]); + Socket server = servers[i]; + byte marker = unchecked((byte)i); + verificationTasks.Add(AssertConnectedPairRoundTripAsync(client, server, marker)); + } + + await Task.WhenAll(verificationTasks); + } + finally + { + foreach (Socket server in servers) + { + server.Dispose(); + } + } + } + finally + { + foreach (Socket? client in clients) + { + client?.Dispose(); + } + } + } + + private static async Task RunLargeSendWithBackpressureAsync(bool useBufferListSend) + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + client.SendBufferSize = 1024; + server.ReceiveBufferSize = 1024; + + const int PayloadLength = 2 * 1024 * 1024; + byte[] payload = new byte[PayloadLength]; + for (int i = 0; i < payload.Length; i++) + { + payload[i] = (byte)i; + } + + Task sendTask; + if (useBufferListSend) + { + const int SegmentSize = 1024; + var sendBuffers = new List>(); + for (int offset = 0; offset < payload.Length; offset += SegmentSize) + { + int count = Math.Min(SegmentSize, payload.Length - offset); + sendBuffers.Add(new ArraySegment(payload, offset, count)); + } + + sendTask = ToTask(client.SendAsync(sendBuffers, SocketFlags.None)); + } + else + { + sendTask = ToTask(client.SendAsync(payload, SocketFlags.None)); + } + + await Task.Delay(20); + + byte[] received = new byte[payload.Length]; + int totalReceived = 0; + while (totalReceived < payload.Length) + { + int receivedNow = await ToTask(server.ReceiveAsync(received.AsMemory(totalReceived), SocketFlags.None)); + Assert.True(receivedNow > 0); + totalReceived += receivedNow; + if ((totalReceived & 0x3FFF) == 0) + { + await Task.Delay(1); + } + } + + Assert.Equal(payload.Length, await sendTask); + Assert.Equal(payload.Length, totalReceived); + Assert.Equal(payload, 
received); + } + + private static async Task RunAsyncCancelRequestIsolationScenarioAsync(int iterations) + { + await WithIoUringNativeDiagnosticsSnapshotDeltaAsync( + async () => + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(2); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + var cancelPair = await AcceptConnectedTcpPairAsync(listener, endpoint); + using Socket cancelClient = cancelPair.Client; + using Socket cancelServer = cancelPair.Server; + + var activePair = await AcceptConnectedTcpPairAsync(listener, endpoint); + using Socket activeClient = activePair.Client; + using Socket activeServer = activePair.Server; + + byte[] cancelBuffer = new byte[1]; + byte[] activeBuffer = new byte[1]; + for (int i = 0; i < iterations; i++) + { + using var cts = new CancellationTokenSource(); + Task canceledReceive = ToTask(cancelServer.ReceiveAsync(cancelBuffer, SocketFlags.None, cts.Token)); + Task activeReceive = ToTask(activeServer.ReceiveAsync(activeBuffer, SocketFlags.None)); + await Task.Yield(); + + cts.Cancel(); + byte expected = unchecked((byte)(i + 1)); + Assert.Equal(1, await activeClient.SendAsync(new byte[] { expected }, SocketFlags.None)); + + Assert.Equal(1, await activeReceive); + Assert.Equal(expected, activeBuffer[0]); + + Exception? 
cancelException = await Record.ExceptionAsync(async () => await canceledReceive); + AssertCanceledOrInterrupted(cancelException); + } + }, + (diagnosticsBefore, diagnosticsAfter) => + { + ulong asyncCancelRequestCqeDelta = CounterDelta( + diagnosticsBefore.AsyncCancelRequestCqeCount, + diagnosticsAfter.AsyncCancelRequestCqeCount); + if (asyncCancelRequestCqeDelta == 0) + { + return; + } + + ulong asyncCancelRequestCqeEnoentDelta = CounterDelta( + diagnosticsBefore.AsyncCancelRequestCqeEnoentCount, + diagnosticsAfter.AsyncCancelRequestCqeEnoentCount); + ulong asyncCancelRequestCqeEalreadyDelta = CounterDelta( + diagnosticsBefore.AsyncCancelRequestCqeEalreadyCount, + diagnosticsAfter.AsyncCancelRequestCqeEalreadyCount); + ulong asyncCancelRequestCqeOtherDelta = CounterDelta( + diagnosticsBefore.AsyncCancelRequestCqeOtherCount, + diagnosticsAfter.AsyncCancelRequestCqeOtherCount); + + Assert.True( + asyncCancelRequestCqeEnoentDelta + asyncCancelRequestCqeEalreadyDelta + asyncCancelRequestCqeOtherDelta <= asyncCancelRequestCqeDelta, + $"Unexpected async-cancel accounting for isolation scenario: enoent_delta={asyncCancelRequestCqeEnoentDelta}, ealready_delta={asyncCancelRequestCqeEalreadyDelta}, other_delta={asyncCancelRequestCqeOtherDelta}, total_delta={asyncCancelRequestCqeDelta}"); + }, + settleDelayMilliseconds: 200); + } + + private static async Task RunReceiveMessageFromCancellationAndDisposeScenariosAsync() + { + using Socket cancelReceiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + cancelReceiver.SetSocketOption(SocketOptionLevel.IP, SocketOptionName.PacketInformation, true); + cancelReceiver.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + + EndPoint cancelRemoteEndPoint = new IPEndPoint(IPAddress.Any, 0); + using var cts = new CancellationTokenSource(); + Task canceledReceive = ToTask( + cancelReceiver.ReceiveMessageFromAsync(new byte[64], SocketFlags.None, cancelRemoteEndPoint, cts.Token)); + await Task.Yield(); + 
cts.Cancel(); + + Task cancelCompleted = await Task.WhenAny(canceledReceive, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(canceledReceive, cancelCompleted); + Exception? cancelException = await Record.ExceptionAsync(async () => await canceledReceive); + AssertCanceledOrInterrupted(cancelException); + + using Socket disposeReceiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + disposeReceiver.SetSocketOption(SocketOptionLevel.IP, SocketOptionName.PacketInformation, true); + disposeReceiver.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + + byte[] receiveBuffer = new byte[32]; + using var receiveEventArgs = new SocketAsyncEventArgs + { + BufferList = new List> + { + new ArraySegment(receiveBuffer, 0, 16), + new ArraySegment(receiveBuffer, 16, 16) + }, + RemoteEndPoint = new IPEndPoint(IPAddress.Any, 0) + }; + + Task pendingReceive = StartReceiveMessageFromAsync(disposeReceiver, receiveEventArgs); + await Task.Yield(); + disposeReceiver.Dispose(); + + Task disposeCompleted = await Task.WhenAny(pendingReceive, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(pendingReceive, disposeCompleted); + SocketAsyncEventArgs completedArgs = await pendingReceive; + Assert.True( + completedArgs.SocketError == SocketError.OperationAborted || + completedArgs.SocketError == SocketError.Interrupted); + } + + private static async Task RunReceiveMessageFromCancelThenReceiveScenarioAsync() + { + using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + receiver.SetSocketOption(SocketOptionLevel.IP, SocketOptionName.PacketInformation, true); + receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + + using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + sender.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + + EndPoint initialRemoteEndPoint = new IPEndPoint(IPAddress.Any, 0); + using var cts = new CancellationTokenSource(); + Task canceledReceive = ToTask( + 
receiver.ReceiveMessageFromAsync(new byte[64], SocketFlags.None, initialRemoteEndPoint, cts.Token)); + await Task.Yield(); + cts.Cancel(); + + Task canceledCompleted = await Task.WhenAny(canceledReceive, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(canceledReceive, canceledCompleted); + Exception? cancelException = await Record.ExceptionAsync(async () => await canceledReceive); + AssertCanceledOrInterrupted(cancelException); + + byte[] payload = new byte[] { 0x10, 0x20, 0x30, 0x40 }; + Assert.Equal( + payload.Length, + await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!)); + + byte[] receiveBuffer = new byte[64]; + EndPoint remoteEndPoint = new IPEndPoint(IPAddress.Any, 0); + SocketReceiveMessageFromResult received = await ToTask( + receiver.ReceiveMessageFromAsync(receiveBuffer, SocketFlags.None, remoteEndPoint, CancellationToken.None)); + + Assert.Equal(payload.Length, received.ReceivedBytes); + Assert.True(payload.AsSpan().SequenceEqual(receiveBuffer.AsSpan(0, payload.Length))); + } + + private static async Task RunReceiveMessageFromCancellationAndDisposeScenariosWithGcPressureAsync(int iterations) + { + for (int i = 0; i < iterations; i++) + { + await RunReceiveMessageFromCancellationAndDisposeScenariosAsync(); + if ((i & 0x3) == 0) + { + GC.Collect(); + GC.WaitForPendingFinalizers(); + GC.Collect(); + } + } + } + + private static async Task RunTeardownDrainTrackedOperationsScenarioAsync(int iterations) + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(8); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + for (int i = 0; i < iterations; i++) + { + var pair = await AcceptConnectedTcpPairAsync(listener, endpoint); + using Socket client = pair.Client; + using Socket server = pair.Server; + + Task pendingReceive = ToTask(server.ReceiveAsync(new byte[1], SocketFlags.None)); + await 
Task.Yield(); + + client.Dispose(); + server.Dispose(); + + Task completed = await Task.WhenAny(pendingReceive, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(pendingReceive, completed); + Exception? receiveException = await Record.ExceptionAsync(async () => await pendingReceive); + AssertCanceledDisposedOrInterrupted(receiveException); + } + } + + private static async Task RunTeardownCancellationDuplicateGuardScenarioAsync(int iterations) + { + await WithIoUringNativeDiagnosticsSnapshotDeltaAsync( + async () => + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(8); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + for (int i = 0; i < iterations; i++) + { + var pair = await AcceptConnectedTcpPairAsync(listener, endpoint); + using Socket client = pair.Client; + using Socket server = pair.Server; + + using var cts = new CancellationTokenSource(); + Task pendingReceive = ToTask(server.ReceiveAsync(new byte[1], SocketFlags.None, cts.Token)); + await Task.Yield(); + cts.Cancel(); + + server.Dispose(); + client.Dispose(); + + Task completed = await Task.WhenAny(pendingReceive, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(pendingReceive, completed); + Exception? 
receiveException = await Record.ExceptionAsync(async () => await pendingReceive); + AssertCanceledDisposedOrInterrupted(receiveException); + } + }, + (diagnosticsBefore, diagnosticsAfter) => + { + ulong asyncCancelRequestCqeDelta = CounterDelta( + diagnosticsBefore.AsyncCancelRequestCqeCount, + diagnosticsAfter.AsyncCancelRequestCqeCount); + if (asyncCancelRequestCqeDelta == 0) + { + return; + } + + ulong asyncCancelRequestCqeEnoentDelta = CounterDelta( + diagnosticsBefore.AsyncCancelRequestCqeEnoentCount, + diagnosticsAfter.AsyncCancelRequestCqeEnoentCount); + ulong asyncCancelRequestCqeEalreadyDelta = CounterDelta( + diagnosticsBefore.AsyncCancelRequestCqeEalreadyCount, + diagnosticsAfter.AsyncCancelRequestCqeEalreadyCount); + ulong asyncCancelRequestCqeOtherDelta = CounterDelta( + diagnosticsBefore.AsyncCancelRequestCqeOtherCount, + diagnosticsAfter.AsyncCancelRequestCqeOtherCount); + + // Guardrail: one operation per iteration should not devolve into persistent multi-request cancellation churn. 
+ ulong maxExpectedCancelRequestCqes = (ulong)(iterations + (iterations / 2) + 8); + Assert.True( + asyncCancelRequestCqeDelta <= maxExpectedCancelRequestCqes, + $"Unexpected async-cancel CQE inflation: delta={asyncCancelRequestCqeDelta}, max={maxExpectedCancelRequestCqes}, iterations={iterations}"); + Assert.True( + asyncCancelRequestCqeEnoentDelta + asyncCancelRequestCqeEalreadyDelta + asyncCancelRequestCqeOtherDelta <= asyncCancelRequestCqeDelta, + $"Unexpected async-cancel accounting: enoent_delta={asyncCancelRequestCqeEnoentDelta}, ealready_delta={asyncCancelRequestCqeEalreadyDelta}, other_delta={asyncCancelRequestCqeOtherDelta}, total_delta={asyncCancelRequestCqeDelta}"); + }, + settleDelayMilliseconds: 200); + } + + private static async Task RunCancellationSubmitContentionScenarioAsync(int connectionCount, int cancellationsPerConnection) + { + await WithIoUringNativeDiagnosticsSnapshotDeltaAsync( + async () => + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(connectionCount); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + var clients = new List(connectionCount); + var servers = new List(connectionCount); + try + { + for (int i = 0; i < connectionCount; i++) + { + var pair = await AcceptConnectedTcpPairAsync(listener, endpoint); + clients.Add(pair.Client); + servers.Add(pair.Server); + } + + Task[] churnTasks = new Task[connectionCount]; + for (int index = 0; index < connectionCount; index++) + { + Socket server = servers[index]; + churnTasks[index] = Task.Run(async () => + { + byte[] receiveBuffer = new byte[1]; + for (int i = 0; i < cancellationsPerConnection; i++) + { + using var cts = new CancellationTokenSource(); + Task pendingReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None, cts.Token)); + cts.Cancel(); + + Exception? 
receiveException = await Record.ExceptionAsync(async () => await pendingReceive); + AssertCanceledOrInterrupted(receiveException); + } + }); + } + + await Task.WhenAll(churnTasks); + + // Ensure the cancellation churn does not stall normal completion progress afterward. + for (int i = 0; i < connectionCount; i++) + { + byte expected = unchecked((byte)(i + 1)); + byte[] receiveBuffer = new byte[1]; + Task receiveTask = ToTask(servers[i].ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + + Assert.Equal(1, await clients[i].SendAsync(new byte[] { expected }, SocketFlags.None)); + Assert.Equal(1, await receiveTask); + Assert.Equal(expected, receiveBuffer[0]); + } + } + finally + { + foreach (Socket server in servers) + { + server.Dispose(); + } + + foreach (Socket client in clients) + { + client.Dispose(); + } + } + }, + (diagnosticsBefore, diagnosticsAfter) => + { + ulong asyncCancelRequestCqeDelta = CounterDelta( + diagnosticsBefore.AsyncCancelRequestCqeCount, + diagnosticsAfter.AsyncCancelRequestCqeCount); + if (asyncCancelRequestCqeDelta == 0) + { + // On kernels without async-cancel opcode support this path may fallback without cancel-request CQEs. 
+ return; + } + + ulong asyncCancelRequestCqeEnoentDelta = CounterDelta( + diagnosticsBefore.AsyncCancelRequestCqeEnoentCount, + diagnosticsAfter.AsyncCancelRequestCqeEnoentCount); + ulong asyncCancelRequestCqeEalreadyDelta = CounterDelta( + diagnosticsBefore.AsyncCancelRequestCqeEalreadyCount, + diagnosticsAfter.AsyncCancelRequestCqeEalreadyCount); + ulong asyncCancelRequestCqeOtherDelta = CounterDelta( + diagnosticsBefore.AsyncCancelRequestCqeOtherCount, + diagnosticsAfter.AsyncCancelRequestCqeOtherCount); + + ulong maxExpectedCancelRequestCqes = (ulong)(connectionCount * cancellationsPerConnection * 2) + 64; + Assert.True( + asyncCancelRequestCqeDelta <= maxExpectedCancelRequestCqes, + $"Unexpected async-cancel request CQE inflation under contention: delta={asyncCancelRequestCqeDelta}, max={maxExpectedCancelRequestCqes}, connections={connectionCount}, cancels_per_connection={cancellationsPerConnection}"); + Assert.True( + asyncCancelRequestCqeEnoentDelta + asyncCancelRequestCqeEalreadyDelta + asyncCancelRequestCqeOtherDelta <= asyncCancelRequestCqeDelta, + $"Unexpected async-cancel accounting under contention: enoent_delta={asyncCancelRequestCqeEnoentDelta}, ealready_delta={asyncCancelRequestCqeEalreadyDelta}, other_delta={asyncCancelRequestCqeOtherDelta}, total_delta={asyncCancelRequestCqeDelta}"); + }, + settleDelayMilliseconds: 200); + } + + private static async Task RunMixedModeReadinessCompletionStressScenarioAsync(int iterations) + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + byte[] completionBuffer = new byte[1]; + byte[] payload = new byte[1]; + + for (int i = 0; i < iterations; i++) + { + Task completionReceive = ToTask(server.ReceiveAsync(completionBuffer, SocketFlags.None)); + Task readinessProbe = ToTask(server.ReceiveAsync(Memory.Empty, SocketFlags.None)); + await Task.Yield(); + + payload[0] = unchecked((byte)(i + 1)); + 
Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None)); + Assert.Equal(1, await completionReceive); + Assert.Equal(payload[0], completionBuffer[0]); + + Task completed = await Task.WhenAny(readinessProbe, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(readinessProbe, completed); + Assert.Equal(0, await readinessProbe); + } + } + + private static async Task RunSameSocketReadinessCompletionBacklogScenarioAsync(int iterations, int completionBatchSize) + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + byte[] sendPayload = new byte[completionBatchSize]; + for (int iteration = 0; iteration < iterations; iteration++) + { + var receiveBuffers = new byte[completionBatchSize][]; + var completionReceives = new Task[completionBatchSize]; + for (int i = 0; i < completionBatchSize; i++) + { + byte expected = unchecked((byte)((iteration + i + 1) & 0xFF)); + sendPayload[i] = expected; + byte[] receiveBuffer = new byte[1]; + receiveBuffers[i] = receiveBuffer; + completionReceives[i] = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + } + + Task readinessProbe = ToTask(server.ReceiveAsync(Memory.Empty, SocketFlags.None)); + await Task.Yield(); + + int sent = 0; + while (sent < sendPayload.Length) + { + sent += await client.SendAsync(sendPayload.AsMemory(sent), SocketFlags.None); + } + + Assert.Equal(sendPayload.Length, sent); + + Task readinessCompleted = await Task.WhenAny(readinessProbe, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(readinessProbe, readinessCompleted); + Assert.Equal(0, await readinessProbe); + + int[] receivedCounts = await Task.WhenAll(completionReceives); + for (int i = 0; i < completionBatchSize; i++) + { + Assert.Equal(1, receivedCounts[i]); + Assert.Equal(sendPayload[i], receiveBuffers[i][0]); + } + } + } + + private static async Task RunPureCompletionScenarioAsync() + { + var trio = await 
CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + byte[] tcpSendPayload = new byte[] { 0x11 }; + byte[] tcpReceiveBuffer = new byte[1]; + + Task tcpReceive = ToTask(server.ReceiveAsync(tcpReceiveBuffer, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await client.SendAsync(tcpSendPayload, SocketFlags.None)); + Assert.Equal(1, await AwaitWithTimeoutAsync(tcpReceive, nameof(tcpReceive))); + Assert.Equal(tcpSendPayload[0], tcpReceiveBuffer[0]); + + Task tcpZeroByteReceive = ToTask(server.ReceiveAsync(Memory.Empty, SocketFlags.None)); + await Task.Yield(); + + byte[] tcpPayloadAfterProbe = new byte[] { 0x22 }; + Assert.Equal(1, await client.SendAsync(tcpPayloadAfterProbe, SocketFlags.None)); + Task completed = await Task.WhenAny(tcpZeroByteReceive, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(tcpZeroByteReceive, completed); + Assert.Equal(0, await tcpZeroByteReceive); + + byte[] tcpDataAfterZeroByte = new byte[1]; + Task tcpTailReceive = ToTask(server.ReceiveAsync(tcpDataAfterZeroByte, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await AwaitWithTimeoutAsync(tcpTailReceive, nameof(tcpTailReceive))); + Assert.Equal(tcpPayloadAfterProbe[0], tcpDataAfterZeroByte[0]); + + using Socket connectListener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + connectListener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + connectListener.Listen(1); + IPEndPoint connectEndPoint = (IPEndPoint)connectListener.LocalEndPoint!; + + using Socket connectClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + Task acceptTask = connectListener.AcceptAsync(); + await connectClient.ConnectAsync(connectEndPoint); + using Socket connectServer = await AwaitWithTimeoutAsync(acceptTask, nameof(acceptTask)); + + byte[] connectPayload = new byte[] { 0x33 }; + Assert.Equal(1, await 
connectClient.SendAsync(connectPayload, SocketFlags.None)); + byte[] connectReceiveBuffer = new byte[1]; + Assert.Equal(1, await connectServer.ReceiveAsync(connectReceiveBuffer, SocketFlags.None)); + Assert.Equal(connectPayload[0], connectReceiveBuffer[0]); + + using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + + using Socket udpSender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + udpSender.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + + byte[] udpPayload = new byte[] { 0x33, 0x44, 0x55 }; + byte[] udpReceiveBuffer = new byte[udpPayload.Length]; + + Task receiveFromTask = + ToTask(receiver.ReceiveFromAsync(udpReceiveBuffer, SocketFlags.None, new IPEndPoint(IPAddress.Any, 0))); + await Task.Yield(); + Assert.Equal(udpPayload.Length, await udpSender.SendToAsync(udpPayload, SocketFlags.None, receiver.LocalEndPoint!)); + + SocketReceiveFromResult receiveFromResult = await receiveFromTask; + Assert.Equal(udpPayload.Length, receiveFromResult.ReceivedBytes); + Assert.Equal(udpPayload, udpReceiveBuffer); + Assert.Equal(udpSender.LocalEndPoint, receiveFromResult.RemoteEndPoint); + } + + private static async Task RunBoundedWaitBufferPressureScenarioAsync(int connectionCount) + { + await WithIoUringNativeDiagnosticsSnapshotDeltaAsync( + async () => + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(connectionCount); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + var clients = new List(connectionCount); + var servers = new List(connectionCount); + var receiveBuffers = new List(connectionCount); + var receiveTasks = new List>(connectionCount); + var sendTasks = new List>(connectionCount); + + try + { + for (int i = 0; i < connectionCount; i++) + { + var pair = await 
AcceptConnectedTcpPairAsync(listener, endpoint); + clients.Add(pair.Client); + servers.Add(pair.Server); + + byte[] receiveBuffer = new byte[1]; + receiveBuffers.Add(receiveBuffer); + receiveTasks.Add(ToTask(pair.Server.ReceiveAsync(receiveBuffer, SocketFlags.None))); + } + + await Task.Yield(); + + for (int i = 0; i < clients.Count; i++) + { + byte payload = unchecked((byte)(i + 1)); + sendTasks.Add(ToTask(clients[i].SendAsync(new byte[] { payload }, SocketFlags.None))); + } + + int[] sentBytes = await Task.WhenAll(sendTasks); + int[] receivedBytes = await Task.WhenAll(receiveTasks); + + for (int i = 0; i < connectionCount; i++) + { + Assert.Equal(1, sentBytes[i]); + Assert.Equal(1, receivedBytes[i]); + Assert.Equal(unchecked((byte)(i + 1)), receiveBuffers[i][0]); + } + } + finally + { + foreach (Socket server in servers) + { + server.Dispose(); + } + + foreach (Socket client in clients) + { + client.Dispose(); + } + } + }, + (diagnosticsBefore, diagnosticsAfter) => + { + ulong socketEventBufferFullDelta = CounterDelta( + diagnosticsBefore.SocketEventBufferFullCount, + diagnosticsAfter.SocketEventBufferFullCount); + + Assert.True( + socketEventBufferFullDelta != 0, + $"Expected io_uring wait-buffer pressure counter to increase. 
socket_delta={socketEventBufferFullDelta}"); + }, + skipScenarioWhenIoUringUnavailable: true); + } + + private static async Task RunPrepareQueueOverflowFallbackScenarioAsync(int connectionCount) + { + ulong overflowBefore = GetIoUringTelemetryCounterValue("_ioUringPrepareQueueOverflows"); + ulong fallbackBefore = GetIoUringTelemetryCounterValue("_ioUringPrepareQueueOverflowFallbacks"); + bool observedOverflow = false; + + for (int round = 0; round < 4; round++) + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(connectionCount); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + var clients = new List(connectionCount); + var servers = new List(connectionCount); + var receiveTasks = new List>(connectionCount); + try + { + for (int i = 0; i < connectionCount; i++) + { + var pair = await AcceptConnectedTcpPairAsync(listener, endpoint); + clients.Add(pair.Client); + servers.Add(pair.Server); + + receiveTasks.Add(ToTask(pair.Server.ReceiveAsync(new byte[1], SocketFlags.None))); + } + + await Task.Yield(); + + for (int i = 0; i < connectionCount; i++) + { + Assert.Equal(1, await clients[i].SendAsync(new byte[] { 0x5A }, SocketFlags.None)); + } + + for (int i = 0; i < receiveTasks.Count; i++) + { + Assert.Equal(1, await AwaitWithTimeoutAsync(receiveTasks[i], $"overflow_receive_{round}_{i}")); + } + } + finally + { + foreach (Socket server in servers) + { + server.Dispose(); + } + + foreach (Socket client in clients) + { + client.Dispose(); + } + } + + ulong overflowAfterRound = GetIoUringTelemetryCounterValue("_ioUringPrepareQueueOverflows"); + ulong fallbackAfterRound = GetIoUringTelemetryCounterValue("_ioUringPrepareQueueOverflowFallbacks"); + if (overflowAfterRound > overflowBefore) + { + observedOverflow = true; + Assert.True( + fallbackAfterRound > fallbackBefore, + $"Expected prepare queue overflow fallback counter to 
increase once overflow is observed. before={fallbackBefore}, after={fallbackAfterRound}"); + return; + } + } + + if (!observedOverflow) + { + // With very fast event-loop draining, queue overflow can be scheduler-dependent even at capacity=1. + // The scenario still validates that completion-mode operations make progress without hangs. + ulong fallbackAfter = GetIoUringTelemetryCounterValue("_ioUringPrepareQueueOverflowFallbacks"); + Assert.True( + fallbackAfter >= fallbackBefore, + $"Prepare queue overflow fallback counter should be nondecreasing. before={fallbackBefore}, after={fallbackAfter}"); + } + } + + private static async Task RunConnectQueueOverflowFallbackScenarioAsync(int connectionCount) + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(connectionCount); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + var clients = new List(connectionCount); + var connectTasks = new List(connectionCount); + var acceptTasks = new List>(connectionCount); + var acceptedSockets = new List(connectionCount); + + try + { + for (int i = 0; i < connectionCount; i++) + { + Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + clients.Add(client); + acceptTasks.Add(listener.AcceptAsync()); + connectTasks.Add(client.ConnectAsync(endpoint)); + } + + Task connectAll = Task.WhenAll(connectTasks); + Task connectCompleted = await Task.WhenAny(connectAll, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(connectAll, connectCompleted); + await connectAll; + + foreach (Task acceptTask in acceptTasks) + { + acceptedSockets.Add(await AwaitWithTimeoutAsync(acceptTask, nameof(RunConnectQueueOverflowFallbackScenarioAsync))); + } + } + finally + { + foreach (Socket acceptedSocket in acceptedSockets) + { + acceptedSocket.Dispose(); + } + + foreach (Socket client in clients) + { + client.Dispose(); + } + } + 
}

/// <summary>
/// Races a pending receive against cancellation for <paramref name="iterations"/> rounds.
/// Each round must end in exactly one of two valid outcomes: the receive completes with
/// the sent byte, or it is canceled/interrupted. The send must always succeed.
/// </summary>
private static async Task RunCompletionCancellationRaceAsync(int iterations)
{
    var trio = await CreateConnectedTcpSocketTrioAsync();
    using Socket _ = trio.Listener;
    using Socket client = trio.Client;
    using Socket server = trio.Server;

    byte[] receiveBuffer = new byte[1];
    int completedCount = 0;
    int canceledCount = 0;
    for (int i = 0; i < iterations; i++)
    {
        // Discard any byte left over from a previous round so each iteration starts
        // from an empty receive queue. Uses the shared helper below instead of
        // duplicating the drain loop inline.
        await DrainAvailableBytesAsync(server);

        using var cts = new CancellationTokenSource();
        Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None, cts.Token));
        Task<int> sendTask;

        if ((i & 1) == 0)
        {
            // Even rounds: cancel before the payload is sent.
            cts.Cancel();
            sendTask = ToTask(client.SendAsync(new byte[] { unchecked((byte)(i + 1)) }, SocketFlags.None));
        }
        else
        {
            // Odd rounds: send first, yield, then cancel — racing completion against cancellation.
            sendTask = ToTask(client.SendAsync(new byte[] { unchecked((byte)(i + 1)) }, SocketFlags.None));
            await Task.Yield();
            cts.Cancel();
        }

        Exception? receiveException = await Record.ExceptionAsync(async () => await receiveTask);
        if (receiveException is null)
        {
            completedCount++;
            // Task already completed; await instead of .Result for idiomatic access.
            Assert.Equal(1, await receiveTask);
        }
        else
        {
            canceledCount++;
            AssertCanceledOrInterrupted(receiveException);
        }

        Assert.Equal(1, await sendTask);
    }

    // The race is only meaningful if both outcomes were actually observed.
    Assert.True(completedCount > 0);
    Assert.True(canceledCount > 0);
}

/// <summary>Reads and discards whatever is currently buffered on <paramref name="socket"/>.</summary>
private static async Task DrainAvailableBytesAsync(Socket socket)
{
    while (socket.Available > 0)
    {
        int bytesToRead = Math.Min(socket.Available, 256);
        byte[] drainBuffer = new byte[bytesToRead];
        int read = await ToTask(socket.ReceiveAsync(drainBuffer, SocketFlags.None));
        if (read <= 0)
        {
            return;
        }
    }
}

/// <summary>
/// Forces the EAGAIN-style path: queues a receive before any data exists, trickles bytes
/// until it completes, then verifies a follow-up receive still works on the same socket.
/// </summary>
private static async Task RunForcedEagainReceiveScenarioAsync()
{
    var trio = await CreateConnectedTcpSocketTrioAsync();
    using Socket _ = trio.Listener;
    using Socket client = trio.Client;
    using Socket server = trio.Server;

    byte[] firstReceiveBuffer = new byte[1];
    Task<int> receiveTask = ToTask(server.ReceiveAsync(firstReceiveBuffer, SocketFlags.None));
    await Task.Yield();

    // Trickle up to 6 bytes with small delays until the pending receive completes.
    byte sendByte = 0x31;
    for (int i = 0; i < 6 && !receiveTask.IsCompleted; i++)
    {
        Assert.Equal(1, await client.SendAsync(new byte[] { sendByte }, SocketFlags.None));
        sendByte++;
        await Task.Delay(10);
    }

    Task completed = await Task.WhenAny(receiveTask, Task.Delay(TimeSpan.FromSeconds(15)));
    Assert.Same(receiveTask, completed);
    Assert.True(await receiveTask > 0);
    await DrainAvailableBytesAsync(server);

    // The socket must remain usable for a fresh queued receive.
    byte[] secondReceiveBuffer = new byte[1];
    Task<int> followUpReceiveTask = ToTask(server.ReceiveAsync(secondReceiveBuffer, SocketFlags.None));
    await Task.Yield();
    Assert.Equal(1, await client.SendAsync(new byte[] { 0x40 }, SocketFlags.None));
    Task followUpCompleted = await Task.WhenAny(followUpReceiveTask, Task.Delay(TimeSpan.FromSeconds(15)));
    Assert.Same(followUpReceiveTask, followUpCompleted);
    Assert.True(await followUpReceiveTask > 0);
}

/// <summary>
/// Forces the ECANCELED-style path (a test hook injects cancellation elsewhere): the first
/// receive may either complete with data or surface cancellation/interruption; either way
/// the socket must remain usable afterwards.
/// </summary>
private static async Task RunForcedEcanceledReceiveScenarioAsync()
{
    var trio = await CreateConnectedTcpSocketTrioAsync();
    using Socket _ = trio.Listener;
    using Socket client = trio.Client;
    using Socket server = trio.Server;

    byte[] receiveBuffer = new byte[1];
    Task<int> forcedReceiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
    await Task.Yield();
    Assert.Equal(1, await client.SendAsync(new byte[] { 0x44 }, SocketFlags.None));

    Task completed = await Task.WhenAny(forcedReceiveTask, Task.Delay(TimeSpan.FromSeconds(15)));
    Assert.Same(forcedReceiveTask, completed);
    Exception? forcedReceiveException = await Record.ExceptionAsync(async () => await forcedReceiveTask);
    if (forcedReceiveException is null)
    {
        // Task already completed; await instead of .Result for idiomatic access.
        Assert.True(await forcedReceiveTask > 0);
    }
    else
    {
        AssertCanceledOrInterrupted(forcedReceiveException);
    }
    await DrainAvailableBytesAsync(server);

    // Follow-up receive proves the forced cancellation did not poison the socket.
    byte[] followUpReceiveBuffer = new byte[1];
    Task<int> followUpReceiveTask = ToTask(server.ReceiveAsync(followUpReceiveBuffer, SocketFlags.None));
    await Task.Yield();
    Assert.Equal(1, await client.SendAsync(new byte[] { 0x45 }, SocketFlags.None));
    Task followUpCompleted = await Task.WhenAny(followUpReceiveTask, Task.Delay(TimeSpan.FromSeconds(15)));
    Assert.Same(followUpReceiveTask, followUpCompleted);
    Assert.True(await followUpReceiveTask > 0);
}

/// <summary>Dispatches to the ECANCELED or EAGAIN forced-receive scenario.</summary>
private static Task RunForcedReceiveScenarioAsync(bool forceEcanceled) =>
    forceEcanceled ? RunForcedEcanceledReceiveScenarioAsync() : RunForcedEagainReceiveScenarioAsync();

[ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
[PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringOptIn_DoesNotBreakAsyncSocketWorkflows() + { + await RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(64), CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task SocketEngine_DefaultOptOut_DoesNotBreakAsyncSocketWorkflows() + { + await RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(32), CreateSocketEngineOptions(ioUringValue: null)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task SocketEngine_KillSwitchZero_DoesNotBreakAsyncSocketWorkflows() + { + await RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(32), CreateSocketEngineOptions(ioUringValue: "0")).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringConfig_AppContextSwitches_HonoredWhenEnvUnset() + { + await RemoteExecutor.Invoke( + static () => + { + AssertBooleanAppContextSwitch( + switchName: "System.Net.Sockets.IoUring.Enable", + methodName: "IsIoUringEnabled", + expectedWhenSwitchTrue: true, + expectedWhenSwitchFalse: false); + AppContext.SetSwitch("System.Net.Sockets.IoUring.EnableSqPoll", true); + Assert.False(InvokeSocketAsyncEngineBoolMethod("IsSqPollRequested")); + AppContext.SetSwitch("System.Net.Sockets.IoUring.EnableSqPoll", false); + Assert.False(InvokeSocketAsyncEngineBoolMethod("IsSqPollRequested")); + }, + CreateSocketEngineOptions(ioUringValue: null)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringConfig_EnvironmentOverridesAppContext() + { + await RemoteExecutor.Invoke( + static () => + { + AppContext.SetSwitch("System.Net.Sockets.IoUring.Enable", true); + Assert.False(InvokeSocketAsyncEngineBoolMethod("IsIoUringEnabled")); + + AppContext.SetSwitch("System.Net.Sockets.IoUring.EnableSqPoll", true); + Assert.True(InvokeSocketAsyncEngineBoolMethod("IsSqPollRequested")); + AppContext.SetSwitch("System.Net.Sockets.IoUring.EnableSqPoll", false); + Assert.False(InvokeSocketAsyncEngineBoolMethod("IsSqPollRequested")); + }, + CreateSocketEngineOptions( + ioUringValue: "0", + sqPollEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringConfig_RemovedProductionKnobs_DefaultEnabled() + { + await RemoteExecutor.Invoke( + static () => + { + Assert.False(InvokeSocketAsyncEngineBoolMethod("IsIoUringDirectSqeDisabled")); + Assert.True(InvokeSocketAsyncEngineBoolMethod("IsZeroCopySendOptedIn")); + Assert.True(InvokeSocketAsyncEngineBoolMethod("IsIoUringRegisterBuffersEnabled")); + }, + CreateSocketEngineOptions(ioUringValue: null)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringOptIn_UdpSendReceive_Works() + { + await RemoteExecutor.Invoke(static async () => + { + using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + IPEndPoint receiverEndpoint = (IPEndPoint)receiver.LocalEndPoint!; + + using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + sender.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + IPEndPoint senderEndpoint = (IPEndPoint)sender.LocalEndPoint!; + sender.Connect(receiverEndpoint); + + byte[] sendBuffer = new byte[] { 7 }; + byte[] receiveBuffer = new byte[1]; + + for (int i = 0; i < 64; i++) + { + int sent = await sender.SendAsync(sendBuffer, SocketFlags.None); + Assert.Equal(1, sent); + + EndPoint remote = new IPEndPoint(IPAddress.Any, 0); + SocketReceiveFromResult receiveFrom = await receiver.ReceiveFromAsync(receiveBuffer, SocketFlags.None, remote); + Assert.Equal(1, receiveFrom.ReceivedBytes); + Assert.Equal(sendBuffer[0], receiveBuffer[0]); + Assert.Equal(senderEndpoint, receiveFrom.RemoteEndPoint); + + int echoed = await receiver.SendToAsync(sendBuffer, SocketFlags.None, receiveFrom.RemoteEndPoint); + Assert.Equal(1, echoed); + + int received = await sender.ReceiveAsync(receiveBuffer, SocketFlags.None); + Assert.Equal(1, received); + Assert.Equal(sendBuffer[0], receiveBuffer[0]); + + unchecked + { + sendBuffer[0]++; + } + } + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringOptIn_MultipleConcurrentConnections_Work() + { + await RemoteExecutor.Invoke(static async () => + { + const int ConnectionCount = 32; + + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(ConnectionCount); + + var acceptTasks = new Task[ConnectionCount]; + var clients = new Socket[ConnectionCount]; + + for (int i = 0; i < ConnectionCount; i++) + { + acceptTasks[i] = listener.AcceptAsync(); + } + + var connectTasks = new Task[ConnectionCount]; + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + for (int i = 0; i < ConnectionCount; i++) + { + clients[i] = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + connectTasks[i] = clients[i].ConnectAsync(endpoint); + } + + await Task.WhenAll(connectTasks); + Socket[] servers = await Task.WhenAll(acceptTasks); + + var roundTripTasks = new List(ConnectionCount); + for (int i = 0; i < ConnectionCount; i++) + { + Socket client = clients[i]; + Socket server = servers[i]; + byte value = (byte)(i + 1); + roundTripTasks.Add(Task.Run(async () => + { + byte[] tx = new byte[] { value }; + byte[] rx = new byte[1]; + + int sent = await client.SendAsync(tx, SocketFlags.None); + Assert.Equal(1, sent); + + int received = await server.ReceiveAsync(rx, SocketFlags.None); + Assert.Equal(1, received); + Assert.Equal(value, rx[0]); + + sent = await server.SendAsync(tx, SocketFlags.None); + Assert.Equal(1, sent); + + received = await client.ReceiveAsync(rx, SocketFlags.None); + Assert.Equal(1, received); + Assert.Equal(value, rx[0]); + })); + } + + await Task.WhenAll(roundTripTasks); + + for (int i = 0; i < ConnectionCount; i++) + { + servers[i].Dispose(); + clients[i].Dispose(); + } + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + 
[PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
public static async Task IoUringOptIn_DisconnectReconnectAndCancellation_Work()
{
    await RemoteExecutor.Invoke(static async () =>
    {
        using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
        listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
        listener.Listen(2);
        IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;

        // First connection lifecycle — block scope ensures disposal before reconnect.
        {
            var firstPair = await AcceptConnectedTcpPairAsync(listener, endpoint);
            using Socket firstClient = firstPair.Client;
            using Socket firstServer = firstPair.Server;
        }

        // Reconnect and validate cancellation + subsequent data flow.
        var secondPair = await AcceptConnectedTcpPairAsync(listener, endpoint);
        using Socket secondClient = secondPair.Client;
        using Socket secondServer = secondPair.Server;

        byte[] receiveBuffer = new byte[1];
        using (var cts = new CancellationTokenSource())
        {
            var pendingReceive = secondServer.ReceiveAsync(receiveBuffer.AsMemory(), SocketFlags.None, cts.Token);
            cts.Cancel();

            Exception? ex = await Record.ExceptionAsync(async () => await pendingReceive);
            Assert.NotNull(ex);
            // Consolidated with the shared helper used by the sibling cancellation tests
            // (accepts OperationCanceledException or SocketError.OperationAborted/Interrupted),
            // instead of duplicating that assertion inline.
            AssertCanceledOrInterrupted(ex);
        }

        // The canceled receive must not poison the connection: a fresh round trip works.
        byte[] sendBuffer = new byte[] { 42 };
        int sent = await secondClient.SendAsync(sendBuffer, SocketFlags.None);
        Assert.Equal(1, sent);

        int received = await secondServer.ReceiveAsync(receiveBuffer, SocketFlags.None);
        Assert.Equal(1, received);
        Assert.Equal(sendBuffer[0], receiveBuffer[0]);
    }, CreateSocketEngineOptions()).DisposeAsync();
}

[ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
[PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
public static async Task IoUringCompletionMode_QueuedZeroByteReceive_DoesNotStall()
{
    await RemoteExecutor.Invoke(static async () =>
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        // Queue a 1-byte receive first, then a zero-byte receive behind it.
        byte[] firstReceiveBuffer = new byte[1];
        Task<int> firstReceive = ToTask(server.ReceiveAsync(firstReceiveBuffer, SocketFlags.None));
        await Task.Yield();

        Task<int> zeroByteReceive = ToTask(server.ReceiveAsync(Memory<byte>.Empty, SocketFlags.None));
        await Task.Yield();

        byte[] firstPayload = new byte[] { 0x11 };
        Assert.Equal(1, await client.SendAsync(firstPayload, SocketFlags.None));
        Assert.Equal(1, await firstReceive);
        Assert.Equal(firstPayload[0], firstReceiveBuffer[0]);

        // The queued zero-byte receive must complete (with 0), not stall the queue.
        Task completed = await Task.WhenAny(zeroByteReceive, Task.Delay(TimeSpan.FromSeconds(15)));
        Assert.Same(zeroByteReceive, completed);
        Assert.Equal(0, await zeroByteReceive);

        // A subsequent normal receive still flows.
        byte[] secondReceiveBuffer = new byte[1];
        Task<int> secondReceive = ToTask(server.ReceiveAsync(secondReceiveBuffer, SocketFlags.None));
        await Task.Yield();

        byte[] secondPayload = new byte[] { 0x22 };
        Assert.Equal(1, await client.SendAsync(secondPayload, SocketFlags.None));
        Assert.Equal(1, await secondReceive);
        Assert.Equal(secondPayload[0], secondReceiveBuffer[0]);
    }, CreateSocketEngineOptions()).DisposeAsync();
}

[ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
[PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
public static async Task IoUringCompletionMode_PureCompletionMode_MixesTcpAndUdp()
{
    // Runs the mixed TCP/UDP scenario and asserts the engine never fell back to
    // poll-readiness CQEs (the counter must be unchanged afterwards).
    static async Task RunWithoutPollReadinessAsync()
    {
        long pollReadinessBefore = GetIoUringPollReadinessCqeCount();
        await RunPureCompletionScenarioAsync();
        Assert.Equal(pollReadinessBefore, GetIoUringPollReadinessCqeCount());
    }

    await RemoteExecutor.Invoke(
        static () => RunWithoutPollReadinessAsync(),
        CreateSocketEngineOptions()).DisposeAsync();
}

[ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
[PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
public static async Task IoUringCompletionMode_CancelWithoutTraffic_CompletesPromptly()
{
    await RemoteExecutor.Invoke(static async () =>
    {
        var trio = await CreateConnectedTcpSocketTrioAsync();
        using Socket _ = trio.Listener;
        using Socket client = trio.Client;
        using Socket server = trio.Server;

        byte[] receiveBuffer = new byte[16];
        using var cts = new CancellationTokenSource();
        Task<int> pendingReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None, cts.Token));

        // No peer traffic at all: cancellation alone must complete the operation promptly.
        cts.Cancel();
        Task completed = await Task.WhenAny(pendingReceive, Task.Delay(TimeSpan.FromSeconds(15)));
        Assert.Same(pendingReceive, completed);

        Exception? ex = await Record.ExceptionAsync(async () => await pendingReceive);
        AssertCanceledOrInterrupted(ex);
    }, CreateSocketEngineOptions()).DisposeAsync();
}

[ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
[PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
public static async Task IoUringCompletionMode_ReceiveMessageFrom_CancellationAndDispose_DoNotHang()
{
    await RemoteExecutor.Invoke(static () => RunReceiveMessageFromCancellationAndDisposeScenariosAsync(), CreateSocketEngineOptions()).DisposeAsync();
}

[ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
[PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
public static async Task IoUringCompletionMode_ReceiveMessageFrom_Cancellation_DoesNotPoisonNextReceive()
{
    await RemoteExecutor.Invoke(
        static () => RunReceiveMessageFromCancelThenReceiveScenarioAsync(),
        CreateSocketEngineOptions()).DisposeAsync();
}

[OuterLoop]
[ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
[PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
public static async Task IoUringCompletionMode_ReceiveMessageFrom_CancellationAndDispose_GcPressure_DoNotHang()
{
    await RemoteExecutor.Invoke(
        static () => RunReceiveMessageFromCancellationAndDisposeScenariosWithGcPressureAsync(iterations: 32),
        CreateSocketEngineOptions()).DisposeAsync();
}

[ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
[PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_TeardownDrainTrackedOperations_CancelsPendingReceives() + { + await RemoteExecutor.Invoke( + static () => RunTeardownDrainTrackedOperationsScenarioAsync(iterations: 64), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_TeardownCancellationDuplicateGuard_DoesNotInflateAsyncCancelRequestCqes() + { + await RemoteExecutor.Invoke( + static () => RunTeardownCancellationDuplicateGuardScenarioAsync(iterations: 96), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_MixedReadinessAndCompletion_NoStarvation() + { + await RemoteExecutor.Invoke( + static () => RunMixedModeReadinessCompletionStressScenarioAsync(iterations: 128), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_SameSocketReadinessCompletionBacklog_NoStarvation() + { + await RemoteExecutor.Invoke( + static () => RunSameSocketReadinessCompletionBacklogScenarioAsync(iterations: 64, completionBatchSize: 8), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringCompletionMode_BoundedWaitBufferPressure_NoLossAndCountersIncrease() + { + await RemoteExecutor.Invoke( + static () => RunBoundedWaitBufferPressureScenarioAsync(connectionCount: 32), + CreateSocketEngineOptions(testEventBufferCount: 1)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_InvalidTestEventBufferCount_FallsBackToDefault() + { + await RemoteExecutor.Invoke( + static () => RunTcpRoundTripAsync(32), + CreateSocketEngineOptions(testEventBufferCountRaw: "not-a-number")).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_PrepareQueueOverflow_FallsBackAndCompletes() + { + await RemoteExecutor.Invoke( + static () => RunPrepareQueueOverflowFallbackScenarioAsync(connectionCount: 32), + CreateSocketEngineOptions(prepareQueueCapacity: 1, directSqeEnabled: false)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ConnectQueueOverflow_FallsBackAndCompletes() + { + await RemoteExecutor.Invoke( + static () => RunConnectQueueOverflowFallbackScenarioAsync(connectionCount: 32), + CreateSocketEngineOptions(prepareQueueCapacity: 1, directSqeEnabled: false)).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringCompletionMode_PrepareQueueOverflow_Stress_NoHangs() + { + await RemoteExecutor.Invoke( + static () => RunPrepareQueueOverflowFallbackScenarioAsync(connectionCount: 96), + CreateSocketEngineOptions(prepareQueueCapacity: 2, directSqeEnabled: false)).DisposeAsync(); + } + + [ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + [InlineData(false)] + [InlineData(true)] + public static async Task IoUringCompletionMode_NonPinnableMemory_FallsBackAndCompletes(bool receivePath) + { + await RemoteExecutor.Invoke( + static arg => RunNonPinnableMemoryFallbackScenarioAsync(receivePath: bool.Parse(arg)), + receivePath.ToString(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_NonPinnableFallbackTelemetryCounter_Increments() + { + await RemoteExecutor.Invoke( + static () => RunNonPinnableFallbackTelemetryScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_PinnableMemory_PinReleaseLifecycle_Works() + { + await RemoteExecutor.Invoke( + static () => RunPinnableMemoryPinReleaseLifecycleScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringCompletionMode_ProvidedBuffer_RegistrationLifecycle_IsStable() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferRegistrationLifecycleScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_BufferSelectReceive_RecyclesBuffer() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferSelectReceiveScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_RecyclesBeyondRingCapacity() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferRecycleReuseScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_ForcedExhaustion_ReportsNoBufferSpace() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferExhaustionScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringCompletionMode_ProvidedBuffer_MixedWithRecvFrom_Works() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferMixedWorkloadScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_AdaptiveSizing_SmallMessages_Shrinks() + { + await RemoteExecutor.Invoke( + static () => RunAdaptiveProvidedBufferSmallMessageShrinkScenarioAsync(), + CreateSocketEngineOptions( + providedBufferSize: 4096, + adaptiveBufferSizingEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_AdaptiveSizing_LargeMessages_Grows() + { + await RemoteExecutor.Invoke( + static () => RunAdaptiveProvidedBufferLargeMessageGrowScenarioAsync(), + CreateSocketEngineOptions( + providedBufferSize: 4096, + adaptiveBufferSizingEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_AdaptiveSizing_MixedWorkload_Stable() + { + await RemoteExecutor.Invoke( + static () => RunAdaptiveProvidedBufferMixedWorkloadStableScenarioAsync(), + CreateSocketEngineOptions( + providedBufferSize: 4096, + adaptiveBufferSizingEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringCompletionMode_ProvidedBuffer_AdaptiveSizing_ResizeSwap_NoDataLoss() + { + await RemoteExecutor.Invoke( + static () => RunAdaptiveProvidedBufferResizeSwapNoDataLossScenarioAsync(), + CreateSocketEngineOptions( + providedBufferSize: 4096, + adaptiveBufferSizingEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_AdaptiveSizing_Disabled_StaysFixed() + { + await RemoteExecutor.Invoke( + static () => RunAdaptiveProvidedBufferDisabledScenarioAsync(), + CreateSocketEngineOptions( + providedBufferSize: 4096, + adaptiveBufferSizingEnabled: false)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_AdaptiveSizing_Default_IsDisabled() + { + await RemoteExecutor.Invoke( + static () => RunAdaptiveProvidedBufferSizingStateScenarioAsync(expectedEnabled: false), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + [InlineData(false)] + [InlineData(true)] + public static async Task IoUringCompletionMode_ProvidedBuffer_AdaptiveSizing_Switch_HonorsBothValues(bool enabled) + { + await RemoteExecutor.Invoke( + static arg => RunAdaptiveProvidedBufferSizingStateScenarioAsync(bool.Parse(arg)), + enabled.ToString(), + CreateSocketEngineOptions(adaptiveBufferSizingEnabled: enabled)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringCompletionMode_ProvidedBuffer_RegisterBuffers_DisabledByEnvVar() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferKernelRegistrationDisabledScenarioAsync(), + CreateSocketEngineOptions(registerBuffersEnabled: false)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_RegisterBuffers_SuccessState_VisibleWhenAvailable() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferKernelRegistrationSuccessScenarioAsync(), + CreateSocketEngineOptions(registerBuffersEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_RegisterBuffers_FailureWhenObserved_IsNonFatal() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferKernelRegistrationFailureNonFatalScenarioAsync(), + CreateSocketEngineOptions( + registerBuffersEnabled: true, + providedBufferSize: 65536)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_RegisterBuffers_AdaptiveResize_TriggersReregistration() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferKernelReregistrationOnResizeScenarioAsync(), + CreateSocketEngineOptions( + registerBuffersEnabled: true, + adaptiveBufferSizingEnabled: true, + providedBufferSize: 4096)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringCompletionMode_ProvidedBuffer_RegisterBuffers_DataCorrectness_WithRegisteredBuffers() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferRegisteredBuffersDataCorrectnessScenarioAsync(), + CreateSocketEngineOptions(registerBuffersEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_RegisterBuffers_MemoryPressure_GracefulFallbackOrSuccess() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferRegistrationMemoryPressureScenarioAsync(), + CreateSocketEngineOptions( + registerBuffersEnabled: true, + providedBufferSize: 65536)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring teardown ordering contract is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_RegisterBuffers_TeardownOrdering_UnregisterBeforeRingClose() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferTeardownOrderingContractScenarioAsync(), + CreateSocketEngineOptions(registerBuffersEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_FixedRecv_Default_IsDisabled() + { + await RemoteExecutor.Invoke( + static () => RunFixedRecvStateScenarioAsync(expectedEnabled: false), + CreateSocketEngineOptions(registerBuffersEnabled: false)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+    public static async Task IoUringCompletionMode_FixedRecv_Activation_FollowsRuntimeCapabilities()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunFixedRecvActivationFollowsRuntimeCapabilitiesScenarioAsync(),
+            CreateSocketEngineOptions(registerBuffersEnabled: true)).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringCompletionMode_FixedRecv_Enabled_DataCorrectness_WithRegisteredBuffers()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunFixedRecvDataCorrectnessScenarioAsync(),
+            CreateSocketEngineOptions(registerBuffersEnabled: true)).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // SQPOLL is Linux io_uring-specific.
+    public static async Task IoUringCompletionMode_SqPoll_BasicSendReceive()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunSqPollBasicSendReceiveScenarioAsync(),
+            CreateSocketEngineOptions(sqPollEnabled: true)).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // SQPOLL request behavior is Linux io_uring-specific.
+    public static async Task IoUringCompletionMode_SqPoll_Requested_DoesNotBreakSocketOperations()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunSqPollRequestedScenarioAsync(),
+            CreateSocketEngineOptions(sqPollEnabled: true)).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // SQPOLL wakeup path is Linux io_uring-specific.
+    public static async Task IoUringCompletionMode_SqPoll_IdleWakeupPath_IncrementsWakeupCounterWhenObserved()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunSqPollWakeupAfterIdleScenarioAsync(),
+            CreateSocketEngineOptions(sqPollEnabled: true)).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // SQPOLL + multishot recv is Linux io_uring-specific.
+    public static async Task IoUringCompletionMode_SqPoll_MultishotRecv_Works()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunSqPollMultishotRecvScenarioAsync(),
+            CreateSocketEngineOptions(sqPollEnabled: true)).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // SQPOLL + zero-copy send is Linux io_uring-specific.
+    public static async Task IoUringCompletionMode_SqPoll_ZeroCopySend_Works()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunSqPollZeroCopySendScenarioAsync(),
+            CreateSocketEngineOptions(sqPollEnabled: true)).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // SQPOLL telemetry counters are Linux io_uring-specific.
+    public static async Task IoUringCompletionMode_SqPoll_TelemetryCounters_Emitted()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunSqPollTelemetryCountersScenarioAsync(),
+            CreateSocketEngineOptions(sqPollEnabled: true)).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // SQPOLL SQ flags contract is Linux io_uring-specific.
+    public static async Task IoUringCompletionMode_SqPoll_SqNeedWakeup_ContractMatchesSqFlagBit()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunSqPollNeedWakeupContractScenarioAsync(),
+            CreateSocketEngineOptions(sqPollEnabled: true)).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringCompletionMode_ZeroCopySend_Default_IsEnabledWhenSupported()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunZeroCopySendStateScenarioAsync(expectedEnabledWhenSupported: true),
+            CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    [InlineData(false)]
+    [InlineData(true)]
+    public static async Task IoUringCompletionMode_ZeroCopySend_Switch_HonorsBothValues(bool enabled)
+    {
+        await RemoteExecutor.Invoke(
+            static arg => RunZeroCopySendStateScenarioAsync(bool.Parse(arg)),
+            enabled.ToString(),
+            CreateSocketEngineOptions(zeroCopySendEnabled: enabled)).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringCompletionMode_ZeroCopySend_LargeBuffer_CompletesCorrectly()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunZeroCopySendLargeBufferRoundTripScenarioAsync(),
+            CreateSocketEngineOptions(zeroCopySendEnabled: true)).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringCompletionMode_ZeroCopySend_SmallBuffer_UsesRegularSendFallbackPath()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunZeroCopySendSmallBufferUsesRegularSendScenarioAsync(),
+            CreateSocketEngineOptions(zeroCopySendEnabled: true, forceEcanceledOnceMask: "send")).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringCompletionMode_ZeroCopySend_NotifCqe_ReleasesPinHolds()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunZeroCopySendNotifCqeReleasesPinHoldsScenarioAsync(),
+            CreateSocketEngineOptions(zeroCopySendEnabled: true)).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringCompletionMode_ZeroCopySend_PartialSendResubmission_CompletesFully()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunZeroCopySendPartialSendResubmissionScenarioAsync(),
+            CreateSocketEngineOptions(zeroCopySendEnabled: true)).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringCompletionMode_ZeroCopySend_TaskCompletion_ReleasesPins()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunZeroCopySendCompletionPinLifetimeScenarioAsync(),
+            CreateSocketEngineOptions(zeroCopySendEnabled: true)).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringCompletionMode_ZeroCopySend_UnsupportedOpcode_FallsBackGracefully()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunZeroCopySendUnsupportedOpcodeFallbackScenarioAsync(),
+            CreateSocketEngineOptions(zeroCopySendEnabled: true)).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringCompletionMode_ZeroCopySend_BufferList4KSegments_AboveThreshold_UsesSendMsgZc()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunZeroCopySendBufferListSegmentThresholdScenarioAsync(),
+            CreateSocketEngineOptions(zeroCopySendEnabled: true, forceEcanceledOnceMask: "sendmsg")).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringCompletionMode_ZeroCopySend_SendToAboveThreshold_UsesSendMsgZc()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunZeroCopySendToAboveThresholdScenarioAsync(),
+            CreateSocketEngineOptions(zeroCopySendEnabled: true, forceEcanceledOnceMask: "sendmsg")).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringMultishotRecv_Basic_CompletesAcrossIterations()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunMultishotRecvBasicScenarioAsync(iterations: 64),
+            CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringMultishotRecv_Cancellation_Completes()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunMultishotRecvCancellationScenarioAsync(),
+            CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringMultishotRecv_PeerClose_Terminates()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunMultishotRecvPeerCloseScenarioAsync(),
+            CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringMultishotRecv_ProvidedBufferExhaustion_FollowsPolicy()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunProvidedBufferExhaustionScenarioAsync(),
+            CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringPersistentMultishotRecv_ProvidedBufferExhaustion_TerminatesAndRecovers()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunPersistentMultishotRecvProvidedBufferExhaustionScenarioAsync(),
+            CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringMultishotRecv_MixedWithOneShot_Coexists()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunProvidedBufferMixedWorkloadScenarioAsync(),
+            CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringPersistentMultishotRecv_ShapeChange_CancelsAndRearms()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunPersistentMultishotRecvShapeChangeScenarioAsync(),
+            CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringPersistentMultishotRecv_ConcurrentCloseRace_DoesNotHang()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunPersistentMultishotRecvConcurrentCloseRaceScenarioAsync(iterations: 32),
+            CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringMultishotAccept_Basic_CompletesAcrossIterations()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunMultishotAcceptBasicScenarioAsync(connectionCount: 10),
+            CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringMultishotAccept_PrequeuesConnections_BeforeSubsequentAcceptAsync()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunMultishotAcceptPrequeueScenarioAsync(prequeuedConnectionCount: 5),
+            CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringMultishotAccept_ListenerClose_CompletesPendingAcceptAndDrainsQueue()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunMultishotAcceptListenerCloseScenarioAsync(),
+            CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringMultishotAccept_DisposeDuringArmingRace_DoesNotHang()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunMultishotAcceptDisposeDuringArmingRaceScenarioAsync(iterations: 64),
+            CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringMultishotAccept_UsesOneShotWhenMultishotUnavailable()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunMultishotAcceptUnavailableOneShotScenarioAsync(),
+            CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringMultishotAccept_TerminalCompletion_RearmsOnNextAccept()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunMultishotAcceptRearmAfterTerminalCqeScenarioAsync(),
+            CreateSocketEngineOptions(forceEcanceledOnceMask: "accept")).DisposeAsync();
+    }
+
+    [OuterLoop]
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringMultishotAccept_HighConnectionRate_NoLoss()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunMultishotAcceptHighConnectionRateScenarioAsync(connectionCount: 256),
+            CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringCompletionMode_NetworkStream_ReadAsync_CancellationToken_Works()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunNetworkStreamReadAsyncCancellationTokenScenarioAsync(),
+            CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringCompletionMode_ReceiveAsync_SocketAsyncEventArgs_BufferList_Unaffected()
+    {
+        await RemoteExecutor.Invoke(
+            static () => RunReceiveAsyncSocketAsyncEventArgsBufferListScenarioAsync(),
+            CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    // Verifies vectored (buffer-list) send/receive through the io_uring completion path:
+    // a 7-byte payload split across three send segments must arrive intact across two
+    // receive segments. Generic type arguments restored (they were stripped to `List>`
+    // etc. in transit, which is not valid C#).
+    public static async Task IoUringCompletionMode_BufferListSendReceive_Works()
+    {
+        await RemoteExecutor.Invoke(static async () =>
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            byte[] payload = new byte[] { 0x01, 0x11, 0x21, 0x31, 0x41, 0x51, 0x61 };
+            var sendBuffers = new List<ArraySegment<byte>>
+            {
+                new ArraySegment<byte>(payload, 0, 2),
+                new ArraySegment<byte>(payload, 2, 1),
+                new ArraySegment<byte>(payload, 3, 4)
+            };
+
+            byte[] receiveBuffer1 = new byte[3];
+            byte[] receiveBuffer2 = new byte[4];
+            var receiveBuffers = new List<ArraySegment<byte>>
+            {
+                new ArraySegment<byte>(receiveBuffer1),
+                new ArraySegment<byte>(receiveBuffer2)
+            };
+
+            // Start the receive first so the send completes against a pending operation.
+            Task<int> receiveTask = server.ReceiveAsync(receiveBuffers, SocketFlags.None);
+            await Task.Yield();
+
+            int sent = await client.SendAsync(sendBuffers, SocketFlags.None);
+            Assert.Equal(payload.Length, sent);
+
+            int received = await receiveTask;
+            Assert.Equal(payload.Length, received);
+
+            // Reassemble the scattered receive buffers and compare against the payload.
+            byte[] combined = new byte[payload.Length];
+            Buffer.BlockCopy(receiveBuffer1, 0, combined, 0, receiveBuffer1.Length);
+            Buffer.BlockCopy(receiveBuffer2, 0, combined, receiveBuffer1.Length, receiveBuffer2.Length);
+            Assert.Equal(payload, combined);
+        }, CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    // Verifies that a buffer-list receive with SocketFlags.Peek observes the data without
+    // consuming it: the same bytes must be returned by the subsequent real receive.
+    // Generic type arguments restored (stripped in transit).
+    public static async Task IoUringCompletionMode_BufferListReceive_WithPeek_PreservesData()
+    {
+        await RemoteExecutor.Invoke(static async () =>
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            byte[] payload = new byte[] { 0x0A, 0x1A, 0x2A, 0x3A };
+            Assert.Equal(payload.Length, await client.SendAsync(payload, SocketFlags.None));
+
+            // Peek into two segments; the data must remain queued on the socket afterwards.
+            byte[] peekBuffer1 = new byte[2];
+            byte[] peekBuffer2 = new byte[2];
+            int peeked = await server.ReceiveAsync(
+                new List<ArraySegment<byte>>
+                {
+                    new ArraySegment<byte>(peekBuffer1),
+                    new ArraySegment<byte>(peekBuffer2)
+                },
+                SocketFlags.Peek);
+            Assert.Equal(payload.Length, peeked);
+
+            byte[] peekCombined = new byte[payload.Length];
+            Buffer.BlockCopy(peekBuffer1, 0, peekCombined, 0, peekBuffer1.Length);
+            Buffer.BlockCopy(peekBuffer2, 0, peekCombined, peekBuffer1.Length, peekBuffer2.Length);
+            Assert.Equal(payload, peekCombined);
+
+            // The real receive uses a different segment split (1 + 3 bytes) on purpose.
+            byte[] receiveBuffer1 = new byte[1];
+            byte[] receiveBuffer2 = new byte[3];
+            int received = await server.ReceiveAsync(
+                new List<ArraySegment<byte>>
+                {
+                    new ArraySegment<byte>(receiveBuffer1),
+                    new ArraySegment<byte>(receiveBuffer2)
+                },
+                SocketFlags.None);
+            Assert.Equal(payload.Length, received);
+
+            byte[] receiveCombined = new byte[payload.Length];
+            Buffer.BlockCopy(receiveBuffer1, 0, receiveCombined, 0, receiveBuffer1.Length);
+            Buffer.BlockCopy(receiveBuffer2, 0, receiveCombined, receiveBuffer1.Length, receiveBuffer2.Length);
+            Assert.Equal(payload, receiveCombined);
+        }, CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    // Verifies that a UDP ReceiveFromAsync driven by SocketAsyncEventArgs with a BufferList
+    // fills the scattered buffers and writes back the sender's endpoint.
+    // Generic type arguments restored (stripped to `List>` / `Assert.IsType(` in transit).
+    public static async Task IoUringCompletionMode_BufferListReceiveFrom_WritesRemoteEndPoint()
+    {
+        await RemoteExecutor.Invoke(static async () =>
+        {
+            using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+            receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+            using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+            sender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+            byte[] receiveBuffer1 = new byte[3];
+            byte[] receiveBuffer2 = new byte[4];
+            using var receiveEventArgs = new SocketAsyncEventArgs
+            {
+                BufferList = new List<ArraySegment<byte>>
+                {
+                    new ArraySegment<byte>(receiveBuffer1),
+                    new ArraySegment<byte>(receiveBuffer2)
+                },
+                RemoteEndPoint = new IPEndPoint(IPAddress.Any, 0)
+            };
+
+            Task<SocketAsyncEventArgs> receiveTask = StartSocketAsyncEventArgsOperation(
+                receiver,
+                receiveEventArgs,
+                static (s, args) => s.ReceiveFromAsync(args));
+            await Task.Yield();
+
+            byte[] payload = new byte[] { 0xA0, 0xB0, 0xC0, 0xD0, 0xE0, 0xF0, 0x01 };
+            int sent = await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!);
+            Assert.Equal(payload.Length, sent);
+
+            SocketAsyncEventArgs completedReceive = await receiveTask;
+            Assert.Equal(SocketError.Success, completedReceive.SocketError);
+            Assert.Equal(payload.Length, completedReceive.BytesTransferred);
+            Assert.Equal(SocketFlags.None, completedReceive.SocketFlags);
+
+            // The datagram source must be reported back through RemoteEndPoint.
+            IPEndPoint expectedRemoteEndPoint = (IPEndPoint)sender.LocalEndPoint!;
+            IPEndPoint actualRemoteEndPoint = Assert.IsType<IPEndPoint>(completedReceive.RemoteEndPoint);
+            Assert.Equal(expectedRemoteEndPoint, actualRemoteEndPoint);
+
+            byte[] combined = new byte[payload.Length];
+            Buffer.BlockCopy(receiveBuffer1, 0, combined, 0, receiveBuffer1.Length);
+            Buffer.BlockCopy(receiveBuffer2, 0, combined, receiveBuffer1.Length, receiveBuffer2.Length);
+            Assert.Equal(payload, combined);
+        }, CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    // Verifies that a UDP SendToAsync with a BufferList delivers the concatenated payload
+    // to the target endpoint and that the receiver observes the sender's endpoint.
+    public static async Task IoUringCompletionMode_BufferListSendTo_WritesPayloadAndEndpoint()
+    {
+        await RemoteExecutor.Invoke(static async () =>
+        {
+            using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+            receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+            using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+            sender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+            byte[] payload = new byte[] { 0x33, 0x44, 0x55, 0x66, 0x77, 0x88 };
+            byte[] receiveBuffer = new byte[payload.Length];
+
+            Task<SocketReceiveFromResult> receiveTask =
+                ToTask(receiver.ReceiveFromAsync(receiveBuffer, SocketFlags.None, new IPEndPoint(IPAddress.Any, 0)));
+
+            using var sendEventArgs = new SocketAsyncEventArgs
+            {
+                BufferList = new List<ArraySegment<byte>>
+                {
+                    new ArraySegment<byte>(payload, 0, 2),
+                    new ArraySegment<byte>(payload, 2, 1),
+                    new ArraySegment<byte>(payload, 3, 3)
+                },
+                RemoteEndPoint = receiver.LocalEndPoint
+            };
+
+            SocketAsyncEventArgs completedSend = await StartSocketAsyncEventArgsOperation(
+                sender,
+                sendEventArgs,
+                static (s, args) => s.SendToAsync(args));
+            Assert.Equal(SocketError.Success, completedSend.SocketError);
+            Assert.Equal(payload.Length, completedSend.BytesTransferred);
+
+            SocketReceiveFromResult receiveResult = await receiveTask;
+            Assert.Equal(payload.Length, receiveResult.ReceivedBytes);
+            Assert.Equal(payload, receiveBuffer);
+
+            IPEndPoint expectedRemoteEndPoint = (IPEndPoint)sender.LocalEndPoint!;
+            IPEndPoint actualRemoteEndPoint = Assert.IsType<IPEndPoint>(receiveResult.RemoteEndPoint);
+            Assert.Equal(expectedRemoteEndPoint, actualRemoteEndPoint);
+        }, CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    // Verifies the SocketAsyncEventArgs accept/connect pair on the io_uring path, including
+    // that the accepted socket's RemoteEndPoint matches the connecting socket exactly.
+    // Generic type arguments restored (stripped in transit).
+    public static async Task IoUringCompletionMode_AcceptConnect_SocketAsyncEventArgs_Works()
+    {
+        await RemoteExecutor.Invoke(static async () =>
+        {
+            using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+            listener.Listen(1);
+
+            using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            using var acceptEventArgs = new SocketAsyncEventArgs();
+            Task<SocketAsyncEventArgs> acceptTask = StartSocketAsyncEventArgsOperation(
+                listener,
+                acceptEventArgs,
+                static (s, args) => s.AcceptAsync(args));
+            await Task.Yield();
+
+            using var connectEventArgs = new SocketAsyncEventArgs
+            {
+                RemoteEndPoint = listener.LocalEndPoint
+            };
+
+            SocketAsyncEventArgs completedConnect = await StartSocketAsyncEventArgsOperation(
+                client,
+                connectEventArgs,
+                static (s, args) => s.ConnectAsync(args));
+            Assert.Equal(SocketError.Success, completedConnect.SocketError);
+
+            SocketAsyncEventArgs completedAccept = await acceptTask;
+            Assert.Equal(SocketError.Success, completedAccept.SocketError);
+
+            // Take ownership of the accepted socket away from the event args before disposal.
+            Socket accepted = Assert.IsType<Socket>(completedAccept.AcceptSocket);
+            completedAccept.AcceptSocket = null;
+            using Socket server = accepted;
+
+            // Validates accept address-length handling: the endpoint must match the connecting socket exactly.
+            IPEndPoint expectedRemoteEndPoint = (IPEndPoint)client.LocalEndPoint!;
+            IPEndPoint actualRemoteEndPoint = Assert.IsType<IPEndPoint>(server.RemoteEndPoint);
+            Assert.Equal(expectedRemoteEndPoint, actualRemoteEndPoint);
+
+            byte[] payload = new byte[] { 0x5A };
+            byte[] receiveBuffer = new byte[1];
+            Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
+            Assert.Equal(1, await server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+            Assert.Equal(payload[0], receiveBuffer[0]);
+        }, CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    // Verifies that AcceptAsync honors a CancellationToken overload on the io_uring path
+    // and still yields a usable connection when the token does not fire.
+    public static async Task IoUringCompletionMode_AcceptAsync_CancellationToken_Works()
+    {
+        await RemoteExecutor.Invoke(static async () =>
+        {
+            using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+            listener.Listen(1);
+
+            using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(15));
+            Task<Socket> acceptTask = ToTask(listener.AcceptAsync(cts.Token));
+
+            using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            await client.ConnectAsync((IPEndPoint)listener.LocalEndPoint!);
+            using Socket server = await acceptTask;
+
+            byte[] payload = new byte[] { 0x4D };
+            byte[] receiveBuffer = new byte[1];
+            Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
+            Assert.Equal(1, await server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+            Assert.Equal(payload[0], receiveBuffer[0]);
+        }, CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    // Verifies that a pre-created AcceptSocket on SocketAsyncEventArgs is honored when a
+    // connection is dequeued from the multishot-accept queue rather than accepted directly.
+    // Generic type arguments restored (stripped in transit).
+    public static async Task IoUringCompletionMode_AcceptAsync_SocketAsyncEventArgs_PrecreatedAcceptSocket_Works()
+    {
+        await RemoteExecutor.Invoke(static async () =>
+        {
+            using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+            listener.Listen(4);
+            IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+            if (!IsIoUringMultishotAcceptSupported())
+            {
+                return;
+            }
+
+            // Arm multishot accept and leave one connection queued for pre-accept dequeue.
+            Task<Socket> armingAcceptTask = listener.AcceptAsync();
+            Assert.True(
+                await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: true),
+                "Expected multishot accept to arm before precreated AcceptSocket test.");
+
+            using Socket firstClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            using Socket secondClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            await Task.WhenAll(firstClient.ConnectAsync(endpoint), secondClient.ConnectAsync(endpoint));
+            using Socket firstServer = await armingAcceptTask;
+
+            // Poll until the second connection lands in the multishot accept queue.
+            DateTime deadline = DateTime.UtcNow + TimeSpan.FromSeconds(5);
+            while (DateTime.UtcNow < deadline && GetListenerMultishotAcceptQueueCount(listener) == 0)
+            {
+                await Task.Delay(25);
+            }
+
+            Assert.True(GetListenerMultishotAcceptQueueCount(listener) > 0, "Expected a queued pre-accepted connection.");
+
+            using Socket precreatedAcceptSocket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            using var acceptEventArgs = new SocketAsyncEventArgs
+            {
+                AcceptSocket = precreatedAcceptSocket
+            };
+
+            SocketAsyncEventArgs completedAccept = await StartSocketAsyncEventArgsOperation(
+                listener,
+                acceptEventArgs,
+                static (s, args) => s.AcceptAsync(args));
+            Assert.Equal(SocketError.Success, completedAccept.SocketError);
+            Assert.Same(precreatedAcceptSocket, completedAccept.AcceptSocket);
+
+            byte[] payload = new byte[] { 0x3F };
+            byte[] receiveBuffer = new byte[1];
+            Assert.Equal(1, await secondClient.SendAsync(payload, SocketFlags.None));
+            Assert.Equal(1, await precreatedAcceptSocket.ReceiveAsync(receiveBuffer, SocketFlags.None));
+            Assert.Equal(payload[0], receiveBuffer[0]);
+
+            // Keep ownership of the accepted socket out of event-args disposal.
+            completedAccept.AcceptSocket = null;
+        }, CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    // Verifies the higher-level TcpListener accept path end to end on the io_uring engine.
+    public static async Task IoUringCompletionMode_TcpListener_AcceptTcpClientAsync_Works()
+    {
+        await RemoteExecutor.Invoke(static async () =>
+        {
+            using var listener = new TcpListener(IPAddress.Loopback, 0);
+            listener.Start();
+            IPEndPoint endpoint = (IPEndPoint)listener.LocalEndpoint;
+
+            Task<TcpClient> acceptTask = listener.AcceptTcpClientAsync();
+            using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            await client.ConnectAsync(endpoint);
+
+            using TcpClient acceptedClient = await acceptTask;
+            using Socket server = acceptedClient.Client;
+
+            byte[] payload = new byte[] { 0x2A };
+            byte[] receiveBuffer = new byte[1];
+            Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
+            Assert.Equal(1, await server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+            Assert.Equal(payload[0], receiveBuffer[0]);
+        }, CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    // Regression test for sockaddr offset/length handling in ConnectAsync: connects over
+    // IPv4 and (when supported) IPv6 loopback and checks the peer endpoints line up.
+    // Generic type argument on acceptTask restored (stripped in transit).
+    public static async Task IoUringCompletionMode_ConnectAsync_OffLenRegression_Ipv4AndIpv6_Works()
+    {
+        await RemoteExecutor.Invoke(static async () =>
+        {
+            await VerifyConnectAsync(AddressFamily.InterNetwork, IPAddress.Loopback);
+
+            if (Socket.OSSupportsIPv6)
+            {
+                await VerifyConnectAsync(AddressFamily.InterNetworkV6, IPAddress.IPv6Loopback);
+            }
+
+            // Connects a client to a fresh listener for the given family and validates the
+            // endpoint round-trip plus a 1-byte payload exchange.
+            static async Task VerifyConnectAsync(AddressFamily addressFamily, IPAddress loopback)
+            {
+                using Socket listener = new Socket(addressFamily, SocketType.Stream, ProtocolType.Tcp);
+                listener.Bind(new IPEndPoint(loopback, 0));
+                listener.Listen(1);
+
+                using Socket client = new Socket(addressFamily, SocketType.Stream, ProtocolType.Tcp);
+                Task<Socket> acceptTask = listener.AcceptAsync();
+
+                using var connectEventArgs = new SocketAsyncEventArgs
+                {
+                    RemoteEndPoint = listener.LocalEndPoint
+                };
+
+                SocketAsyncEventArgs completedConnect = await StartSocketAsyncEventArgsOperation(
+                    client,
+                    connectEventArgs,
+                    static (s, args) => s.ConnectAsync(args));
+                Assert.Equal(SocketError.Success, completedConnect.SocketError);
+
+                using Socket server = await acceptTask;
+                Assert.Equal(client.LocalEndPoint, server.RemoteEndPoint);
+
+                byte[] payload = new byte[] { 0x3C };
+                byte[] receiveBuffer = new byte[1];
+                Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
+                Assert.Equal(1, await server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+                Assert.Equal(payload[0], receiveBuffer[0]);
+            }
+        }, CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    // Verifies that when ConnectAsync carries initial data and the forced send failure
+    // fires, the error is propagated to the caller and the engine remains usable for a
+    // subsequent connection. Generic type arguments restored (stripped in transit).
+    public static async Task IoUringCompletionMode_ConnectAsync_WithInitialData_ForcedSendFailure_PropagatesError()
+    {
+        await RemoteExecutor.Invoke(static async () =>
+        {
+            using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+            listener.Listen(2);
+
+            using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            client.SendBufferSize = 1024;
+            using var connectEventArgs = new SocketAsyncEventArgs
+            {
+                RemoteEndPoint = listener.LocalEndPoint
+            };
+
+            // Large payload with a tiny send buffer so the send cannot complete in one shot.
+            byte[] initialPayload = new byte[8 * 1024 * 1024];
+            for (int i = 0; i < initialPayload.Length; i++)
+            {
+                initialPayload[i] = unchecked((byte)i);
+            }
+            connectEventArgs.SetBuffer(initialPayload, 0, initialPayload.Length);
+
+            Task<Socket> firstAcceptTask = listener.AcceptAsync();
+            Task<SocketAsyncEventArgs> connectTask = StartSocketAsyncEventArgsOperation(
+                client,
+                connectEventArgs,
+                static (s, args) => s.ConnectAsync(args));
+            using (Socket firstServer = await firstAcceptTask)
+            {
+                // Abortive close (linger 0) so the in-flight send fails rather than stalls.
+                firstServer.LingerState = new LingerOption(enable: true, seconds: 0);
+            }
+
+            Task completed = await Task.WhenAny(connectTask, Task.Delay(TimeSpan.FromSeconds(30)));
+            Assert.Same(connectTask, completed);
+            SocketAsyncEventArgs completedConnect = await connectTask;
+            Assert.NotEqual(SocketError.Success, completedConnect.SocketError);
+
+            // The engine must remain healthy: a fresh connection still round-trips data.
+            using Socket secondClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            Task<Socket> secondAcceptTask = listener.AcceptAsync();
+            await secondClient.ConnectAsync((IPEndPoint)listener.LocalEndPoint!);
+            using Socket secondServer = await secondAcceptTask;
+
+            byte[] payload = new byte[] { 0x9A };
+            byte[] receiveBuffer = new byte[1];
+            Assert.Equal(1, await secondClient.SendAsync(payload, SocketFlags.None));
+            Assert.Equal(1, await secondServer.ReceiveAsync(receiveBuffer, SocketFlags.None));
+            Assert.Equal(payload[0], receiveBuffer[0]);
+        }, CreateSocketEngineOptions(forceEcanceledOnceMask: "send")).DisposeAsync();
+    }
+
+    [ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    [InlineData(false)]
+    [InlineData(true)]
+    public static async Task IoUringCompletionMode_ReceiveMessageFrom_PacketInformation_Works(bool useIpv6)
+    {
+        await RemoteExecutor.Invoke(
+            static arg => RunReceiveMessageFromPacketInformationRoundTripAsync(useIpv6: bool.Parse(arg)),
+            useIpv6.ToString(),
+            CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    [InlineData(false)]
+    [InlineData(true)]
+    // Verifies ReceiveMessageFromAsync with a BufferList: scattered buffers are filled and
+    // packet information (source address) is reported, for IPv4 and IPv6.
+    public static async Task IoUringCompletionMode_ReceiveMessageFrom_BufferList_PacketInformation_Works(bool useIpv6)
+    {
+        await RemoteExecutor.Invoke(static async arg =>
+        {
+            bool useIpv6 = bool.Parse(arg);
+            if (useIpv6 && !Socket.OSSupportsIPv6)
+            {
+                return;
+            }
+
+            AddressFamily family = useIpv6 ? AddressFamily.InterNetworkV6 : AddressFamily.InterNetwork;
+            IPAddress loopback = useIpv6 ? IPAddress.IPv6Loopback : IPAddress.Loopback;
+            IPAddress anyAddress = useIpv6 ? IPAddress.IPv6Any : IPAddress.Any;
+            SocketOptionLevel packetInfoLevel = useIpv6 ? SocketOptionLevel.IPv6 : SocketOptionLevel.IP;
+
+            using Socket receiver = new Socket(family, SocketType.Dgram, ProtocolType.Udp);
+            using Socket sender = new Socket(family, SocketType.Dgram, ProtocolType.Udp);
+
+            receiver.SetSocketOption(packetInfoLevel, SocketOptionName.PacketInformation, true);
+            receiver.Bind(new IPEndPoint(loopback, 0));
+            sender.Bind(new IPEndPoint(loopback, 0));
+
+            byte[] payload = new byte[] { 0x70, 0x71, 0x72, 0x73, 0x74 };
+            byte[] receiveBuffer = new byte[payload.Length];
+
+            using var receiveEventArgs = new SocketAsyncEventArgs
+            {
+                BufferList = new List<ArraySegment<byte>>
+                {
+                    new ArraySegment<byte>(receiveBuffer, 0, 2),
+                    new ArraySegment<byte>(receiveBuffer, 2, 3)
+                },
+                RemoteEndPoint = new IPEndPoint(anyAddress, 0)
+            };
+
+            Task<SocketAsyncEventArgs> receiveTask = StartReceiveMessageFromAsync(receiver, receiveEventArgs);
+            await Task.Yield();
+
+            int sent = await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!);
+            Assert.Equal(payload.Length, sent);
+
+            SocketAsyncEventArgs completedReceive = await receiveTask;
+            Assert.Equal(SocketError.Success, completedReceive.SocketError);
+            Assert.Equal(payload.Length, completedReceive.BytesTransferred);
+            Assert.Equal(payload, receiveBuffer);
+            Assert.Equal(sender.LocalEndPoint, completedReceive.RemoteEndPoint);
+            Assert.Equal(((IPEndPoint)sender.LocalEndPoint!).Address, completedReceive.ReceiveMessageFromPacketInfo.Address);
+        }, useIpv6.ToString(), CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    public static async Task IoUringOptIn_BasicRoundTrip_StillWorks()
+    {
+        await RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(32), CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    [InlineData(false)]
+    [InlineData(true)]
+    public static async Task IoUringCompletionMode_SendAsync_PartialSendResubmission_CompletesFully(bool useBufferListSend)
+    {
+        await RemoteExecutor.Invoke(
+            static (arg) => RunLargeSendWithBackpressureAsync(useBufferListSend: bool.Parse(arg)),
+            useBufferListSend.ToString(), CreateSocketEngineOptions()).DisposeAsync();
+    }
+
+    [ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+    [InlineData(false)]
+    [InlineData(true)]
+    public static async Task IoUringCompletionMode_ForcedReceiveResultOnce_RecoversAndNextOperationStillWorks(bool forceEcanceled)
+    {
+        await RemoteExecutor.Invoke(
+            static arg => RunForcedReceiveScenarioAsync(forceEcanceled: bool.Parse(arg)),
+            forceEcanceled.ToString(),
+            CreateSocketEngineOptions(
+                forceEagainOnceMask: forceEcanceled ? null : "recv",
+                forceEcanceledOnceMask: forceEcanceled ? "recv" : null)).DisposeAsync();
+    }
+
+    [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+    [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_ForcedEagain_Recv_RequeuesViaCompletionPath() + { + await RemoteExecutor.Invoke( + static () => + { + long pollReadinessBefore = GetIoUringPollReadinessCqeCount(); + long requeueFailureBefore = GetIoUringCompletionRequeueFailureCounterValue(); + long queuedRetryBefore = GetIoUringPendingRetryQueuedToPrepareQueueCount(); + + return Task.Run(async () => + { + await RunForcedReceiveScenarioAsync(forceEcanceled: false); + + Assert.Equal(pollReadinessBefore, GetIoUringPollReadinessCqeCount()); + Assert.Equal(requeueFailureBefore, GetIoUringCompletionRequeueFailureCounterValue()); + Assert.Equal(queuedRetryBefore, GetIoUringPendingRetryQueuedToPrepareQueueCount()); + }); + }, + CreateSocketEngineOptions( + forceEagainOnceMask: "recv")).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ZeroByteReceive_OnPeerClose_ReturnsZeroOrCloseError() + { + await RemoteExecutor.Invoke(static () => + { + return Task.Run(async () => + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + Task zeroByteReceive = ToTask(server.ReceiveAsync(Memory.Empty, SocketFlags.None)); + await Task.Yield(); + + client.Shutdown(SocketShutdown.Both); + client.Dispose(); + + Task completed = await Task.WhenAny(zeroByteReceive, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(zeroByteReceive, completed); + + Exception? 
ex = await Record.ExceptionAsync(async () => await zeroByteReceive); + if (ex is null) + { + Assert.Equal(0, await zeroByteReceive); + } + else + { + SocketException socketException = Assert.IsType(ex); + Assert.True( + socketException.SocketErrorCode == SocketError.ConnectionReset || + socketException.SocketErrorCode == SocketError.OperationAborted || + socketException.SocketErrorCode == SocketError.Interrupted, + $"Unexpected socket error while waiting for peer-close zero-byte receive completion: {socketException.SocketErrorCode}"); + } + }); + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ReceiveFrom_TruncatedPayload_ReturnsTruncatedLengthOrMessageSizeError() + { + await RemoteExecutor.Invoke(static () => + { + return Task.Run(async () => + { + using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + + using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + sender.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + + byte[] sendPayload = new byte[] { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06 }; + EndPoint senderEndpoint = sender.LocalEndPoint!; + byte[] receiveBuffer = new byte[2]; + + Task receiveTask = + ToTask(receiver.ReceiveFromAsync(receiveBuffer, SocketFlags.None, new IPEndPoint(IPAddress.Any, 0))); + await Task.Yield(); + + int sent = await sender.SendToAsync(sendPayload, SocketFlags.None, receiver.LocalEndPoint!); + Assert.Equal(sendPayload.Length, sent); + + Task completed = await Task.WhenAny(receiveTask, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(receiveTask, completed); + + Exception? 
ex = await Record.ExceptionAsync(async () => await receiveTask); + if (ex is not null) + { + SocketException socketException = Assert.IsType(ex); + Assert.Equal(SocketError.MessageSize, socketException.SocketErrorCode); + return; + } + + SocketReceiveFromResult receiveResult = await receiveTask; + Assert.True(receiveResult.ReceivedBytes > 0 && receiveResult.ReceivedBytes <= receiveBuffer.Length); + for (int i = 0; i < receiveResult.ReceivedBytes; i++) + { + Assert.Equal(sendPayload[i], receiveBuffer[i]); + } + Assert.Equal(senderEndpoint, receiveResult.RemoteEndPoint); + }); + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_SendTo_UnreachableEndpoint_CompletesOrFailsWithExpectedError() + { + await RemoteExecutor.Invoke(static () => + { + return Task.Run(async () => + { + using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + sender.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + + byte[] payload = new byte[] { 0xAA }; + EndPoint destination = new IPEndPoint(IPAddress.Parse("192.0.2.1"), 9); + + try + { + int sent = await sender.SendToAsync(payload, SocketFlags.None, destination); + Assert.Equal(payload.Length, sent); + // UDP sendto may succeed on some Linux/network configurations even for TEST-NET destinations. 
+ return; + } + catch (Exception ex) + { + SocketException socketException = Assert.IsType(ex); + Assert.True( + socketException.SocketErrorCode == SocketError.NetworkUnreachable || + socketException.SocketErrorCode == SocketError.HostUnreachable || + socketException.SocketErrorCode == SocketError.HostNotFound || + socketException.SocketErrorCode == SocketError.NetworkDown || + socketException.SocketErrorCode == SocketError.AccessDenied || + socketException.SocketErrorCode == SocketError.InvalidArgument, + $"Unexpected socket error for unreachable send: {socketException.SocketErrorCode}"); + } + }); + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_AsyncCancelRequestCqe_IsolatedFromManagedOperationDispatch() + { + await RemoteExecutor.Invoke(static () => RunAsyncCancelRequestIsolationScenarioAsync(64), CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_CancellationSubmitContention_ProgressesUnderLoad() + { + await RemoteExecutor.Invoke( + static () => RunCancellationSubmitContentionScenarioAsync(connectionCount: 8, cancellationsPerConnection: 96), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringCompletionMode_CompletionCancellationRace_CompletesExactlyOnce() + { + await RemoteExecutor.Invoke(static () => RunCompletionCancellationRaceAsync(128), CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_RapidCancelWhileEnqueued_DoesNotCorruptState() + { + await RemoteExecutor.Invoke(static async () => + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + const int WorkerCount = 8; + const int IterationsPerWorker = 128; + var tasks = new Task[WorkerCount]; + + for (int worker = 0; worker < WorkerCount; worker++) + { + tasks[worker] = Task.Run(async () => + { + byte[] receiveBuffer = new byte[1]; + for (int i = 0; i < IterationsPerWorker; i++) + { + using var cts = new CancellationTokenSource(); + var receiveTask = server.ReceiveAsync(receiveBuffer.AsMemory(), SocketFlags.None, cts.Token); + cts.Cancel(); + + Exception? ex = await Record.ExceptionAsync(async () => await receiveTask); + AssertCanceledOrInterrupted(ex); + } + }); + } + + await Task.WhenAll(tasks); + + // Ensure socket state still allows normal async flow after rapid cancellation churn. + byte[] payload = new byte[] { 0xA5 }; + int sent = await client.SendAsync(payload, SocketFlags.None); + Assert.Equal(1, sent); + int received = await server.ReceiveAsync(payload, SocketFlags.None); + Assert.Equal(1, received); + Assert.Equal(0xA5, payload[0]); + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringCompletionMode_CloseDisposeStress_DoesNotHang() + { + await RemoteExecutor.Invoke(static async () => + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(32); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + for (int i = 0; i < 64; i++) + { + var pair = await AcceptConnectedTcpPairAsync(listener, endpoint); + using Socket client = pair.Client; + using Socket server = pair.Server; + + Task[] receives = new Task[16]; + for (int r = 0; r < receives.Length; r++) + { + receives[r] = ToTask(server.ReceiveAsync(new byte[1], SocketFlags.None)); + } + + client.Dispose(); + server.Dispose(); + + for (int r = 0; r < receives.Length; r++) + { + Exception? ex = await Record.ExceptionAsync(async () => await receives[r]); + if (ex is SocketException socketException) + { + Assert.True( + socketException.SocketErrorCode == SocketError.ConnectionReset || + socketException.SocketErrorCode == SocketError.OperationAborted || + socketException.SocketErrorCode == SocketError.Interrupted, + $"Unexpected socket error: {socketException.SocketErrorCode}"); + } + else if (ex is not ObjectDisposedException and not null) + { + throw ex; + } + } + } + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringOptIn_ConcurrentCloseWithPendingReceive_DoesNotHang() + { + await RemoteExecutor.Invoke(static async () => + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(16); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + byte[] receiveBuffer = new byte[1]; + for (int i = 0; i < 64; i++) + { + var pair = await AcceptConnectedTcpPairAsync(listener, endpoint); + using Socket client = pair.Client; + using Socket server = pair.Server; + + var pendingReceive = server.ReceiveAsync(receiveBuffer, SocketFlags.None); + + // Force teardown while an async receive is pending. + client.Dispose(); + + Exception? ex = await Record.ExceptionAsync(async () => await pendingReceive); + if (ex is SocketException socketException) + { + Assert.True( + socketException.SocketErrorCode == SocketError.ConnectionReset || + socketException.SocketErrorCode == SocketError.OperationAborted || + socketException.SocketErrorCode == SocketError.Interrupted, + $"Unexpected socket error: {socketException.SocketErrorCode}"); + } + else if (ex is not null) + { + throw ex; + } + } + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringOptIn_ConcurrentRegistrationChurn_DoesNotHang() + { + await RemoteExecutor.Invoke(static async () => + { + const int WorkerCount = 8; + const int IterationsPerWorker = 64; + + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(WorkerCount * 2); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + var workers = new Task[WorkerCount]; + for (int worker = 0; worker < WorkerCount; worker++) + { + workers[worker] = Task.Run(async () => + { + byte[] sendBuffer = new byte[] { 0x5A }; + byte[] receiveBuffer = new byte[1]; + + for (int i = 0; i < IterationsPerWorker; i++) + { + var pair = await AcceptConnectedTcpPairAsync(listener, endpoint); + using Socket client = pair.Client; + using Socket server = pair.Server; + + var pendingReceive = server.ReceiveAsync(receiveBuffer, SocketFlags.None); + await Task.Yield(); + + if ((i & 1) == 0) + { + int sent = await client.SendAsync(sendBuffer, SocketFlags.None); + Assert.Equal(1, sent); + } + else + { + client.Dispose(); + } + + Exception? ex = await Record.ExceptionAsync(async () => await pendingReceive); + if (ex is SocketException socketException) + { + Assert.True( + socketException.SocketErrorCode == SocketError.ConnectionReset || + socketException.SocketErrorCode == SocketError.OperationAborted || + socketException.SocketErrorCode == SocketError.Interrupted, + $"Unexpected socket error: {socketException.SocketErrorCode}"); + } + else if (ex is not ObjectDisposedException and not null) + { + throw ex; + } + } + }); + } + + await Task.WhenAll(workers); + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringOptIn_RepeatedRunStabilityGate() + { + await RemoteExecutor.Invoke(static async () => + { + const int Iterations = 50; + for (int i = 0; i < Iterations; i++) + { + await RunTcpRoundTripAsync(8); + } + }, CreateSocketEngineOptions()).DisposeAsync(); + } + } +} diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/MpscQueueTests.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/MpscQueueTests.cs new file mode 100644 index 00000000000000..ac95d87867222c --- /dev/null +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/MpscQueueTests.cs @@ -0,0 +1,204 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Reflection; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.DotNet.XUnitExtensions; +using Xunit; + +namespace System.Net.Sockets.Tests +{ + [PlatformSpecific(TestPlatforms.Linux)] // MPSC queue is used by Linux io_uring paths. 
+ public class MpscQueueTests + { + private sealed class QueueProxy + { + private static readonly Type s_queueType = GetQueueType(); + + private readonly object _instance; + private readonly MethodInfo _enqueueMethod; + private readonly MethodInfo _tryDequeueMethod; + private readonly PropertyInfo _isEmptyProperty; + + public QueueProxy(int segmentSize) + { + ConstructorInfo ctor = s_queueType.GetConstructor( + BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic, + binder: null, + new[] { typeof(int) }, + modifiers: null)!; + + _instance = ctor.Invoke(new object[] { segmentSize }); + _enqueueMethod = s_queueType.GetMethod("Enqueue", BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic)!; + _tryDequeueMethod = s_queueType.GetMethod("TryDequeue", BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic)!; + _isEmptyProperty = s_queueType.GetProperty("IsEmpty", BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic)!; + } + + public void Enqueue(T item) => _enqueueMethod.Invoke(_instance, new object?[] { item }); + + public bool TryDequeue(out T item) + { + object?[] args = new object?[] { default(T) }; + bool result = (bool)_tryDequeueMethod.Invoke(_instance, args)!; + item = result ? (T)args[0]! : default!; + return result; + } + + public bool IsEmpty => (bool)_isEmptyProperty.GetValue(_instance)!; + + private static Type GetQueueType() + { + Type? 
openGeneric = typeof(Socket).Assembly.GetType("System.Net.Sockets.MpscQueue`1", throwOnError: false, ignoreCase: false); + Assert.NotNull(openGeneric); + return openGeneric!.MakeGenericType(typeof(T)); + } + } + + [Fact] + public void MpscQueue_SingleProducerSingleConsumer_PreservesOrder() + { + const int count = 1024; + var queue = new QueueProxy(segmentSize: 16); + + for (int i = 0; i < count; i++) + { + queue.Enqueue(i); + } + + for (int i = 0; i < count; i++) + { + Assert.True(queue.TryDequeue(out int value)); + Assert.Equal(i, value); + } + + Assert.True(queue.IsEmpty); + Assert.False(queue.TryDequeue(out _)); + } + + [Fact] + public async Task MpscQueue_MultiProducerSingleConsumer_ReceivesAllItems() + { + const int producerCount = 4; + const int itemsPerProducer = 2000; + const int totalItems = producerCount * itemsPerProducer; + var queue = new QueueProxy(segmentSize: 32); + + Task[] producers = new Task[producerCount]; + for (int p = 0; p < producerCount; p++) + { + int producerIndex = p; + producers[p] = Task.Run(() => + { + int baseValue = producerIndex * itemsPerProducer; + for (int i = 0; i < itemsPerProducer; i++) + { + queue.Enqueue(baseValue + i); + } + }); + } + + var seen = new bool[totalItems]; + int received = 0; + var spin = new SpinWait(); + while (received < totalItems) + { + if (queue.TryDequeue(out int value)) + { + Assert.InRange(value, 0, totalItems - 1); + Assert.False(seen[value], $"duplicate dequeue value: {value}"); + seen[value] = true; + received++; + } + else + { + spin.SpinOnce(); + } + } + + await Task.WhenAll(producers); + Assert.All(seen, Assert.True); + Assert.True(queue.IsEmpty); + } + + [Fact] + public void MpscQueue_EmptyQueue_ReportsEmptyAndTryDequeueFalse() + { + var queue = new QueueProxy(segmentSize: 8); + + Assert.True(queue.IsEmpty); + Assert.False(queue.TryDequeue(out _)); + } + + [Fact] + public void MpscQueue_SegmentCrossing_WorksAcrossMultipleSegments() + { + const int count = 37; + var queue = new 
QueueProxy(segmentSize: 2); + + for (int i = 0; i < count; i++) + { + queue.Enqueue(i); + } + + for (int i = 0; i < count; i++) + { + Assert.True(queue.TryDequeue(out int value)); + Assert.Equal(i, value); + } + + Assert.True(queue.IsEmpty); + } + + [Fact] + public async Task MpscQueue_Stress_NoLossAndNoDeadlock() + { + const int producerCount = 6; + const int itemsPerProducer = 4000; + const int totalItems = producerCount * itemsPerProducer; + var queue = new QueueProxy(segmentSize: 32); + + Task[] producers = new Task[producerCount]; + for (int p = 0; p < producerCount; p++) + { + int producerIndex = p; + producers[p] = Task.Run(() => + { + int baseValue = producerIndex * itemsPerProducer; + for (int i = 0; i < itemsPerProducer; i++) + { + queue.Enqueue(baseValue + i); + } + }); + } + + var seen = new HashSet(); + int received = 0; + var timeout = Stopwatch.StartNew(); + while (received < totalItems) + { + if (timeout.Elapsed > TimeSpan.FromSeconds(30)) + { + throw new TimeoutException($"Timed out draining MPSC queue. 
received={received}, expected={totalItems}"); + } + + if (queue.TryDequeue(out int value)) + { + Assert.True(seen.Add(value), $"duplicate dequeue value: {value}"); + received++; + } + else + { + await Task.Yield(); + } + } + + await Task.WhenAll(producers); + Assert.Equal(totalItems, seen.Count); + Assert.True(queue.IsEmpty); + } + } +} diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/System.Net.Sockets.Tests.csproj b/src/libraries/System.Net.Sockets/tests/FunctionalTests/System.Net.Sockets.Tests.csproj index 43844aea397681..33b2dae6ea89f2 100644 --- a/src/libraries/System.Net.Sockets/tests/FunctionalTests/System.Net.Sockets.Tests.csproj +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/System.Net.Sockets.Tests.csproj @@ -6,7 +6,90 @@ true true true + + default + <_SocketsIoUringTestModeSupported Condition="'$(SocketsIoUringTestMode)' == 'default' or '$(SocketsIoUringTestMode)' == 'enabled' or '$(SocketsIoUringTestMode)' == 'disabled'">true + true + + + + + + + + + + + + + + + + + <_IoUringVariantsRoot>$([MSBuild]::NormalizeDirectory('$(IntermediateOutputPath)', 'io_uring_variants')) + <_IoUringEnabledDir>$([MSBuild]::NormalizeDirectory('$(_IoUringVariantsRoot)', 'enabled')) + <_IoUringDisabledDir>$([MSBuild]::NormalizeDirectory('$(_IoUringVariantsRoot)', 'disabled')) + <_RunScriptName>RunTests.sh + <_EnabledArchivePath>$([MSBuild]::NormalizePath('$(TestArchiveTestsDir)', '$(TestProjectName).io_uring_enabled.zip')) + <_DisabledArchivePath>$([MSBuild]::NormalizePath('$(TestArchiveTestsDir)', '$(TestProjectName).io_uring_disabled.zip')) + + + + + + + <_OutDirFiles Include="$(OutDir)**/*" /> + + + + + + + + + + <_EnabledRunScriptLines Include="#!/usr/bin/env bash" /> + <_EnabledRunScriptLines Include="set -euo pipefail" /> + <_EnabledRunScriptLines Include="export DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1" /> + <_EnabledRunScriptLines Include="exec ./RunTests.base.sh "$@"" /> + <_DisabledRunScriptLines Include="#!/usr/bin/env bash" /> + 
<_DisabledRunScriptLines Include="set -euo pipefail" /> + <_DisabledRunScriptLines Include="export DOTNET_SYSTEM_NET_SOCKETS_IO_URING=0" /> + <_DisabledRunScriptLines Include="exec ./RunTests.base.sh "$@"" /> + + + + + + + + + + + @@ -22,6 +105,8 @@ + + diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/TelemetryTest.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/TelemetryTest.cs index 69f61fc180a49c..ba53804049058c 100644 --- a/src/libraries/System.Net.Sockets/tests/FunctionalTests/TelemetryTest.cs +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/TelemetryTest.cs @@ -7,6 +7,8 @@ using System.Diagnostics.Tracing; using System.Linq; using System.Net.Test.Common; +using System.Reflection; +using System.Runtime.InteropServices; using System.Threading; using System.Threading.Tasks; using Microsoft.DotNet.RemoteExecutor; @@ -20,6 +22,18 @@ public class TelemetryTest { private const string ActivitySourceName = "Experimental.System.Net.Sockets"; private const string ActivityName = ActivitySourceName + ".Connect"; + private static readonly string[] s_ioUringCounterNames = GetIoUringCounterNames(); + private static readonly string[] s_expectedIoUringCounterNames = new[] + { + "io-uring-completion-slot-exhaustions", + "io-uring-cq-overflow", + "io-uring-prepare-nonpinnable-fallbacks", + "io-uring-prepare-queue-overflow-fallbacks", + "io-uring-prepare-queue-overflows", + "io-uring-socket-event-buffer-full", + "io-uring-sqpoll-submissions-skipped", + "io-uring-sqpoll-wakeups" + }; private static readonly Lazy> s_remoteServerIsReachable = new Lazy>(() => Task.Run(async () => { @@ -46,6 +60,24 @@ public TelemetryTest(ITestOutputHelper output) _output = output; } + private static string[] GetIoUringCounterNames() + { + Type? 
counterNamesType = + typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketsTelemetry+IoUringCounterNames", throwOnError: false); + + if (counterNamesType is null) + { + return Array.Empty(); + } + + return counterNamesType + .GetFields(BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Static) + .Where(field => field.IsLiteral && !field.IsInitOnly && field.FieldType == typeof(string)) + .Select(field => (string)field.GetRawConstantValue()!) + .OrderBy(name => name, StringComparer.Ordinal) + .ToArray(); + } + [Fact] [ActiveIssue("https://github.com/dotnet/runtime/issues/107981", TestPlatforms.Wasi)] public static void EventSource_ExistsWithCorrectId() @@ -59,6 +91,160 @@ public static void EventSource_ExistsWithCorrectId() Assert.NotEmpty(EventSource.GenerateManifest(esType, esType.Assembly.Location)); } + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // Socket engine backend event is emitted by Linux engine initialization. 
+ [ActiveIssue("https://github.com/dotnet/runtime/issues/107981", TestPlatforms.Wasi)] + public async Task EventSource_SocketEngineBackendSelected_Emitted() + { + await RemoteExecutor.Invoke(async () => + { + using var listener = new TestEventListener("System.Net.Sockets", EventLevel.Verbose, 0.1); + listener.AddActivityTracking(); + + var events = new ConcurrentQueue<(EventWrittenEventArgs Event, Guid ActivityId)>(); + await listener.RunWithCallbackAsync(e => events.Enqueue((e, e.ActivityId)), async () => + { + using var server = new Socket(SocketType.Stream, ProtocolType.Tcp); + server.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + server.Listen(); + + using var client = new Socket(SocketType.Stream, ProtocolType.Tcp); + Task connectTask = client.ConnectAsync(server.LocalEndPoint); + using var accepted = await server.AcceptAsync(); + await connectTask; + + await WaitForEventAsync(events, "SocketEngineBackendSelected"); + }); + + EventWrittenEventArgs[] backendEvents = events + .Where(e => e.Event.EventName == "SocketEngineBackendSelected") + .Select(e => e.Event) + .ToArray(); + + Assert.NotEmpty(backendEvents); + foreach (EventWrittenEventArgs backendEvent in backendEvents) + { + Assert.Equal(3, backendEvent.Payload?.Count ?? 0); + string backend = Assert.IsType(backendEvent.Payload![0]); + int isIoUringPort = Convert.ToInt32(backendEvent.Payload[1]); + int sqPollEnabled = Convert.ToInt32(backendEvent.Payload[2]); + + Assert.True( + backend == "epoll" || backend == "io_uring_completion", + $"Unexpected backend payload: {backend}"); + Assert.Equal(backend == "io_uring_completion" ? 1 : 0, isIoUringPort); + Assert.True(sqPollEnabled == 0 || sqPollEnabled == 1, $"Unexpected sqpoll payload: {sqPollEnabled}"); + if (backend == "epoll") + { + Assert.Equal(0, sqPollEnabled); + } + } + }).DisposeAsync(); + } + + [Fact] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring interop types are Linux-only. 
+ [ActiveIssue("https://github.com/dotnet/runtime/issues/107981", TestPlatforms.Wasi)] + public static void IoUringSocketEventPortDiagnostics_LayoutContract() + { + Type? type = typeof(Socket).Assembly.GetType("Interop+Sys+IoUringSocketEventPortDiagnostics", throwOnError: false, ignoreCase: false); + if (type is null) + { + return; + } + + Assert.True(type.IsLayoutSequential); + + Assert.Equal(0, Marshal.OffsetOf(type, "AsyncCancelRequestCqeCount").ToInt32()); + Assert.Equal(8, Marshal.OffsetOf(type, "AsyncCancelRequestCqeEnoentCount").ToInt32()); + Assert.Equal(16, Marshal.OffsetOf(type, "AsyncCancelRequestCqeEalreadyCount").ToInt32()); + Assert.Equal(24, Marshal.OffsetOf(type, "AsyncCancelRequestCqeOtherCount").ToInt32()); + Assert.Equal(32, Marshal.OffsetOf(type, "SocketEventBufferFullCount").ToInt32()); + + if (type.GetField("CompletionBufferFullCount", BindingFlags.Public | BindingFlags.Instance) is not null) + { + Assert.Equal(40, Marshal.OffsetOf(type, "CompletionBufferFullCount").ToInt32()); + Assert.Equal(48, Marshal.OffsetOf(type, "UnsupportedOpcodePrepareCount").ToInt32()); + Assert.Equal(56, Marshal.OffsetOf(type, "CqOverflowCount").ToInt32()); + Assert.Equal(64, Marshal.SizeOf(type)); + } + else + { + Assert.Equal(40, Marshal.OffsetOf(type, "UnsupportedOpcodePrepareCount").ToInt32()); + Assert.Equal(48, Marshal.OffsetOf(type, "CqOverflowCount").ToInt32()); + Assert.Equal(56, Marshal.SizeOf(type)); + } + } + + [Fact] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring interop types are Linux-only. 
+        [ActiveIssue("https://github.com/dotnet/runtime/issues/107981", TestPlatforms.Wasi)]
+        public static void IoUringProvidedBufferInterop_LayoutContract()
+        {
+            // Offsets/sizes below pin the managed mirrors of the kernel's io_uring buffer-ring structs.
+            Type ioUringBufType = GetInteropSysNestedType("IoUringBuf");
+            Assert.True(ioUringBufType.IsExplicitLayout);
+            Assert.Equal(0, Marshal.OffsetOf(ioUringBufType, "Address").ToInt32());
+            Assert.Equal(8, Marshal.OffsetOf(ioUringBufType, "Length").ToInt32());
+            Assert.Equal(12, Marshal.OffsetOf(ioUringBufType, "BufferId").ToInt32());
+            Assert.Equal(14, Marshal.OffsetOf(ioUringBufType, "Reserved").ToInt32());
+            Assert.Equal(16, Marshal.SizeOf(ioUringBufType)); // total size must stay 16 bytes
+
+            Type ioUringBufRingHeaderType = GetInteropSysNestedType("IoUringBufRingHeader");
+            Assert.True(ioUringBufRingHeaderType.IsExplicitLayout);
+            Assert.Equal(0, Marshal.OffsetOf(ioUringBufRingHeaderType, "Reserved1").ToInt32());
+            Assert.Equal(8, Marshal.OffsetOf(ioUringBufRingHeaderType, "Reserved2").ToInt32());
+            Assert.Equal(12, Marshal.OffsetOf(ioUringBufRingHeaderType, "Reserved3").ToInt32());
+            Assert.Equal(14, Marshal.OffsetOf(ioUringBufRingHeaderType, "Tail").ToInt32());
+            Assert.Equal(16, Marshal.SizeOf(ioUringBufRingHeaderType));
+
+            Type ioUringBufRegType = GetInteropSysNestedType("IoUringBufReg");
+            Assert.True(ioUringBufRegType.IsExplicitLayout);
+            Assert.Equal(0, Marshal.OffsetOf(ioUringBufRegType, "RingAddress").ToInt32());
+            Assert.Equal(8, Marshal.OffsetOf(ioUringBufRegType, "RingEntries").ToInt32());
+            Assert.Equal(12, Marshal.OffsetOf(ioUringBufRegType, "BufferGroupId").ToInt32());
+            Assert.Equal(14, Marshal.OffsetOf(ioUringBufRegType, "Padding").ToInt32());
+            Assert.Equal(16, Marshal.OffsetOf(ioUringBufRegType, "Reserved0").ToInt32());
+            Assert.Equal(24, Marshal.OffsetOf(ioUringBufRegType, "Reserved1").ToInt32());
+            Assert.Equal(32, Marshal.OffsetOf(ioUringBufRegType, "Reserved2").ToInt32());
+            Assert.Equal(40, Marshal.SizeOf(ioUringBufRegType));
+        }
+
+        [Fact]
+        [PlatformSpecific(TestPlatforms.Linux)] // io_uring interop types are Linux-only.
+        [ActiveIssue("https://github.com/dotnet/runtime/issues/107981", TestPlatforms.Wasi)]
+        public static void IoUringCompletionInteropType_IsAbsent()
+        {
+            // Reflection probe: asserts the type was removed/never added, not merely renamed.
+            Type? type = typeof(Socket).Assembly.GetType("Interop+Sys+IoUringCompletion", throwOnError: false, ignoreCase: false);
+            Assert.Null(type);
+        }
+
+        [Fact]
+        [PlatformSpecific(TestPlatforms.AnyUnix)]
+        [ActiveIssue("https://github.com/dotnet/runtime/issues/107981", TestPlatforms.Wasi)]
+        public static void MessageHeaderAndIoVector_LayoutContract()
+        {
+            Type messageHeaderType = GetInteropSysNestedType("MessageHeader");
+            Type ioVectorType = GetInteropSysNestedType("IOVector");
+
+            Assert.True(messageHeaderType.IsLayoutSequential);
+            Assert.True(ioVectorType.IsLayoutSequential);
+
+            int pointerSize = IntPtr.Size; // layout is pointer-size dependent (32- vs 64-bit)
+
+            Assert.Equal(0, Marshal.OffsetOf(ioVectorType, "Base").ToInt32());
+            Assert.Equal(pointerSize, Marshal.OffsetOf(ioVectorType, "Count").ToInt32());
+            Assert.Equal(pointerSize * 2, Marshal.SizeOf(ioVectorType));
+
+            Assert.Equal(0, Marshal.OffsetOf(messageHeaderType, "SocketAddress").ToInt32());
+            Assert.Equal(pointerSize, Marshal.OffsetOf(messageHeaderType, "IOVectors").ToInt32());
+            Assert.Equal(pointerSize * 2, Marshal.OffsetOf(messageHeaderType, "ControlBuffer").ToInt32());
+            Assert.Equal(pointerSize * 3, Marshal.OffsetOf(messageHeaderType, "SocketAddressLen").ToInt32());
+            Assert.Equal(pointerSize * 3 + sizeof(int), Marshal.OffsetOf(messageHeaderType, "IOVectorCount").ToInt32());
+            Assert.Equal(pointerSize * 3 + sizeof(int) * 2, Marshal.OffsetOf(messageHeaderType, "ControlBufferLen").ToInt32());
+            Assert.Equal(pointerSize * 3 + sizeof(int) * 3, Marshal.OffsetOf(messageHeaderType, "Flags").ToInt32());
+            Assert.Equal(pointerSize * 3 + sizeof(int) * 4, Marshal.SizeOf(messageHeaderType));
+        }
         public static IEnumerable SocketMethods_MemberData()
         {
             if (!OperatingSystem.IsWasi()) yield return new[] { "Sync" };
@@ -110,6 +296,13 @@ private static SocketHelperBase GetHelperBase(string socketMethod)
         };
     }
+
+        private static Type GetInteropSysNestedType(string nestedTypeName)
+        {
+            // "Interop+Sys" is the CLR-mangled name of the nested Interop.Sys class.
+            Type? type = typeof(Socket).Assembly.GetType($"Interop+Sys+{nestedTypeName}", throwOnError: false, ignoreCase: false);
+            Assert.NotNull(type);
+            return type!;
+        }
+
     [ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
     [MemberData(nameof(SocketMethods_WithBools_MemberData))]
     public async Task Connect_Success_ActivityRecorded(string connectMethod, bool ipv6)
@@ -664,6 +857,20 @@ private static void VerifyEventCounters(ConcurrentQueue<(EventWrittenEventArgs E
         {
             Assert.True(datagramsSent[^1] > 0);
         }
+
+            // Guard against telemetry drift: verify every canonical io_uring counter name exists.
+            // io_uring counters are only registered on Linux (OnEventCommand returns early on non-Linux).
+            if (OperatingSystem.IsLinux())
+            {
+                Assert.Equal(s_expectedIoUringCounterNames, s_ioUringCounterNames);
+                foreach (string counterName in s_ioUringCounterNames)
+                {
+                    Assert.True(
+                        eventCounters.TryGetValue(counterName, out double[] ioUringCounterValues),
+                        $"Missing io_uring EventCounter '{counterName}'.");
+                    Assert.True(ioUringCounterValues[^1] >= 0, $"Unexpected negative counter value for '{counterName}'.");
+                }
+            }
     }
     }
 }
diff --git a/src/libraries/Common/src/Interop/Unix/System.Native/pal_config.h.in b/src/native/libs/Common/pal_config.h.in
index abc93358e69f6c..7f6475eeff8f0b 100644
--- a/src/native/libs/Common/pal_config.h.in
+++ b/src/native/libs/Common/pal_config.h.in
@@ -56,6 +56,7 @@
 #cmakedefine01 HAVE_ETHTOOL_H
 #cmakedefine01 HAVE_SYS_POLL_H
 #cmakedefine01 HAVE_EPOLL
+#cmakedefine01 HAVE_LINUX_IO_URING_H
 #cmakedefine01 HAVE_GETHOSTNAME
 #cmakedefine01 HAVE_GETNAMEINFO
 #cmakedefine01 HAVE_SOCKADDR_UN_SUN_PATH
diff --git a/src/native/libs/System.Native/CMakeLists.txt b/src/native/libs/System.Native/CMakeLists.txt
index 975e7c5d7c2bdf..f37d3e38981bac 100644
--- a/src/native/libs/System.Native/CMakeLists.txt
+++ b/src/native/libs/System.Native/CMakeLists.txt
@@ -11,6 +11,7 @@
set(NATIVE_SOURCES pal_maphardwaretype.c pal_memory.c pal_networking.c + pal_io_uring_shim.c pal_networkstatistics.c pal_random.c pal_runtimeinformation.c diff --git a/src/native/libs/System.Native/entrypoints.c b/src/native/libs/System.Native/entrypoints.c index 8414814970ea5c..03f11424e6603f 100644 --- a/src/native/libs/System.Native/entrypoints.c +++ b/src/native/libs/System.Native/entrypoints.c @@ -20,6 +20,7 @@ #include "pal_networkchange.h" #include "pal_networking.h" #include "pal_networkstatistics.h" +#include "pal_io_uring_shim.h" #include "pal_process.h" #include "pal_random.h" #include "pal_runtimeinformation.h" @@ -191,6 +192,16 @@ static const Entry s_sysNative[] = DllImportEntry(SystemNative_FreeSocketEventBuffer) DllImportEntry(SystemNative_TryChangeSocketEventRegistration) DllImportEntry(SystemNative_WaitForSocketEvents) + DllImportEntry(SystemNative_IoUringShimSetup) + DllImportEntry(SystemNative_IoUringShimEnter) + DllImportEntry(SystemNative_IoUringShimEnterExt) + DllImportEntry(SystemNative_IoUringShimRegister) + DllImportEntry(SystemNative_IoUringShimMmap) + DllImportEntry(SystemNative_IoUringShimMunmap) + DllImportEntry(SystemNative_IoUringShimCreateEventFd) + DllImportEntry(SystemNative_IoUringShimWriteEventFd) + DllImportEntry(SystemNative_IoUringShimReadEventFd) + DllImportEntry(SystemNative_IoUringShimCloseFd) DllImportEntry(SystemNative_GetWasiSocketDescriptor) DllImportEntry(SystemNative_PlatformSupportsDualModeIPv4PacketInfo) DllImportEntry(SystemNative_GetDomainSocketSizes) diff --git a/src/native/libs/System.Native/pal_io_uring_shim.c b/src/native/libs/System.Native/pal_io_uring_shim.c new file mode 100644 index 00000000000000..230f5d43ea3686 --- /dev/null +++ b/src/native/libs/System.Native/pal_io_uring_shim.c @@ -0,0 +1,305 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+
+#include "pal_config.h"
+#include "pal_io_uring_shim.h"
+#include "pal_errno.h"
+
+#include <errno.h> // NOTE(review): include target lost in extraction; reconstructed from usage of errno below - confirm
+#include <stddef.h> // NOTE(review): reconstructed; offsetof is used by the layout asserts - confirm
+#include <stdint.h> // NOTE(review): reconstructed; fixed-width types used throughout - confirm
+
+#if HAVE_LINUX_IO_URING_H && HAVE_SYS_POLL_H
+#include <linux/io_uring.h> // NOTE(review): reconstructed; struct io_uring_* used in layout asserts - confirm
+#include <poll.h> // NOTE(review): reconstructed; gate requires HAVE_SYS_POLL_H - confirm
+#include <sys/eventfd.h> // NOTE(review): reconstructed; eventfd()/EFD_* used below - confirm
+#include <sys/mman.h> // NOTE(review): reconstructed; mmap()/munmap()/MAP_* used below - confirm
+#include <sys/syscall.h> // NOTE(review): reconstructed; __NR_/SYS_ syscall numbers probed below - confirm
+#include <unistd.h> // NOTE(review): reconstructed; syscall()/read()/write()/close() used below - confirm
+#endif
+
+#include <assert.h> // NOTE(review): original include target lost in extraction; could not be inferred from usage - confirm against upstream
+
+// Mirror the syscall-number defines from pal_io_uring.c for setup and enter.
+// Register is gated separately because __NR_io_uring_register may not exist.
+#if HAVE_LINUX_IO_URING_H && HAVE_SYS_POLL_H && \
+    (defined(__NR_io_uring_setup) || defined(SYS_io_uring_setup)) && \
+    (defined(__NR_io_uring_enter) || defined(SYS_io_uring_enter))
+#define SHIM_HAVE_IO_URING 1
+#else
+#define SHIM_HAVE_IO_URING 0
+#endif
+
+#if SHIM_HAVE_IO_URING
+
+#if defined(__NR_io_uring_setup)
+#define IO_URING_SYSCALL_SETUP __NR_io_uring_setup
+#else
+#define IO_URING_SYSCALL_SETUP SYS_io_uring_setup
+#endif
+
+#if defined(__NR_io_uring_enter)
+#define IO_URING_SYSCALL_ENTER __NR_io_uring_enter
+#else
+#define IO_URING_SYSCALL_ENTER SYS_io_uring_enter
+#endif
+
+#if defined(__NR_io_uring_register) || defined(SYS_io_uring_register)
+#define SHIM_HAVE_IO_URING_REGISTER 1
+#if defined(__NR_io_uring_register)
+#define IO_URING_SYSCALL_REGISTER __NR_io_uring_register
+#else
+#define IO_URING_SYSCALL_REGISTER SYS_io_uring_register
+#endif
+#else
+#define SHIM_HAVE_IO_URING_REGISTER 0
+#endif
+
+// The io_uring_getevents_arg struct for IORING_ENTER_EXT_ARG.
+// Defined locally to avoid dependency on kernel header version.
+typedef struct ShimIoUringGeteventsArg
+{
+    uint64_t sigmask;
+    uint32_t sigmask_sz;
+    uint32_t min_wait_usec; // named "pad" in older kernel headers; semantics per newer kernels
+    uint64_t ts;
+} ShimIoUringGeteventsArg;
+
+int32_t SystemNative_IoUringShimSetup(uint32_t entries, void* params, int32_t* ringFd) // io_uring_setup(2); writes ring fd on success
+{
+    int fd = (int)syscall(IO_URING_SYSCALL_SETUP, entries, params);
+    if (fd < 0)
+    {
+        return SystemNative_ConvertErrorPlatformToPal(errno);
+    }
+
+    *ringFd = fd;
+    return Error_SUCCESS;
+}
+
+int32_t SystemNative_IoUringShimEnter(int32_t ringFd, uint32_t toSubmit, uint32_t minComplete, uint32_t flags, int32_t* result) // io_uring_enter(2); retries on EINTR
+{
+    int ret;
+    while ((ret = (int)syscall(IO_URING_SYSCALL_ENTER, ringFd, toSubmit, minComplete, flags, NULL, 0)) < 0 && errno == EINTR);
+
+    if (ret < 0)
+    {
+        return SystemNative_ConvertErrorPlatformToPal(errno);
+    }
+
+    *result = ret;
+    return Error_SUCCESS;
+}
+
+int32_t SystemNative_IoUringShimEnterExt(int32_t ringFd, uint32_t toSubmit, uint32_t minComplete, uint32_t flags, void* arg, int32_t* result) // io_uring_enter2 with IORING_ENTER_EXT_ARG
+{
+    int ret;
+    while ((ret = (int)syscall(IO_URING_SYSCALL_ENTER, ringFd, toSubmit, minComplete, flags, arg, arg == NULL ? 0 : sizeof(ShimIoUringGeteventsArg))) < 0 && errno == EINTR);
+
+    if (ret < 0)
+    {
+        return SystemNative_ConvertErrorPlatformToPal(errno);
+    }
+
+    *result = ret;
+    return Error_SUCCESS;
+}
+
+int32_t SystemNative_IoUringShimRegister(int32_t ringFd, uint32_t opcode, void* arg, uint32_t nrArgs, int32_t* result) // io_uring_register(2); ENOSYS when syscall number is unknown
+{
+#if SHIM_HAVE_IO_URING_REGISTER
+    int ret;
+    while ((ret = (int)syscall(IO_URING_SYSCALL_REGISTER, ringFd, opcode, arg, nrArgs)) < 0 && errno == EINTR);
+
+    if (ret < 0)
+    {
+        return SystemNative_ConvertErrorPlatformToPal(errno);
+    }
+
+    *result = ret;
+    return Error_SUCCESS;
+#else
+    (void)ringFd;
+    (void)opcode;
+    (void)arg;
+    (void)nrArgs;
+    (void)result;
+    return Error_ENOSYS;
+#endif
+}
+
+int32_t SystemNative_IoUringShimMmap(int32_t ringFd, uint64_t size, uint64_t offset, void** mappedPtr) // maps SQ/CQ ring memory; shared + prefaulted
+{
+    void* ptr = mmap(NULL, (size_t)size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, ringFd, (off_t)offset); // NULL (was 0) for the address hint, per mmap(2)
+    if (ptr == MAP_FAILED)
+    {
+        return SystemNative_ConvertErrorPlatformToPal(errno);
+    }
+
+    *mappedPtr = ptr;
+    return Error_SUCCESS;
+}
+
+int32_t SystemNative_IoUringShimMunmap(void* addr, uint64_t size)
+{
+    if (munmap(addr, (size_t)size) != 0)
+    {
+        return SystemNative_ConvertErrorPlatformToPal(errno);
+    }
+
+    return Error_SUCCESS;
+}
+
+int32_t SystemNative_IoUringShimCreateEventFd(int32_t* eventFd) // non-blocking, close-on-exec wakeup fd
+{
+    int fd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
+    if (fd < 0)
+    {
+        return SystemNative_ConvertErrorPlatformToPal(errno);
+    }
+
+    *eventFd = fd;
+    return Error_SUCCESS;
+}
+
+int32_t SystemNative_IoUringShimWriteEventFd(int32_t eventFd) // adds 1 to the eventfd counter; retries on EINTR
+{
+    uint64_t val = 1;
+    ssize_t written;
+    while ((written = write(eventFd, &val, sizeof(val))) < 0 && errno == EINTR);
+
+    if (written < 0)
+    {
+        return SystemNative_ConvertErrorPlatformToPal(errno);
+    }
+
+    if (written != (ssize_t)sizeof(val)) // eventfd I/O is all-or-nothing 8 bytes
+    {
+        return Error_EIO;
+    }
+
+    return Error_SUCCESS;
+}
+
+int32_t SystemNative_IoUringShimReadEventFd(int32_t eventFd, uint64_t* value) // drains the eventfd counter; retries on EINTR
+{
+    ssize_t bytesRead;
+    while ((bytesRead = read(eventFd, value, sizeof(*value))) < 0 && errno == EINTR);
+
+    if (bytesRead < 0)
+    {
+        return SystemNative_ConvertErrorPlatformToPal(errno);
+    }
+
+    if ((size_t)bytesRead != sizeof(*value))
+    {
+        return Error_EIO;
+    }
+
+    return Error_SUCCESS;
+}
+
+int32_t SystemNative_IoUringShimCloseFd(int32_t fd)
+{
+    if (close(fd) != 0)
+    {
+        return SystemNative_ConvertErrorPlatformToPal(errno);
+    }
+
+    return Error_SUCCESS;
+}
+
+// Layout assertions for managed interop structs (kernel struct mirrors).
+c_static_assert(sizeof(struct io_uring_cqe) == 16);
+c_static_assert(offsetof(struct io_uring_cqe, user_data) == 0);
+c_static_assert(offsetof(struct io_uring_cqe, res) == 8);
+c_static_assert(offsetof(struct io_uring_cqe, flags) == 12);
+
+c_static_assert(sizeof(struct io_uring_params) == 120);
+c_static_assert(offsetof(struct io_uring_params, sq_entries) == 0);
+c_static_assert(offsetof(struct io_uring_params, cq_entries) == 4);
+c_static_assert(offsetof(struct io_uring_params, flags) == 8);
+c_static_assert(offsetof(struct io_uring_params, features) == 20);
+c_static_assert(offsetof(struct io_uring_params, sq_off) == 40);
+c_static_assert(offsetof(struct io_uring_params, cq_off) == 80);
+
+c_static_assert(sizeof(struct io_sqring_offsets) == 40);
+c_static_assert(offsetof(struct io_sqring_offsets, head) == 0);
+c_static_assert(offsetof(struct io_sqring_offsets, tail) == 4);
+c_static_assert(offsetof(struct io_sqring_offsets, ring_mask) == 8);
+c_static_assert(offsetof(struct io_sqring_offsets, ring_entries) == 12);
+c_static_assert(offsetof(struct io_sqring_offsets, flags) == 16);
+c_static_assert(offsetof(struct io_sqring_offsets, dropped) == 20);
+c_static_assert(offsetof(struct io_sqring_offsets, array) == 24);
+
+c_static_assert(sizeof(struct io_cqring_offsets) == 40);
+c_static_assert(offsetof(struct io_cqring_offsets, head) == 0);
+c_static_assert(offsetof(struct io_cqring_offsets, tail) == 4);
+c_static_assert(offsetof(struct io_cqring_offsets, overflow) == 16);
+c_static_assert(offsetof(struct io_cqring_offsets, cqes) == 20);
+
+#else // !SHIM_HAVE_IO_URING
+
+// Stub implementations when io_uring is not available.
+
+int32_t SystemNative_IoUringShimSetup(uint32_t entries, void* params, int32_t* ringFd)
+{
+    (void)entries; (void)params; (void)ringFd;
+    return Error_ENOSYS;
+}
+
+int32_t SystemNative_IoUringShimEnter(int32_t ringFd, uint32_t toSubmit, uint32_t minComplete, uint32_t flags, int32_t* result)
+{
+    (void)ringFd; (void)toSubmit; (void)minComplete; (void)flags; (void)result;
+    return Error_ENOSYS;
+}
+
+int32_t SystemNative_IoUringShimEnterExt(int32_t ringFd, uint32_t toSubmit, uint32_t minComplete, uint32_t flags, void* arg, int32_t* result)
+{
+    (void)ringFd; (void)toSubmit; (void)minComplete; (void)flags; (void)arg; (void)result;
+    return Error_ENOSYS;
+}
+
+int32_t SystemNative_IoUringShimRegister(int32_t ringFd, uint32_t opcode, void* arg, uint32_t nrArgs, int32_t* result)
+{
+    (void)ringFd; (void)opcode; (void)arg; (void)nrArgs; (void)result;
+    return Error_ENOSYS;
+}
+
+int32_t SystemNative_IoUringShimMmap(int32_t ringFd, uint64_t size, uint64_t offset, void** mappedPtr)
+{
+    (void)ringFd; (void)size; (void)offset; (void)mappedPtr;
+    return Error_ENOSYS;
+}
+
+int32_t SystemNative_IoUringShimMunmap(void* addr, uint64_t size)
+{
+    (void)addr; (void)size;
+    return Error_ENOSYS;
+}
+
+int32_t SystemNative_IoUringShimCreateEventFd(int32_t* eventFd)
+{
+    (void)eventFd;
+    return Error_ENOSYS;
+}
+
+int32_t SystemNative_IoUringShimWriteEventFd(int32_t eventFd)
+{
+    (void)eventFd;
+    return Error_ENOSYS;
+}
+
+int32_t SystemNative_IoUringShimReadEventFd(int32_t eventFd, uint64_t* value)
+{
+    (void)eventFd; (void)value;
+    return Error_ENOSYS;
+}
+
+int32_t SystemNative_IoUringShimCloseFd(int32_t fd)
+{
+    (void)fd;
+    return Error_ENOSYS;
+}
+
+#endif // SHIM_HAVE_IO_URING
diff --git a/src/native/libs/System.Native/pal_io_uring_shim.h b/src/native/libs/System.Native/pal_io_uring_shim.h
new file mode
100644
index 00000000000000..2f7a07888827d2
--- /dev/null
+++ b/src/native/libs/System.Native/pal_io_uring_shim.h
@@ -0,0 +1,27 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#pragma once
+
+#include "pal_compiler.h"
+#include "pal_types.h"
+
+PALEXPORT int32_t SystemNative_IoUringShimSetup(uint32_t entries, void* params, int32_t* ringFd); // io_uring_setup(2); writes the ring fd to *ringFd on success
+
+PALEXPORT int32_t SystemNative_IoUringShimEnter(int32_t ringFd, uint32_t toSubmit, uint32_t minComplete, uint32_t flags, int32_t* result); // io_uring_enter(2); *result receives the syscall return value
+
+PALEXPORT int32_t SystemNative_IoUringShimEnterExt(int32_t ringFd, uint32_t toSubmit, uint32_t minComplete, uint32_t flags, void* arg, int32_t* result); // io_uring_enter2 with IORING_ENTER_EXT_ARG; arg is an io_uring_getevents_arg or NULL
+
+PALEXPORT int32_t SystemNative_IoUringShimRegister(int32_t ringFd, uint32_t opcode, void* arg, uint32_t nrArgs, int32_t* result); // io_uring_register(2); Error_ENOSYS when the syscall is unavailable
+
+PALEXPORT int32_t SystemNative_IoUringShimMmap(int32_t ringFd, uint64_t size, uint64_t offset, void** mappedPtr); // maps SQ/CQ ring memory (mmap(2), MAP_SHARED)
+
+PALEXPORT int32_t SystemNative_IoUringShimMunmap(void* addr, uint64_t size); // unmaps ring memory (munmap(2))
+
+PALEXPORT int32_t SystemNative_IoUringShimCreateEventFd(int32_t* eventFd); // creates a non-blocking, close-on-exec eventfd for wakeups
+
+PALEXPORT int32_t SystemNative_IoUringShimWriteEventFd(int32_t eventFd); // signals the eventfd (writes 1)
+
+PALEXPORT int32_t SystemNative_IoUringShimReadEventFd(int32_t eventFd, uint64_t* value); // consumes the eventfd counter into *value
+
+PALEXPORT int32_t SystemNative_IoUringShimCloseFd(int32_t fd); // close(2)
diff --git a/src/native/libs/configure.cmake b/src/native/libs/configure.cmake
index 4da74e115c6db8..b2701be5757173 100644
--- a/src/native/libs/configure.cmake
+++ b/src/native/libs/configure.cmake
@@ -470,6 +470,10 @@
 check_symbol_exists(
     sys/epoll.h
     HAVE_EPOLL)
+check_include_files(
+    "linux/io_uring.h;sys/syscall.h"
+    HAVE_LINUX_IO_URING_H)
+
 check_symbol_exists(
     gethostname
     unistd.h