diff --git a/src/libraries/Common/src/Interop/Unix/System.Native/Interop.IoUringShim.cs b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.IoUringShim.cs
new file mode 100644
index 00000000000000..1a2216d8d6723c
--- /dev/null
+++ b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.IoUringShim.cs
@@ -0,0 +1,58 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Runtime.InteropServices;
+
+internal static partial class Interop
+{
+ internal static partial class Sys
+ {
+ /// <summary>Wraps io_uring_setup(2): creates an io_uring instance.</summary>
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimSetup")]
+ internal static unsafe partial Error IoUringShimSetup(
+ uint entries, void* parms, int* ringFd);
+
+ /// <summary>Wraps io_uring_enter(2): submits SQEs and/or waits for CQEs.</summary>
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimEnter")]
+ internal static unsafe partial Error IoUringShimEnter(
+ int ringFd, uint toSubmit, uint minComplete, uint flags, int* result);
+
+ /// <summary>Wraps io_uring_enter2(2) with IORING_ENTER_EXT_ARG for bounded waits.</summary>
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimEnterExt")]
+ internal static unsafe partial Error IoUringShimEnterExt(
+ int ringFd, uint toSubmit, uint minComplete, uint flags, void* arg, int* result);
+
+ /// <summary>Wraps io_uring_register(2): registers resources (files, buffers, ring fds).</summary>
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimRegister")]
+ internal static unsafe partial Error IoUringShimRegister(
+ int ringFd, uint opcode, void* arg, uint nrArgs, int* result);
+
+ /// <summary>Wraps mmap(2): maps io_uring SQ/CQ ring memory.</summary>
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimMmap")]
+ internal static unsafe partial Error IoUringShimMmap(
+ int ringFd, ulong size, ulong offset, void** mappedPtr);
+
+ /// <summary>Wraps munmap(2): unmaps io_uring ring memory.</summary>
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimMunmap")]
+ internal static unsafe partial Error IoUringShimMunmap(
+ void* addr, ulong size);
+
+ /// <summary>Creates an eventfd for io_uring wakeup signaling.</summary>
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimCreateEventFd")]
+ internal static unsafe partial Error IoUringShimCreateEventFd(
+ int* eventFd);
+
+ /// <summary>Writes to an eventfd to wake the io_uring event loop.</summary>
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimWriteEventFd")]
+ internal static partial Error IoUringShimWriteEventFd(int eventFd);
+
+ /// <summary>Reads from an eventfd to consume a wakeup signal.</summary>
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimReadEventFd")]
+ internal static unsafe partial Error IoUringShimReadEventFd(
+ int eventFd, ulong* value);
+
+ /// <summary>Wraps close(2): closes a file descriptor.</summary>
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimCloseFd")]
+ internal static partial Error IoUringShimCloseFd(int fd);
+ }
+}
diff --git a/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.Linux.cs b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.Linux.cs
new file mode 100644
index 00000000000000..1472d04c8b676a
--- /dev/null
+++ b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.Linux.cs
@@ -0,0 +1,150 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Net.Sockets;
+using System.Runtime.InteropServices;
+
+internal static partial class Interop
+{
+ internal static partial class Sys
+ {
+ /// <summary>Derived SQ ring state computed after mmap, used by the managed submission path.</summary>
+ [StructLayout(LayoutKind.Sequential)]
+ internal struct IoUringSqRingInfo
+ {
+ public IntPtr SqeBase;
+ public IntPtr SqTailPtr;
+ public IntPtr SqHeadPtr;
+ public uint SqMask;
+ public uint SqEntries;
+ public uint SqeSize;
+ public byte UsesNoSqArray;
+ public int RingFd;
+ public int RegisteredRingFd;
+ public byte UsesEnterExtArg;
+ public byte UsesRegisteredFiles;
+ }
+
+ /// <summary>Mirrors kernel struct io_sqring_offsets (40 bytes). Fields at offset 28+ (resv1, user_addr) are unused.</summary>
+ [StructLayout(LayoutKind.Explicit, Size = 40)]
+ internal struct IoUringSqOffsets
+ {
+ [FieldOffset(0)] public uint Head;
+ [FieldOffset(4)] public uint Tail;
+ [FieldOffset(8)] public uint RingMask;
+ [FieldOffset(12)] public uint RingEntries;
+ [FieldOffset(16)] public uint Flags;
+ [FieldOffset(20)] public uint Dropped;
+ [FieldOffset(24)] public uint Array;
+ // resv1 at 28, user_addr at 32 - not needed by managed code
+ }
+
+ /// <summary>Mirrors kernel struct io_cqring_offsets (40 bytes). Fields at offset 28+ (resv1, user_addr) are unused.</summary>
+ [StructLayout(LayoutKind.Explicit, Size = 40)]
+ internal struct IoUringCqOffsets
+ {
+ [FieldOffset(0)] public uint Head;
+ [FieldOffset(4)] public uint Tail;
+ [FieldOffset(8)] public uint RingMask;
+ [FieldOffset(12)] public uint RingEntries;
+ [FieldOffset(16)] public uint Overflow;
+ [FieldOffset(20)] public uint Cqes;
+ [FieldOffset(24)] public uint Flags;
+ // resv1 at 28, user_addr at 32 - not needed by managed code
+ }
+
+ /// <summary>Mirrors kernel struct io_uring_params (120 bytes), passed to io_uring_setup.</summary>
+ [StructLayout(LayoutKind.Explicit, Size = 120)]
+ internal struct IoUringParams
+ {
+ [FieldOffset(0)] public uint SqEntries;
+ [FieldOffset(4)] public uint CqEntries;
+ [FieldOffset(8)] public uint Flags;
+ [FieldOffset(12)] public uint SqThreadCpu;
+ [FieldOffset(16)] public uint SqThreadIdle;
+ [FieldOffset(20)] public uint Features;
+ [FieldOffset(24)] public uint WqFd;
+ // resv[3] at 28-39
+ [FieldOffset(40)] public IoUringSqOffsets SqOff;
+ [FieldOffset(80)] public IoUringCqOffsets CqOff;
+ }
+
+ /// <summary>Mirrors kernel struct io_uring_cqe (16 bytes), read from the CQ ring.</summary>
+ [StructLayout(LayoutKind.Explicit, Size = 16)]
+ internal struct IoUringCqe
+ {
+ [FieldOffset(0)] public ulong UserData;
+ [FieldOffset(8)] public int Result;
+ [FieldOffset(12)] public uint Flags;
+ }
+
+ /// <summary>Mirrors kernel struct io_uring_buf (16 bytes), used by provided-buffer rings.</summary>
+ [StructLayout(LayoutKind.Explicit, Size = 16)]
+ internal struct IoUringBuf
+ {
+ [FieldOffset(0)] public ulong Address;
+ [FieldOffset(8)] public uint Length;
+ [FieldOffset(12)] public ushort BufferId;
+ [FieldOffset(14)] public ushort Reserved;
+ }
+
+ /// <summary>
+ /// Mirrors the header overlay of kernel struct io_uring_buf_ring (16 bytes).
+ /// In UAPI this shares offset 0 with the first io_uring_buf entry via a union.
+ /// </summary>
+ [StructLayout(LayoutKind.Explicit, Size = 16)]
+ internal struct IoUringBufRingHeader
+ {
+ [FieldOffset(0)] public ulong Reserved1;
+ [FieldOffset(8)] public uint Reserved2;
+ [FieldOffset(12)] public ushort Reserved3;
+ [FieldOffset(14)] public ushort Tail;
+ }
+
+ /// <summary>Mirrors kernel struct io_uring_buf_reg (40 bytes), used for pbuf ring registration.</summary>
+ [StructLayout(LayoutKind.Explicit, Size = 40)]
+ internal struct IoUringBufReg
+ {
+ [FieldOffset(0)] public ulong RingAddress;
+ [FieldOffset(8)] public uint RingEntries;
+ [FieldOffset(12)] public ushort BufferGroupId;
+ [FieldOffset(14)] public ushort Padding;
+ [FieldOffset(16)] public ulong Reserved0;
+ [FieldOffset(24)] public ulong Reserved1;
+ [FieldOffset(32)] public ulong Reserved2;
+ }
+
+ /// <summary>Derived CQ ring state computed after mmap, used by the managed completion drain path.</summary>
+ [StructLayout(LayoutKind.Sequential)]
+ internal struct IoUringCqRingInfo
+ {
+ public IntPtr CqeBase; // io_uring_cqe* base of CQE array
+ public IntPtr CqTailPtr; // uint32_t* kernel writes CQ tail
+ public IntPtr CqHeadPtr; // uint32_t* managed advances CQ head
+ public uint CqMask; // CqEntries - 1
+ public uint CqEntries; // number of CQ slots
+ public uint CqeSize; // sizeof(io_uring_cqe) = 16
+ public IntPtr CqOverflowPtr; // uint32_t* kernel CQ overflow counter
+ }
+
+ /// <summary>Mirrors kernel struct io_uring_getevents_arg, used with IORING_ENTER_EXT_ARG.</summary>
+ [StructLayout(LayoutKind.Sequential)]
+ internal struct IoUringGeteventsArg
+ {
+ public ulong Sigmask;
+ public uint SigmaskSize;
+ public uint MinWaitUsec;
+ public ulong Ts;
+ }
+
+ /// <summary>Mirrors kernel struct __kernel_timespec, used for io_uring timeout arguments.</summary>
+ [StructLayout(LayoutKind.Sequential)]
+ internal struct IoUringKernelTimespec
+ {
+ public long TvSec;
+ public long TvNsec;
+ }
+
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System.Net.Sockets.csproj b/src/libraries/System.Net.Sockets/src/System.Net.Sockets.csproj
index 2426a84e8a225c..d676f4a0010840 100644
--- a/src/libraries/System.Net.Sockets/src/System.Net.Sockets.csproj
+++ b/src/libraries/System.Net.Sockets/src/System.Net.Sockets.csproj
@@ -197,9 +197,20 @@
+
+
+
+
+
+
+
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/IoUringProvidedBufferRing.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/IoUringProvidedBufferRing.Linux.cs
new file mode 100644
index 00000000000000..4961bd76e759b0
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/IoUringProvidedBufferRing.Linux.cs
@@ -0,0 +1,816 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+ internal sealed partial class SocketAsyncEngine
+ {
+ private const int IoUringProvidedBufferRingEntries = (int)IoUringConstants.QueueEntries;
+ private const int IoUringProvidedBufferSizeDefault = 4096;
+ private const ushort IoUringProvidedBufferGroupIdDefault = 1;
+ private static readonly int s_ioUringProvidedBufferSize = GetConfiguredIoUringProvidedBufferSize();
+ private static readonly bool s_ioUringAdaptiveBufferSizingEnabled = IsAdaptiveIoUringProvidedBufferSizingEnabled();
+ private static readonly bool s_ioUringRegisterBuffersEnabled = IsIoUringRegisterBuffersEnabled();
+ private bool _adaptiveBufferSizingEnabled;
+
+ /// <summary>
+ /// Initializes a provided-buffer ring and registers it with the kernel when supported.
+ /// Failures are non-fatal and leave completion mode enabled without provided buffers.
+ /// </summary>
+ private void InitializeIoUringProvidedBufferRingIfSupported(int ringFd)
+ {
+ _supportsProvidedBufferRings = false;
+ _ioUringBuffersRegistered = false;
+ _adaptiveBufferSizingEnabled = false;
+ _ioUringProvidedBufferGroupId = 0;
+ _ioUringProvidedBufferRing = null;
+
+ if (!IoUringProvidedBufferRing.TryCreate(
+ IoUringProvidedBufferGroupIdDefault,
+ IoUringProvidedBufferRingEntries,
+ s_ioUringProvidedBufferSize,
+ s_ioUringAdaptiveBufferSizingEnabled,
+ out IoUringProvidedBufferRing? bufferRing) ||
+ bufferRing is null)
+ {
+ return;
+ }
+
+ Interop.Error registerError = bufferRing.Register(ringFd);
+ if (registerError != Interop.Error.SUCCESS)
+ {
+ bufferRing.Dispose();
+ return;
+ }
+
+ _ioUringProvidedBufferRing = bufferRing;
+ _ioUringProvidedBufferGroupId = bufferRing.BufferGroupId;
+ _supportsProvidedBufferRings = true;
+ _adaptiveBufferSizingEnabled = s_ioUringAdaptiveBufferSizingEnabled;
+ _ioUringBuffersRegistered = TryRegisterProvidedBuffersWithTelemetry(bufferRing, ringFd, isReregistration: false);
+
+ SocketsTelemetry.Log.IoUringProvidedBufferCurrentSize(bufferRing.BufferSize);
+ }
+
+ /// <summary>
+ /// Evaluates adaptive buffer-sizing recommendations and hot-swaps the provided-buffer ring when safe.
+ /// Must run on the event-loop thread.
+ /// </summary>
+ private void EvaluateProvidedBufferRingResize()
+ {
+ if (!_adaptiveBufferSizingEnabled || _managedRingFd < 0)
+ {
+ return;
+ }
+
+ IoUringProvidedBufferRing? currentRing = _ioUringProvidedBufferRing;
+ if (currentRing is null)
+ {
+ return;
+ }
+
+ int currentBufferSize = currentRing.BufferSize;
+ int recommendedBufferSize = currentRing.RecommendedBufferSize;
+ if (recommendedBufferSize == 0 || recommendedBufferSize == currentBufferSize)
+ {
+ return;
+ }
+
+ if (currentRing.InUseCount > 0)
+ {
+ return;
+ }
+
+ ushort newGroupId = _ioUringProvidedBufferGroupId == 1 ? (ushort)2 : (ushort)1;
+ if (!IoUringProvidedBufferRing.TryCreate(
+ newGroupId,
+ IoUringProvidedBufferRingEntries,
+ recommendedBufferSize,
+ adaptiveSizingEnabled: true,
+ out IoUringProvidedBufferRing? replacementRing) ||
+ replacementRing is null)
+ {
+ return;
+ }
+
+ bool restorePreviousBufferRegistration = _ioUringBuffersRegistered;
+ TryUnregisterProvidedBuffersIfRegistered(currentRing, _managedRingFd);
+
+ if (replacementRing.Register(_managedRingFd) != Interop.Error.SUCCESS)
+ {
+ replacementRing.Dispose();
+ if (restorePreviousBufferRegistration)
+ {
+ _ioUringBuffersRegistered = TryRegisterProvidedBuffersWithTelemetry(
+ currentRing,
+ _managedRingFd,
+ isReregistration: true);
+ }
+
+ return;
+ }
+
+ currentRing.Unregister(_managedRingFd);
+ currentRing.Dispose();
+
+ _ioUringProvidedBufferRing = replacementRing;
+ _ioUringProvidedBufferGroupId = replacementRing.BufferGroupId;
+ _supportsProvidedBufferRings = true;
+ RefreshIoUringMultishotRecvSupport();
+ _ioUringBuffersRegistered = TryRegisterProvidedBuffersWithTelemetry(
+ replacementRing,
+ _managedRingFd,
+ isReregistration: true);
+
+ SocketsTelemetry.Log.IoUringProvidedBufferResize();
+ SocketsTelemetry.Log.IoUringProvidedBufferCurrentSize(replacementRing.BufferSize);
+ }
+
+ private static int GetConfiguredIoUringProvidedBufferSize()
+ {
+#if DEBUG
+ string? configuredValue = Environment.GetEnvironmentVariable(
+ "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PROVIDED_BUFFER_SIZE");
+
+ if (!string.IsNullOrWhiteSpace(configuredValue))
+ {
+ return int.TryParse(configuredValue, out int parsedSize) && parsedSize > 0
+ ? parsedSize
+ : IoUringProvidedBufferSizeDefault;
+ }
+#endif
+
+ return IoUringProvidedBufferSizeDefault;
+ }
+
+ private static bool IsAdaptiveIoUringProvidedBufferSizingEnabled()
+ {
+#if DEBUG
+ string? configuredValue = Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ADAPTIVE_BUFFER_SIZING");
+ return string.Equals(configuredValue, "1", StringComparison.Ordinal);
+#else
+ return false;
+#endif
+ }
+
+ private static bool IsIoUringRegisterBuffersEnabled()
+ {
+#if DEBUG
+ // Test-only override for deterministic tests.
+ string? configuredValue = Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_REGISTER_BUFFERS");
+ if (string.Equals(configuredValue, "1", StringComparison.Ordinal))
+ {
+ return true;
+ }
+
+ if (string.Equals(configuredValue, "0", StringComparison.Ordinal))
+ {
+ return false;
+ }
+#endif
+
+ // Default: enabled.
+ return true;
+ }
+
+ private static bool TryRegisterProvidedBuffersWithTelemetry(
+ IoUringProvidedBufferRing bufferRing,
+ int ringFd,
+ bool isReregistration)
+ {
+ if (!s_ioUringRegisterBuffersEnabled || ringFd < 0)
+ {
+ return false;
+ }
+
+ // REGISTER_BUFFERS is orthogonal to provided-buffer selection (RECV + IOSQE_BUFFER_SELECT).
+ // Any performance benefit for this path is kernel-dependent and must be validated empirically.
+ bool registered = bufferRing.TryRegisterBuffersWithKernel(ringFd);
+ if (isReregistration)
+ {
+ SocketsTelemetry.Log.IoUringRegisteredBuffersReregistration(registered);
+ }
+ else
+ {
+ SocketsTelemetry.Log.IoUringRegisteredBuffersResult(
+ registered,
+ IoUringProvidedBufferRingEntries,
+ bufferRing.BufferSize);
+ }
+
+ return registered;
+ }
+
+ private void TryUnregisterProvidedBuffersIfRegistered(IoUringProvidedBufferRing bufferRing, int ringFd)
+ {
+ if (!_ioUringBuffersRegistered || ringFd < 0)
+ {
+ return;
+ }
+
+ bufferRing.TryUnregisterBuffersFromKernel(ringFd);
+ _ioUringBuffersRegistered = false;
+ }
+
+ /// <summary>Unregisters and disposes the provided-buffer ring.</summary>
+ private void FreeIoUringProvidedBufferRing()
+ {
+ IoUringProvidedBufferRing? bufferRing = _ioUringProvidedBufferRing;
+ _ioUringProvidedBufferRing = null;
+ _supportsProvidedBufferRings = false;
+ _adaptiveBufferSizingEnabled = false;
+ _ioUringProvidedBufferGroupId = 0;
+
+ if (bufferRing is null)
+ {
+ return;
+ }
+
+ int recycledForTeardown = bufferRing.RecycleCheckedOutBuffersForTeardown();
+ if (recycledForTeardown > 0)
+ {
+ SocketsTelemetry.Log.IoUringProvidedBufferRecycle(recycledForTeardown);
+ }
+
+ TryUnregisterProvidedBuffersIfRegistered(bufferRing, _managedRingFd);
+
+ if (_managedRingFd >= 0)
+ {
+ bufferRing.Unregister(_managedRingFd);
+ }
+
+ bufferRing.Dispose();
+ _ioUringBuffersRegistered = false;
+ }
+
+ ///
+ /// Owns a managed provided-buffer ring registration: native ring memory, pinned managed
+ /// buffers, buffer-id lifecycle, and recycle counters.
+ ///
+ private sealed unsafe class IoUringProvidedBufferRing : IDisposable
+ {
+ private const int AdaptiveWindowCompletionCount = 256;
+ private const int AdaptiveMinBufferSize = 128;
+ private const int AdaptiveMaxBufferSize = 65536;
+ private const int PreparedReceiveMinimumReserve = 8;
+ private const int PreparedReceiveMaximumReserve = 64;
+ private const byte BufferStatePosted = 1;
+ private const byte BufferStateCheckedOut = 2;
+
+ private readonly ushort _bufferGroupId;
+ private readonly int _bufferSize;
+ private readonly uint _ringEntries;
+ private readonly uint _ringMask;
+ private readonly bool _adaptiveSizingEnabled;
+ private readonly GCHandle[] _bufferHandles;
+ private readonly byte[] _bufferStates;
+ private readonly Interop.Sys.IoUringBuf* _ringBuffers;
+ private readonly Interop.Sys.IoUringBufRingHeader* _ringHeader;
+ private readonly void* _ringMemory;
+ private bool _registered;
+ private bool _disposed;
+ private int _availableCount;
+ private int _inUseCount;
+ private long _recycledCount;
+ private long _allocationFailureCount;
+ private long _totalCompletionBytes;
+ private long _totalCompletionCount;
+ private long _completionsAboveHighWatermark;
+ private long _completionsBelowLowWatermark;
+ private int _recommendedBufferSize;
+ private uint _nextPreparedReceiveBufferHint;
+ private int _debugOwningThreadId;
+
+ internal ushort BufferGroupId => _bufferGroupId;
+ internal int BufferSize => _bufferSize;
+ internal int AvailableCount => Volatile.Read(ref _availableCount);
+ internal int InUseCount => Volatile.Read(ref _inUseCount);
+ internal long RecycledCount => Interlocked.Read(ref _recycledCount);
+ internal long AllocationFailureCount => Interlocked.Read(ref _allocationFailureCount);
+ internal int RecommendedBufferSize => Volatile.Read(ref _recommendedBufferSize);
+
+ private IoUringProvidedBufferRing(ushort bufferGroupId, int ringEntries, int bufferSize, bool adaptiveSizingEnabled)
+ {
+ ArgumentOutOfRangeException.ThrowIfNegativeOrZero(ringEntries);
+ if (!BitOperations.IsPow2((uint)ringEntries) || ringEntries > ushort.MaxValue)
+ {
+ throw new ArgumentOutOfRangeException(nameof(ringEntries));
+ }
+
+ ArgumentOutOfRangeException.ThrowIfNegativeOrZero(bufferSize);
+
+ _bufferGroupId = bufferGroupId;
+ _bufferSize = bufferSize;
+ _adaptiveSizingEnabled = adaptiveSizingEnabled;
+ _ringEntries = (uint)ringEntries;
+ _ringMask = (uint)ringEntries - 1;
+ _availableCount = ringEntries;
+ _recommendedBufferSize = bufferSize;
+ _bufferHandles = new GCHandle[ringEntries];
+ _bufferStates = GC.AllocateUninitializedArray<byte>(ringEntries);
+
+ nuint ringByteCount = checked((nuint)ringEntries * (nuint)sizeof(Interop.Sys.IoUringBuf));
+ _ringMemory = NativeMemory.AlignedAlloc(ringByteCount, (nuint)Environment.SystemPageSize);
+ if (_ringMemory is null)
+ {
+ throw new OutOfMemoryException();
+ }
+
+ NativeMemory.Clear(_ringMemory, ringByteCount);
+ _ringBuffers = (Interop.Sys.IoUringBuf*)_ringMemory;
+ _ringHeader = (Interop.Sys.IoUringBufRingHeader*)_ringMemory;
+
+ int initializedCount = 0;
+ try
+ {
+ for (int i = 0; i < ringEntries; i++)
+ {
+ byte[] buffer = GC.AllocateUninitializedArray<byte>(bufferSize);
+ GCHandle handle = GCHandle.Alloc(buffer, GCHandleType.Pinned);
+
+ _bufferHandles[i] = handle;
+ _bufferStates[i] = BufferStatePosted;
+
+ WriteBufferDescriptor((uint)i, (ushort)i);
+ initializedCount++;
+ }
+
+ PublishTail((ushort)initializedCount);
+ }
+ catch
+ {
+ Interlocked.Increment(ref _allocationFailureCount);
+ ReleasePinnedBuffers(initializedCount);
+ NativeMemory.AlignedFree(_ringMemory);
+ throw;
+ }
+ }
+
+ internal static bool TryCreate(
+ ushort bufferGroupId,
+ int ringEntries,
+ int bufferSize,
+ bool adaptiveSizingEnabled,
+ out IoUringProvidedBufferRing? bufferRing)
+ {
+ try
+ {
+ bufferRing = new IoUringProvidedBufferRing(bufferGroupId, ringEntries, bufferSize, adaptiveSizingEnabled);
+ return true;
+ }
+ catch (ArgumentOutOfRangeException)
+ {
+ }
+ catch (OutOfMemoryException)
+ {
+ }
+
+ bufferRing = null;
+ return false;
+ }
+
+ /// Records a completion's bytes-transferred for adaptive sizing decisions.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void RecordCompletionUtilization(int bytesTransferred)
+ {
+ AssertSingleThreadAccess();
+ if (!_adaptiveSizingEnabled || bytesTransferred <= 0)
+ {
+ return;
+ }
+
+ int clampedBytes = Math.Min(bytesTransferred, _bufferSize);
+ _totalCompletionBytes += clampedBytes;
+ long count = ++_totalCompletionCount;
+
+ int highWatermark = (_bufferSize * 3) / 4;
+ int lowWatermark = _bufferSize / 4;
+ if (clampedBytes > highWatermark)
+ {
+ _completionsAboveHighWatermark++;
+ }
+ else if (clampedBytes < lowWatermark)
+ {
+ _completionsBelowLowWatermark++;
+ }
+
+ if ((count & (AdaptiveWindowCompletionCount - 1)) == 0)
+ {
+ EvaluateAdaptiveResize();
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void EvaluateAdaptiveResize()
+ {
+ AssertSingleThreadAccess();
+ if (!_adaptiveSizingEnabled)
+ {
+ return;
+ }
+
+ long windowBytes = _totalCompletionBytes;
+ long aboveHigh = _completionsAboveHighWatermark;
+ long belowLow = _completionsBelowLowWatermark;
+ _totalCompletionBytes = 0;
+ _completionsAboveHighWatermark = 0;
+ _completionsBelowLowWatermark = 0;
+
+ int currentSize = _bufferSize;
+ int recommendedSize = currentSize;
+ if (aboveHigh > AdaptiveWindowCompletionCount / 2 ||
+ windowBytes > (long)AdaptiveWindowCompletionCount * ((long)currentSize * 3 / 4))
+ {
+ recommendedSize = Math.Min(currentSize * 2, AdaptiveMaxBufferSize);
+ }
+ else if (belowLow > AdaptiveWindowCompletionCount / 2 ||
+ windowBytes < (long)AdaptiveWindowCompletionCount * ((long)currentSize / 4))
+ {
+ recommendedSize = Math.Max(currentSize / 2, AdaptiveMinBufferSize);
+ }
+
+ Volatile.Write(ref _recommendedBufferSize, recommendedSize);
+ }
+
+ internal Interop.Error Register(int ringFd)
+ {
+ Debug.Assert(!_disposed);
+
+ if (_registered)
+ {
+ return Interop.Error.SUCCESS;
+ }
+
+ Interop.Sys.IoUringBufReg registration = default;
+ registration.RingAddress = (ulong)(nuint)_ringMemory;
+ registration.RingEntries = _ringEntries;
+ registration.BufferGroupId = _bufferGroupId;
+
+ int result;
+ Interop.Error registerError = Interop.Sys.IoUringShimRegister(
+ ringFd,
+ IoUringConstants.RegisterPbufRing,
+ &registration,
+ 1u,
+ &result);
+ if (registerError == Interop.Error.SUCCESS)
+ {
+ _registered = true;
+ }
+
+ return registerError;
+ }
+
+ internal Interop.Error Unregister(int ringFd)
+ {
+ if (!_registered)
+ {
+ return Interop.Error.SUCCESS;
+ }
+
+ Interop.Sys.IoUringBufReg registration = default;
+ registration.BufferGroupId = _bufferGroupId;
+ int result;
+ Interop.Error unregisterError = Interop.Sys.IoUringShimRegister(
+ ringFd,
+ IoUringConstants.UnregisterPbufRing,
+ &registration,
+ 1u,
+ &result);
+ if (unregisterError == Interop.Error.SUCCESS)
+ {
+ _registered = false;
+ }
+
+ return unregisterError;
+ }
+
+ ///
+ /// Attempts to register pinned buffer payload pages with the kernel via IORING_REGISTER_BUFFERS.
+ /// Failure is non-fatal and callers should gracefully continue with unregistered buffers.
+ /// This does not switch recv SQEs to fixed-buffer opcodes; provided-buffer recv stays on
+ /// IORING_OP_RECV + IOSQE_BUFFER_SELECT.
+ ///
+ internal bool TryRegisterBuffersWithKernel(int ringFd)
+ {
+ if (_disposed || ringFd < 0 || _bufferHandles.Length == 0)
+ {
+ return false;
+ }
+
+ nuint allocationSize = checked((nuint)_bufferHandles.Length * (nuint)sizeof(Interop.Sys.IOVector));
+ Interop.Sys.IOVector* iovecArray;
+ try
+ {
+ iovecArray = (Interop.Sys.IOVector*)NativeMemory.Alloc(allocationSize);
+ }
+ catch (OutOfMemoryException)
+ {
+ return false;
+ }
+
+ try
+ {
+ for (int i = 0; i < _bufferHandles.Length; i++)
+ {
+ GCHandle handle = _bufferHandles[i];
+ if (!handle.IsAllocated)
+ {
+ return false;
+ }
+
+ iovecArray[i].Base = (byte*)handle.AddrOfPinnedObject();
+ iovecArray[i].Count = (UIntPtr)_bufferSize;
+ }
+
+ int result;
+ Interop.Error registerError = Interop.Sys.IoUringShimRegister(
+ ringFd,
+ IoUringConstants.RegisterBuffers,
+ iovecArray,
+ (uint)_bufferHandles.Length,
+ &result);
+ return registerError == Interop.Error.SUCCESS;
+ }
+ finally
+ {
+ NativeMemory.Free(iovecArray);
+ }
+ }
+
+ /// Unregisters previously registered pinned buffers via IORING_UNREGISTER_BUFFERS.
+ internal bool TryUnregisterBuffersFromKernel(int ringFd)
+ {
+ if (_disposed || ringFd < 0)
+ {
+ return false;
+ }
+
+ int result;
+ Interop.Error unregisterError = Interop.Sys.IoUringShimRegister(
+ ringFd,
+ IoUringConstants.UnregisterBuffers,
+ null,
+ 0u,
+ &result);
+ return unregisterError == Interop.Error.SUCCESS;
+ }
+
+ /// Acquires a kernel-selected buffer id for completion processing.
+ internal bool TryAcquireBufferForCompletion(ushort bufferId, out byte* buffer, out int bufferLength)
+ {
+ AssertSingleThreadAccess();
+ buffer = null;
+ bufferLength = 0;
+
+ if (bufferId >= _ringEntries)
+ {
+ Interlocked.Increment(ref _allocationFailureCount);
+ return false;
+ }
+
+ byte state = _bufferStates[bufferId];
+ if (state != BufferStatePosted)
+ {
+ Debug.Assert(
+ state == BufferStateCheckedOut,
+ $"Unexpected provided-buffer state during acquire: id={bufferId}, state={state}");
+ Interlocked.Increment(ref _allocationFailureCount);
+ return false;
+ }
+
+ _bufferStates[bufferId] = BufferStateCheckedOut;
+ Debug.Assert(_availableCount > 0, "Provided-buffer available count underflow.");
+ _availableCount--;
+ _inUseCount++;
+
+ GCHandle handle = _bufferHandles[bufferId];
+ if (!handle.IsAllocated)
+ {
+ _bufferStates[bufferId] = BufferStatePosted;
+ _availableCount++;
+ _inUseCount--;
+ Interlocked.Increment(ref _allocationFailureCount);
+ return false;
+ }
+
+ buffer = (byte*)handle.AddrOfPinnedObject();
+ bufferLength = _bufferSize;
+ return true;
+ }
+
+ ///
+ /// Acquires any currently posted provided buffer for fixed-recv submission.
+ /// The acquired buffer remains checked out until completion recycles it.
+ ///
+ internal bool TryAcquireBufferForPreparedReceive(out ushort bufferId, out byte* buffer, out int bufferLength)
+ {
+ AssertSingleThreadAccess();
+ bufferId = 0;
+ buffer = null;
+ bufferLength = 0;
+
+ // Keep a reserve for kernel-selected (IOSQE_BUFFER_SELECT) receive completions so
+ // fixed-recv one-shots don't deplete the provided-buffer pool under sustained load.
+ int reserveCount = GetPreparedReceiveReserveCount();
+ if (Volatile.Read(ref _availableCount) <= reserveCount)
+ {
+ return false;
+ }
+
+ uint start = _nextPreparedReceiveBufferHint;
+ for (uint i = 0; i < _ringEntries; i++)
+ {
+ uint candidate = (start + i) & _ringMask;
+ ushort candidateId = (ushort)candidate;
+ if (_bufferStates[candidateId] != BufferStatePosted)
+ {
+ continue;
+ }
+
+ if (TryAcquireBufferForCompletion(candidateId, out buffer, out bufferLength))
+ {
+ bufferId = candidateId;
+ _nextPreparedReceiveBufferHint = (candidate + 1) & _ringMask;
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private int GetPreparedReceiveReserveCount()
+ {
+ int ringEntryCount = (int)_ringEntries;
+ int dynamicReserve = ringEntryCount / 16;
+ return Math.Clamp(dynamicReserve, PreparedReceiveMinimumReserve, PreparedReceiveMaximumReserve);
+ }
+
+ /// Returns the pointer/length for a buffer that is already checked out.
+ internal bool TryGetCheckedOutBuffer(ushort bufferId, out byte* buffer, out int bufferLength)
+ {
+ buffer = null;
+ bufferLength = 0;
+
+ if (bufferId >= _ringEntries || _bufferStates[bufferId] != BufferStateCheckedOut)
+ {
+ return false;
+ }
+
+ GCHandle handle = _bufferHandles[bufferId];
+ if (!handle.IsAllocated)
+ {
+ Interlocked.Increment(ref _allocationFailureCount);
+ return false;
+ }
+
+ buffer = (byte*)handle.AddrOfPinnedObject();
+ bufferLength = _bufferSize;
+ return true;
+ }
+
+ /// Returns a previously acquired buffer id back to the provided-buffer ring.
+ internal bool TryRecycleBufferFromCompletion(ushort bufferId)
+ {
+ AssertSingleThreadAccess();
+ if (bufferId >= _ringEntries)
+ {
+ return false;
+ }
+
+ byte state = _bufferStates[bufferId];
+ if (state != BufferStateCheckedOut)
+ {
+ Debug.Assert(
+ state == BufferStatePosted,
+ $"Unexpected provided-buffer state during recycle: id={bufferId}, state={state}");
+ return false;
+ }
+
+ RecycleCheckedOutBuffer(bufferId);
+ return true;
+ }
+
+ ///
+ /// Recycles any still-checked-out ids back into the ring during teardown.
+ /// Returns the number of ids recycled.
+ ///
+ internal int RecycleCheckedOutBuffersForTeardown()
+ {
+ AssertSingleThreadAccess();
+ int recycledCount = 0;
+ for (ushort bufferId = 0; bufferId < _ringEntries; bufferId++)
+ {
+ if (_bufferStates[bufferId] != BufferStateCheckedOut)
+ {
+ continue;
+ }
+
+ RecycleCheckedOutBuffer(bufferId);
+ recycledCount++;
+ }
+
+ return recycledCount;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void RecycleCheckedOutBuffer(ushort bufferId)
+ {
+ ushort tail = ReadTail();
+ uint ringIndex = (uint)tail & _ringMask;
+ WriteBufferDescriptor(ringIndex, bufferId);
+ _bufferStates[bufferId] = BufferStatePosted;
+ _availableCount++;
+ Debug.Assert(_inUseCount > 0, "Provided-buffer in-use count underflow.");
+ _inUseCount--;
+ PublishTail(unchecked((ushort)(tail + 1)));
+ Interlocked.Increment(ref _recycledCount);
+ }
+
+ [Conditional("DEBUG")]
+ private void AssertSingleThreadAccess()
+ {
+ int currentThreadId = Environment.CurrentManagedThreadId;
+ int ownerThreadId = Volatile.Read(ref _debugOwningThreadId);
+ if (ownerThreadId == 0)
+ {
+ int prior = Interlocked.CompareExchange(ref _debugOwningThreadId, currentThreadId, comparand: 0);
+ ownerThreadId = prior == 0 ? currentThreadId : prior;
+ }
+
+ Debug.Assert(
+ ownerThreadId == currentThreadId,
+ $"IoUringProvidedBufferRing mutable state must be accessed from one thread. Owner={ownerThreadId}, current={currentThreadId}");
+ }
+
+ public void Dispose()
+ {
+ if (_disposed)
+ {
+ return;
+ }
+
+#if DEBUG
+ int checkedOutBufferCount = 0;
+ for (int i = 0; i < _bufferStates.Length; i++)
+ {
+ if (_bufferStates[i] == BufferStateCheckedOut)
+ {
+ checkedOutBufferCount++;
+ }
+ }
+
+ Debug.Assert(
+ checkedOutBufferCount == 0,
+ $"Disposing provided-buffer ring with outstanding checked-out buffers: {checkedOutBufferCount}");
+#endif
+
+ _registered = false;
+ ReleasePinnedBuffers(_bufferHandles.Length);
+ NativeMemory.AlignedFree(_ringMemory);
+ _disposed = true;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private ushort ReadTail() =>
+ Volatile.Read(ref Unsafe.AsRef(&_ringHeader->Tail));
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void PublishTail(ushort tail) =>
+ Volatile.Write(ref Unsafe.AsRef(&_ringHeader->Tail), tail);
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void WriteBufferDescriptor(uint ringIndex, ushort bufferId)
+ {
+ Debug.Assert(ringIndex < _ringEntries);
+ Debug.Assert(bufferId < _ringEntries);
+ Debug.Assert(_bufferHandles[bufferId].IsAllocated);
+
+ Interop.Sys.IoUringBuf* bufferSlot = _ringBuffers + ringIndex;
+ bufferSlot->Address = (ulong)(nuint)_bufferHandles[bufferId].AddrOfPinnedObject();
+ bufferSlot->Length = (uint)_bufferSize;
+ bufferSlot->BufferId = bufferId;
+ bufferSlot->Reserved = 0;
+ }
+
+ private void ReleasePinnedBuffers(int count)
+ {
+ for (int i = 0; i < count; i++)
+ {
+ if (_bufferHandles[i].IsAllocated)
+ {
+ _bufferHandles[i].Free();
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/MpscQueue.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/MpscQueue.cs
new file mode 100644
index 00000000000000..e4548a7cbe5294
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/MpscQueue.cs
@@ -0,0 +1,276 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Runtime.CompilerServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+ ///
+ /// Lock-free multi-producer, single-consumer queue optimized for the io_uring
+ /// event loop pattern where many threads enqueue work items but exactly one
+ /// thread drains them.
+ ///
+ /// Liveness contract:
+ /// TryDequeue/IsEmpty may observe a producer between index claim and publish
+ /// (Interlocked.Increment followed by Volatile.Write), and can transiently report
+ /// no available item even though an enqueue is in progress. Callers must provide
+ /// their own wakeup/progress mechanism after Enqueue.
+ ///
+ internal sealed class MpscQueue
+ {
+ private const int DefaultSegmentSize = 256;
+
+ private readonly int _segmentSize;
+ private PaddedSegment _head;
+ private PaddedSegment _tail;
+ // Safe to recycle only segments that lost the tail->next link race and were never published.
+ // Reusing drained, previously-linked segments would require producer quiescence tracking to
+ // avoid stale producer references writing into a reset segment.
+ private Segment? _cachedUnlinkedSegment;
+
+ internal MpscQueue(int segmentSize = DefaultSegmentSize)
+ {
+ ArgumentOutOfRangeException.ThrowIfNegativeOrZero(segmentSize);
+ _segmentSize = segmentSize;
+ Segment initial = new Segment(segmentSize);
+ _head.Value = initial;
+ _tail.Value = initial;
+ }
+
+ ///
+ /// Enqueues an item.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void Enqueue(T item)
+ {
+ if (!TryEnqueueFast(item))
+ {
+ EnqueueSlow(item);
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool TryEnqueueFast(T item)
+ {
+ Segment tail = Volatile.Read(ref _tail.Value)!;
+ int index = Interlocked.Increment(ref tail.EnqueueIndex.Value) - 1;
+ if ((uint)index < (uint)tail.States.Length)
+ {
+ // Publish item data before making the slot visible to the consumer.
+ tail.Items[index] = item;
+ Volatile.Write(ref tail.States[index], 1);
+ return true;
+ }
+
+ return false;
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void EnqueueSlow(T item)
+ {
+ while (true)
+ {
+ Segment tail = Volatile.Read(ref _tail.Value)!;
+ int index = Interlocked.Increment(ref tail.EnqueueIndex.Value) - 1;
+ if ((uint)index < (uint)tail.States.Length)
+ {
+ tail.Items[index] = item;
+ Volatile.Write(ref tail.States[index], 1);
+ return;
+ }
+
+ Segment? next = Volatile.Read(ref tail.Next);
+ if (next is null)
+ {
+ Segment newSegment = RentUnlinkedSegment();
+ if (Interlocked.CompareExchange(ref tail.Next, newSegment, null) is null)
+ {
+ next = newSegment;
+ }
+ else
+ {
+ // Another producer linked its own segment first. Reuse ours later.
+ ReturnUnlinkedSegment(newSegment);
+ next = Volatile.Read(ref tail.Next);
+ }
+ }
+
+ if (next is not null)
+ {
+ Interlocked.CompareExchange(ref _tail.Value, next, tail);
+ }
+ }
+ }
+
+ ///
+ /// Attempts to dequeue an item. Must only be called by the single consumer thread.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal bool TryDequeue(out T item)
+ {
+ if (TryDequeueFast(out item))
+ {
+ return true;
+ }
+
+ return TryDequeueSlow(out item);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool TryDequeueFromSegment(Segment head, out T item)
+ {
+ int index = head.DequeueIndex;
+ if ((uint)index >= (uint)head.States.Length)
+ {
+ item = default!;
+ return false;
+ }
+
+ // Acquire published slot before reading the item value.
+ if (Volatile.Read(ref head.States[index]) != 1)
+ {
+ item = default!;
+ return false;
+ }
+
+ item = head.Items[index];
+ if (RuntimeHelpers.IsReferenceOrContainsReferences())
+ {
+ head.Items[index] = default!;
+ }
+
+ head.DequeueIndex = index + 1;
+ return true;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool TryDequeueFast(out T item)
+ {
+ Segment head = Volatile.Read(ref _head.Value)!;
+ return TryDequeueFromSegment(head, out item);
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private bool TryDequeueSlow(out T item)
+ {
+ Segment head = Volatile.Read(ref _head.Value)!;
+ while ((uint)head.DequeueIndex >= (uint)head.States.Length)
+ {
+ Segment? next = Volatile.Read(ref head.Next);
+ if (next is null)
+ {
+ item = default!;
+ return false;
+ }
+
+ _head.Value = next;
+ head = next;
+ }
+
+ return TryDequeueFromSegment(head, out item);
+ }
+
+ ///
+ /// Returns whether the queue currently appears empty (snapshot, not linearizable).
+ /// A return value of can also mean an enqueue is mid-flight.
+ ///
+ internal bool IsEmpty
+ {
+ get
+ {
+ Segment head = Volatile.Read(ref _head.Value)!;
+ while (true)
+ {
+ int index = head.DequeueIndex;
+ if ((uint)index >= (uint)head.States.Length)
+ {
+ Segment? next = Volatile.Read(ref head.Next);
+ if (next is null)
+ {
+ return true;
+ }
+
+ head = next;
+ continue;
+ }
+
+ return Volatile.Read(ref head.States[index]) != 1;
+ }
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private Segment RentUnlinkedSegment()
+ {
+ Segment? segment = Interlocked.Exchange(ref _cachedUnlinkedSegment, null);
+ if (segment is null)
+ {
+ return new Segment(_segmentSize);
+ }
+
+ segment.ResetForReuse();
+ return segment;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void ReturnUnlinkedSegment(Segment segment)
+ {
+ segment.ResetForReuse();
+ Interlocked.CompareExchange(ref _cachedUnlinkedSegment, segment, null);
+ }
+
+ private sealed class Segment
+ {
+ internal readonly T[] Items;
+ internal readonly int[] States;
+ internal PaddedInt32 EnqueueIndex;
+ internal int DequeueIndex;
+ internal Segment? Next;
+
+ internal Segment(int size)
+ {
+ Items = new T[size];
+ States = new int[size];
+ ResetForReuse();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void ResetForReuse()
+ {
+ EnqueueIndex.Value = 0;
+ DequeueIndex = 0;
+ Next = null;
+ Array.Clear(States);
+ if (RuntimeHelpers.IsReferenceOrContainsReferences())
+ {
+ Array.Clear(Items);
+ }
+ }
+ }
+
+#if TARGET_ARM64 || TARGET_LOONGARCH64
+ private const int CacheLineWordCount = 16; // 128-byte cache line / sizeof(nint)
+#else
+ private const int CacheLineWordCount = 8; // 64-byte cache line / sizeof(nint)
+#endif
+
+ [InlineArray(CacheLineWordCount - 1)]
+ private struct CacheLinePadding
+ {
+ internal nint _element0;
+ }
+
+ private struct PaddedSegment
+ {
+ internal Segment? Value;
+ internal CacheLinePadding _padding;
+ }
+
+ private struct PaddedInt32
+ {
+ internal int Value;
+ internal CacheLinePadding _padding;
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.IoUring.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.IoUring.Linux.cs
new file mode 100644
index 00000000000000..80b1dfa07abc78
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.IoUring.Linux.cs
@@ -0,0 +1,2501 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Buffers;
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+ internal sealed partial class SocketAsyncContext
+ {
+ private static long s_ioUringNonPinnablePrepareFallbackCount;
+ private const int MultishotAcceptQueueMaxSize = 256;
+ private const int PersistentMultishotRecvDataQueueMaxSize = 16;
+ private ConcurrentQueue? _multishotAcceptQueue;
+ private int _multishotAcceptArmed; // 0=not armed, 1=armed, 2=arming
+ private ulong _multishotAcceptUserData;
+ private ulong _persistentMultishotRecvUserData; // user_data of armed multishot recv SQE
+ private int _persistentMultishotRecvArmed; // 0=not armed, 1=armed
+ private ConcurrentQueue? _persistentMultishotRecvDataQueue;
+ private BufferedPersistentMultishotRecvData? _persistentMultishotRecvDataHead;
+ private int _persistentMultishotRecvDataHeadOffset;
+ private int _persistentMultishotRecvDataQueueCount;
+ private int _persistentMultishotRecvDataConsumerGate;
+
+ private readonly struct BufferedPersistentMultishotRecvData
+ {
+ internal readonly byte[] Data;
+ internal readonly int Length;
+ internal readonly bool UsesPooledBuffer;
+
+ internal BufferedPersistentMultishotRecvData(byte[] data, int length, bool usesPooledBuffer)
+ {
+ Data = data;
+ Length = length;
+ UsesPooledBuffer = usesPooledBuffer;
+ }
+ }
+
+ /// Holds a pre-accepted connection's fd and socket address from a multishot accept CQE.
+ private readonly struct PreAcceptedConnection
+ {
+ internal readonly IntPtr FileDescriptor;
+ internal readonly byte[] SocketAddressData;
+ internal readonly int SocketAddressLength;
+ internal readonly bool UsesPooledBuffer;
+
+ internal PreAcceptedConnection(IntPtr fileDescriptor, byte[] socketAddressData, int socketAddressLength, bool usesPooledBuffer)
+ {
+ FileDescriptor = fileDescriptor;
+ SocketAddressData = socketAddressData;
+ SocketAddressLength = socketAddressLength;
+ UsesPooledBuffer = usesPooledBuffer;
+ }
+ }
+
        /// <summary>Returns whether this context's engine is using io_uring completion mode.</summary>
        /// <remarks>False when no engine has been attached to this context yet.</remarks>
        private bool IsIoUringCompletionModeEnabled()
        {
            SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
            return engine is not null && engine.IsIoUringCompletionModeEnabled;
        }

        /// <summary>Returns the global count of non-pinnable buffer prepare fallbacks for telemetry.</summary>
        internal static long GetIoUringNonPinnablePrepareFallbackCount() =>
            Interlocked.Read(ref s_ioUringNonPinnablePrepareFallbackCount);
+
        /// <summary>Returns whether a multishot accept SQE is currently armed for this context.</summary>
        internal bool IsMultishotAcceptArmed => Volatile.Read(ref _multishotAcceptArmed) != 0;

        /// <summary>Returns the user_data payload for the armed multishot accept SQE, if any.</summary>
        internal ulong MultishotAcceptUserData => Volatile.Read(ref _multishotAcceptUserData);

        /// <summary>Clears multishot accept armed-state for this context.</summary>
        internal void DisarmMultishotAccept()
        {
            // Clear user_data before the armed flag, mirroring the publish order in the arming path.
            Volatile.Write(ref _multishotAcceptUserData, 0);
            Volatile.Write(ref _multishotAcceptArmed, 0);
        }

        /// <summary>Returns whether a persistent multishot recv SQE is currently armed for this context.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal bool IsPersistentMultishotRecvArmed() =>
            Volatile.Read(ref _persistentMultishotRecvArmed) != 0;

        /// <summary>Records that a persistent multishot recv SQE has been armed for this context.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal void SetPersistentMultishotRecvArmed(ulong userData)
        {
            // Publish user_data before the armed flag so any observer of "armed" sees a valid user_data.
            Volatile.Write(ref _persistentMultishotRecvUserData, userData);
            Volatile.Write(ref _persistentMultishotRecvArmed, 1);
        }

        /// <summary>Clears this context's armed persistent multishot recv state.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal void ClearPersistentMultishotRecvArmed()
        {
            Volatile.Write(ref _persistentMultishotRecvUserData, 0);
            Volatile.Write(ref _persistentMultishotRecvArmed, 0);
        }

        /// <summary>Gets the user_data of the armed persistent multishot recv SQE, or 0 if none is armed.</summary>
        internal ulong PersistentMultishotRecvUserData =>
            Volatile.Read(ref _persistentMultishotRecvUserData);
+
+ ///
+ /// Clears persistent multishot recv armed-state and requests ASYNC_CANCEL for
+ /// the armed user_data when available.
+ ///
+ internal void RequestPersistentMultishotRecvCancel()
+ {
+ ulong recvUserData = Volatile.Read(ref _persistentMultishotRecvUserData);
+ ClearPersistentMultishotRecvArmed();
+ if (recvUserData != 0)
+ {
+ SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
+ engine?.TryRequestIoUringCancellation(recvUserData);
+ }
+ }
+
+ /// Copies an early multishot-recv payload into the per-socket replay queue.
+ internal bool TryBufferEarlyPersistentMultishotRecvData(ReadOnlySpan payload)
+ {
+ if (payload.Length == 0)
+ {
+ return true;
+ }
+
+ EnsurePersistentMultishotRecvDataQueueInitialized();
+ ConcurrentQueue? queue = _persistentMultishotRecvDataQueue;
+ if (queue is null)
+ {
+ return false;
+ }
+
+ byte[] copy = ArrayPool.Shared.Rent(payload.Length);
+ payload.CopyTo(copy);
+ if (Interlocked.Increment(ref _persistentMultishotRecvDataQueueCount) > PersistentMultishotRecvDataQueueMaxSize)
+ {
+ Interlocked.Decrement(ref _persistentMultishotRecvDataQueueCount);
+ ArrayPool.Shared.Return(copy);
+ return false;
+ }
+
+ queue.Enqueue(new BufferedPersistentMultishotRecvData(copy, payload.Length, usesPooledBuffer: true));
+ return true;
+ }
+
+ /// Attempts to drain buffered multishot-recv payload into the caller destination.
+ internal bool TryConsumeBufferedPersistentMultishotRecvData(Memory destination, out int bytesTransferred)
+ {
+ bytesTransferred = 0;
+ if (destination.Length == 0)
+ {
+ return false;
+ }
+
+ EnterPersistentMultishotRecvDataConsumerGate();
+ try
+ {
+ if (!TryAcquirePersistentMultishotRecvDataHead(out BufferedPersistentMultishotRecvData buffered))
+ {
+ return false;
+ }
+
+ int headOffset = _persistentMultishotRecvDataHeadOffset;
+ int remaining = buffered.Length - headOffset;
+ if (remaining <= 0)
+ {
+ ReleasePersistentMultishotRecvDataHead();
+ return false;
+ }
+
+ int toCopy = Math.Min(destination.Length, remaining);
+ buffered.Data.AsSpan(headOffset, toCopy).CopyTo(destination.Span);
+ _persistentMultishotRecvDataHeadOffset = headOffset + toCopy;
+ bytesTransferred = toCopy;
+
+ if (_persistentMultishotRecvDataHeadOffset >= buffered.Length)
+ {
+ ReleasePersistentMultishotRecvDataHead();
+ }
+
+ return true;
+ }
+ finally
+ {
+ ExitPersistentMultishotRecvDataConsumerGate();
+ }
+ }
+
+ /// Ensures the pre-accepted connection queue exists.
+ private void EnsureMultishotAcceptQueueInitialized()
+ {
+ if (_multishotAcceptQueue is null)
+ {
+ Interlocked.CompareExchange(ref _multishotAcceptQueue, new ConcurrentQueue(), null);
+ }
+ }
+
+ ///
+ /// Attempts to enqueue a pre-accepted connection from a multishot accept CQE.
+ /// Caller is responsible for closing when this returns false.
+ ///
+ internal bool TryEnqueuePreAcceptedConnection(IntPtr acceptedFd, ReadOnlySpan socketAddressData, int socketAddressLen)
+ {
+ EnsureMultishotAcceptQueueInitialized();
+ ConcurrentQueue? queue = _multishotAcceptQueue;
+ if (queue is null || queue.Count >= MultishotAcceptQueueMaxSize)
+ {
+ return false;
+ }
+
+ int length = socketAddressLen;
+ if (length < 0)
+ {
+ length = 0;
+ }
+
+ if ((uint)length > (uint)socketAddressData.Length)
+ {
+ length = socketAddressData.Length;
+ }
+
+ byte[] copy;
+ if (length != 0)
+ {
+ copy = ArrayPool.Shared.Rent(length);
+ socketAddressData.Slice(0, length).CopyTo(copy);
+ }
+ else
+ {
+ copy = Array.Empty();
+ }
+
+ queue.Enqueue(new PreAcceptedConnection(acceptedFd, copy, length, usesPooledBuffer: length != 0));
+ return true;
+ }
+
+ ///
+ /// Attempts to dequeue a pre-accepted connection from the multishot accept queue.
+ /// Returns true if a connection was available, populating the operation fields.
+ ///
+ internal bool TryDequeuePreAcceptedConnection(AcceptOperation operation)
+ {
+ EnsureMultishotAcceptQueueInitialized();
+ ConcurrentQueue? queue = _multishotAcceptQueue;
+ if (queue is null || !queue.TryDequeue(out PreAcceptedConnection accepted))
+ {
+ return false;
+ }
+
+ try
+ {
+ operation.AcceptedFileDescriptor = accepted.FileDescriptor;
+ int socketAddressLen = accepted.SocketAddressLength;
+ if ((uint)socketAddressLen > (uint)operation.SocketAddress.Length)
+ {
+ socketAddressLen = operation.SocketAddress.Length;
+ }
+
+ if (socketAddressLen != 0)
+ {
+ accepted.SocketAddressData.AsSpan(0, socketAddressLen).CopyTo(operation.SocketAddress.Span);
+ }
+
+ operation.AcceptSocketAddressLength = socketAddressLen;
+ operation.SocketAddress = operation.SocketAddress.Slice(0, socketAddressLen);
+ operation.ErrorCode = SocketError.Success;
+ return true;
+ }
+ finally
+ {
+ ReturnPooledBufferIfNeeded(accepted.SocketAddressData, accepted.UsesPooledBuffer);
+ }
+ }
+
        /// <summary>Removes a completed io_uring operation from its queue and signals or dispatches its callback.</summary>
        /// <returns><see langword="false"/> when the operation could not be removed from its queue (e.g. already removed).</returns>
        internal bool TryCompleteIoUringOperation(AsyncOperation operation)
        {
            bool removed =
                operation is ReadOperation readOperation ? _receiveQueue.TryRemoveCompletedOperation(this, readOperation) :
                operation is WriteOperation writeOperation ? _sendQueue.TryRemoveCompletedOperation(this, writeOperation) :
                false;
            if (!removed)
            {
                return false;
            }

            // Synchronous waiters are unblocked directly; no callback dispatch or registration cleanup here.
            ManualResetEventSlim? e = operation.Event;
            if (e is not null)
            {
                e.Set();
                return true;
            }

            operation.CancellationRegistration.Dispose();
            if (operation.ShouldDispatchCallback)
            {
                // Dispatch off the event loop thread so completion handling never blocks CQE processing.
                ThreadPool.UnsafeQueueUserWorkItem(static o => ((AsyncOperation)o!).InvokeCallback(allowPooling: true), operation, preferLocal: false);
            }

            return true;
        }

        /// <summary>Enqueues an operation for deferred SQE preparation on the event loop thread.</summary>
        private bool TryEnqueueIoUringPreparation(AsyncOperation operation, long prepareSequence)
        {
            SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
            return engine is not null && engine.TryEnqueueIoUringPreparation(operation, prepareSequence);
        }
+
        /// <summary>Applies cancellation and/or untracking to an operation's io_uring state.</summary>
        /// <param name="requestKernelCancellation">Whether to request an ASYNC_CANCEL for the tracked user_data.</param>
        /// <param name="untrackAndClear">Whether to remove the engine registry entry and clear the operation's user_data.</param>
        private void HandleIoUringCancellationTransition(
            AsyncOperation operation,
            bool requestKernelCancellation,
            bool untrackAndClear)
        {
            SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
            ulong userData = operation.IoUringUserData;
            if (userData == 0)
            {
                // Operation is not tracked with the engine; nothing to cancel or clear.
                return;
            }

            if (requestKernelCancellation)
            {
                engine?.TryRequestIoUringCancellation(userData);
            }

            if (untrackAndClear)
            {
                // Only clear when untracking succeeded (or there is no engine); otherwise the
                // engine still owns the registry entry and will release resources later.
                bool clearAllowed = engine?.TryUntrackIoUringOperation(userData, operation) ?? true;
                if (clearAllowed)
                {
                    operation.ClearIoUringUserData();
                }
            }
        }

        /// <summary>Requests kernel-level ASYNC_CANCEL for an in-flight operation.</summary>
        private void TryRequestIoUringCancellation(AsyncOperation operation)
        {
            HandleIoUringCancellationTransition(
                operation,
                requestKernelCancellation: true,
                untrackAndClear: false);
        }

        /// <summary>Removes an operation from the registry and clears its user_data.</summary>
        internal void TryUntrackIoUringOperation(AsyncOperation operation)
        {
            HandleIoUringCancellationTransition(
                operation,
                requestKernelCancellation: false,
                untrackAndClear: true);
        }

        /// <summary>Stages an operation for io_uring preparation if completion mode is active.</summary>
        static partial void LinuxTryStageIoUringOperation(AsyncOperation operation)
        {
            // Event-based (synchronous-wait) operations stay on the readiness path.
            if (operation.Event is null && operation.AssociatedContext.IsIoUringCompletionModeEnabled())
            {
                if (!operation.TryQueueIoUringPreparation())
                {
                    operation.EmitReadinessFallbackForQueueOverflow();
                }
            }
        }
+
        /// <summary>Bridges the platform-neutral accept path to the multishot pre-accepted connection queue.</summary>
        partial void LinuxTryDequeuePreAcceptedConnection(AcceptOperation operation, ref bool dequeued)
        {
            dequeued = TryDequeuePreAcceptedConnection(operation);
        }
+
+ partial void LinuxTryConsumeBufferedPersistentMultishotRecvData(Memory destination, ref bool consumed, ref int bytesTransferred)
+ {
+ consumed = TryConsumeBufferedPersistentMultishotRecvData(destination, out bytesTransferred);
+ }
+
        /// <summary>Cleans up multishot-accept state and queued pre-accepted descriptors during abort.</summary>
        partial void LinuxOnStopAndAbort()
        {
            SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
            if (IsPersistentMultishotRecvArmed())
            {
                RequestPersistentMultishotRecvCancel();
            }

            ulong armedUserData = GetArmedMultishotAcceptUserDataForCancellation();
            if (engine is not null && armedUserData != 0)
            {
                engine.TryRequestIoUringCancellation(armedUserData);
            }

            DisarmMultishotAccept();

            // Close accepted-but-undelivered fds so they do not leak past socket teardown.
            if (_multishotAcceptQueue is not null)
            {
                while (_multishotAcceptQueue.TryDequeue(out PreAcceptedConnection accepted))
                {
                    Interop.Sys.Close(accepted.FileDescriptor);
                    ReturnPooledBufferIfNeeded(accepted.SocketAddressData, accepted.UsesPooledBuffer);
                }
            }

            // Drain buffered recv payloads under the consumer gate to avoid racing a concurrent reader.
            EnterPersistentMultishotRecvDataConsumerGate();
            try
            {
                ReleasePersistentMultishotRecvDataHead();

                if (_persistentMultishotRecvDataQueue is not null)
                {
                    while (_persistentMultishotRecvDataQueue.TryDequeue(out BufferedPersistentMultishotRecvData buffered))
                    {
                        Interlocked.Decrement(ref _persistentMultishotRecvDataQueueCount);
                        ReturnPooledBufferIfNeeded(buffered.Data, buffered.UsesPooledBuffer);
                    }
                }
            }
            finally
            {
                ExitPersistentMultishotRecvDataConsumerGate();
            }
        }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void EnsurePersistentMultishotRecvDataQueueInitialized()
+ {
+ if (_persistentMultishotRecvDataQueue is null)
+ {
+ Interlocked.CompareExchange(
+ ref _persistentMultishotRecvDataQueue,
+ new ConcurrentQueue(),
+ comparand: null);
+ }
+ }
+
        /// <summary>Gets the current head payload, dequeuing the next buffered payload if no head is held.</summary>
        /// <remarks>Must be called while holding the consumer gate.</remarks>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private bool TryAcquirePersistentMultishotRecvDataHead(out BufferedPersistentMultishotRecvData buffered)
        {
            if (_persistentMultishotRecvDataHead is BufferedPersistentMultishotRecvData existingHead)
            {
                buffered = existingHead;
                return true;
            }

            if (_persistentMultishotRecvDataQueue is null ||
                !_persistentMultishotRecvDataQueue.TryDequeue(out BufferedPersistentMultishotRecvData dequeued))
            {
                buffered = default;
                return false;
            }

            _persistentMultishotRecvDataHead = dequeued;
            _persistentMultishotRecvDataHeadOffset = 0;
            buffered = dequeued;
            return true;
        }

        /// <summary>Releases the current head payload, returning its pooled buffer and freeing a queue slot.</summary>
        /// <remarks>Must be called while holding the consumer gate. No-op when no head is held.</remarks>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private void ReleasePersistentMultishotRecvDataHead()
        {
            if (_persistentMultishotRecvDataHead is not BufferedPersistentMultishotRecvData head)
            {
                return;
            }

            _persistentMultishotRecvDataHead = null;
            _persistentMultishotRecvDataHeadOffset = 0;
            Interlocked.Decrement(ref _persistentMultishotRecvDataQueueCount);
            ReturnPooledBufferIfNeeded(head.Data, head.UsesPooledBuffer);
        }

        /// <summary>Acquires the single-consumer spin gate protecting head/offset state.</summary>
        private void EnterPersistentMultishotRecvDataConsumerGate()
        {
            SpinWait spinWait = default;
            while (Interlocked.CompareExchange(
                ref _persistentMultishotRecvDataConsumerGate,
                value: 1,
                comparand: 0) != 0)
            {
                spinWait.SpinOnce();
            }
        }

        /// <summary>Releases the consumer gate acquired by <see cref="EnterPersistentMultishotRecvDataConsumerGate"/>.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private void ExitPersistentMultishotRecvDataConsumerGate()
        {
            Volatile.Write(ref _persistentMultishotRecvDataConsumerGate, 0);
        }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void ReturnPooledBufferIfNeeded(byte[] buffer, bool usesPooledBuffer)
+ {
+ if (usesPooledBuffer)
+ {
+ ArrayPool.Shared.Return(buffer);
+ }
+ }
+
        /// <summary>
        /// Gets the armed multishot-accept user_data for cancellation, briefly spinning
        /// when the armed flag is set but user_data has not been published yet.
        /// </summary>
        private ulong GetArmedMultishotAcceptUserDataForCancellation()
        {
            ulong userData = Volatile.Read(ref _multishotAcceptUserData);
            if (userData != 0 || Volatile.Read(ref _multishotAcceptArmed) == 0)
            {
                return userData;
            }

            // Arming is in flight on another thread: bounded spin (64 iterations) to let it publish
            // user_data; may still return 0 if arming does not finish in time.
            SpinWait spinner = default;
            for (int i = 0; i < 64; i++)
            {
                spinner.SpinOnce();
                userData = Volatile.Read(ref _multishotAcceptUserData);
                if (userData != 0 || Volatile.Read(ref _multishotAcceptArmed) == 0)
                {
                    break;
                }
            }

            return userData;
        }
+
+ internal abstract partial class AsyncOperation
+ {
            /// <summary>Outcome of processing an io_uring CQE, determining the dispatch action.</summary>
            internal enum IoUringCompletionResult
            {
                Completed = 0,
                Pending = 1,
                Canceled = 2,
                Ignored = 3
            }

            /// <summary>Tri-state result from direct (managed) SQE preparation.</summary>
            internal enum IoUringDirectPrepareResult
            {
                Unsupported = 0, // Direct path unavailable for this shape; caller keeps operation pending.
                Prepared = 1, // SQE written
                PrepareFailed = 2 // Direct preparation failed; caller handles retry/fallback without native prepare.
            }

            /// <summary>Tracks whether a receive operation prepared as one-shot or multishot.</summary>
            internal enum IoUringReceiveSubmissionMode : byte
            {
                None = 0,
                OneShot = 1,
                Multishot = 2
            }

            // Generation counter matching queued prepare requests to the operation lifetime that queued them.
            private long _ioUringPrepareSequence;
            private int _ioUringPrepareQueued; // 1 while a prepare request is queued and unclaimed
            private int _ioUringPreparationReusable; // 1 when pinned resources may be reused by the next prepare
            private MemoryHandle _ioUringPinnedBuffer;
            private int _ioUringPinnedBufferActive; // 1 while _ioUringPinnedBuffer holds a live pin
            private int _ioUringCompletionSocketAddressLen;
            private int _ioUringCompletionControlBufferLen;
            private int _ioUringReceiveSubmissionMode; // stores an IoUringReceiveSubmissionMode value
            private int _ioUringSlotExhaustionRetryCount;
            internal ulong IoUringUserData; // nonzero while tracked by the engine; key for routing CQEs
+
            /// <summary>Requests kernel cancellation if the flag is set.</summary>
            partial void LinuxRequestIoUringCancellationIfNeeded(bool requestIoUringCancellation)
            {
                if (requestIoUringCancellation)
                {
                    AssociatedContext.TryRequestIoUringCancellation(this);
                }
            }

            /// <summary>Untracks this operation unless it is in the Canceled state awaiting a terminal CQE.</summary>
            partial void LinuxUntrackIoUringOperation()
            {
                // Canceled operations remain tracked until the terminal CQE arrives so that
                // pinned/user-owned resources are not released while the kernel may still
                // reference them. Dispatch will clear resources on that terminal completion.
                if (_state == State.Canceled)
                {
                    return;
                }

                AssociatedContext.TryUntrackIoUringOperation(this);
            }

            /// <summary>Resets all io_uring preparation state and advances the prepare sequence.</summary>
            partial void ResetIoUringState()
            {
                ReleaseIoUringPreparationResources();
                IoUringUserData = 0;
                Volatile.Write(ref _ioUringPreparationReusable, 0);
                _ioUringCompletionSocketAddressLen = 0;
                _ioUringCompletionControlBufferLen = 0;
                _ioUringReceiveSubmissionMode = (int)IoUringReceiveSubmissionMode.None;
                _ioUringSlotExhaustionRetryCount = 0;
                long nextPrepareSequence = unchecked(_ioUringPrepareSequence + 1);
                // Keep sequence strictly positive so stale queued work from previous resets never matches.
                if (nextPrepareSequence <= 0)
                {
                    nextPrepareSequence = 1;
                }

                Volatile.Write(ref _ioUringPrepareSequence, nextPrepareSequence);
                Volatile.Write(ref _ioUringPrepareQueued, 0);
            }
+
            /// <summary>Marks this operation as ready for SQE preparation and returns its sequence number.</summary>
            internal long MarkReadyForIoUringPreparation()
            {
                long prepareSequence = Volatile.Read(ref _ioUringPrepareSequence);
                Debug.Assert(prepareSequence > 0);
                Volatile.Write(ref _ioUringPrepareQueued, 1);
                return prepareSequence;
            }

            /// <summary>Cancels a pending preparation if the sequence number still matches.</summary>
            internal void CancelPendingIoUringPreparation(long prepareSequence)
            {
                // A mismatched sequence means the operation was reset/reused; leave the new generation's flag alone.
                if (Volatile.Read(ref _ioUringPrepareSequence) == prepareSequence)
                {
                    Volatile.Write(ref _ioUringPrepareQueued, 0);
                }
            }
+
            /// <summary>Attempts to prepare an SQE for this operation via the managed direct path.</summary>
            /// <param name="context">Owning socket context.</param>
            /// <param name="prepareSequence">Generation number captured when the prepare was queued.</param>
            /// <returns><see langword="true"/> only when an SQE was actually written for this generation.</returns>
            internal bool TryPrepareIoUring(SocketAsyncContext context, long prepareSequence)
            {
                // Claim the queued-prepare flag atomically; bail if the request is stale,
                // already claimed, or the operation is no longer waiting.
                if (prepareSequence <= 0 ||
                    Volatile.Read(ref _ioUringPrepareSequence) != prepareSequence ||
                    Interlocked.Exchange(ref _ioUringPrepareQueued, 0) == 0 ||
                    _state != State.Waiting)
                {
                    return false;
                }

                // Release stale pinned resources unless the previous prepare marked them reusable.
                if (Interlocked.Exchange(ref _ioUringPreparationReusable, 0) == 0)
                {
                    ReleaseIoUringPreparationResources();
                }

                SocketAsyncEngine? engine = Volatile.Read(ref context._asyncEngine);
                if (engine is null || !engine.IsIoUringDirectSqeEnabled)
                {
                    // Managed completion mode assumes direct SQE submission.
                    // If direct submission is unavailable, keep operation pending for fallback handling.
                    ErrorCode = SocketError.Success;
                    IoUringUserData = 0;
                    return false;
                }

                IoUringDirectPrepareResult directResult = IoUringPrepareDirect(context, engine, out ulong directUserData);
                if (directResult == IoUringDirectPrepareResult.Prepared)
                {
                    _ioUringSlotExhaustionRetryCount = 0;
                    // Only track user_data on success; a failed prepare must not route future CQEs here.
                    IoUringUserData = ErrorCode == SocketError.Success ? directUserData : 0;
                    return true;
                }

                if (directResult == IoUringDirectPrepareResult.PrepareFailed)
                {
                    IoUringUserData = 0;
                    return false;
                }

                // Direct preparation unsupported for this operation shape.
                // Leave operation pending so caller can use completion-path fallback semantics.
                ErrorCode = SocketError.Success;
                IoUringUserData = 0;
                return false;
            }
+
            /// <summary>Queues this operation for deferred preparation on the event loop thread.</summary>
            /// <returns><see langword="false"/> when completion mode is off or the prepare queue rejected the work.</returns>
            internal bool TryQueueIoUringPreparation()
            {
                if (!AssociatedContext.IsIoUringCompletionModeEnabled())
                {
                    return false;
                }

                long prepareSequence = MarkReadyForIoUringPreparation();
                if (AssociatedContext.TryEnqueueIoUringPreparation(this, prepareSequence))
                {
                    return true;
                }

                // Enqueue failed: undo the ready flag for this generation so it is not claimed later.
                CancelPendingIoUringPreparation(prepareSequence);
                return false;
            }

            /// <summary>Returns whether this operation is currently in the waiting state.</summary>
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            internal bool IsInWaitingState() => _state == State.Waiting;

            /// <summary>Increments and returns the slot-exhaustion retry count for this operation.</summary>
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            internal int IncrementIoUringSlotExhaustionRetryCount() => ++_ioUringSlotExhaustionRetryCount;

            /// <summary>Resets slot-exhaustion retry tracking for this operation.</summary>
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            internal void ResetIoUringSlotExhaustionRetryCount() => _ioUringSlotExhaustionRetryCount = 0;
+
            /// <summary>
            /// Emits a readiness fallback event when io_uring prepare-queue staging fails,
            /// so the operation still makes progress via the readiness (epoll-style) path.
            /// </summary>
            internal void EmitReadinessFallbackForQueueOverflow()
            {
                Interop.Sys.SocketEvents fallbackEvents = GetIoUringFallbackSocketEvents();
                if (fallbackEvents == Interop.Sys.SocketEvents.None)
                {
                    // Operation shape has no readiness equivalent; nothing to emit.
                    return;
                }

                SocketAsyncContext context = AssociatedContext;
                SocketAsyncEngine? engine = Volatile.Read(ref context._asyncEngine);
                if (engine is null)
                {
                    return;
                }

                engine.EnqueueReadinessFallbackEvent(
                    context,
                    fallbackEvents,
                    countAsPrepareQueueOverflowFallback: true);
            }
+
            /// <summary>Processes a CQE result and returns the dispatch action for the completion handler.</summary>
            /// <param name="result">CQE res field (byte count or negative errno).</param>
            /// <param name="flags">CQE flags field.</param>
            /// <param name="auxiliaryData">Extra CQE payload (operation-specific).</param>
            internal IoUringCompletionResult ProcessIoUringCompletionResult(int result, uint flags, uint auxiliaryData)
            {
                Trace($"Enter, result={result}, flags={flags}, auxiliaryData={auxiliaryData}");

                // Claim ownership of completion processing; if cancellation already won, do not publish completion.
                State oldState = Interlocked.CompareExchange(ref _state, State.Running, State.Waiting);
                if (oldState == State.Canceled)
                {
                    Trace("Exit, previously canceled");
                    return IoUringCompletionResult.Canceled;
                }

                if (oldState != State.Waiting)
                {
                    // Another path (e.g. readiness processing) already owns this operation.
                    Trace("Exit, ignored");
                    return IoUringCompletionResult.Ignored;
                }

                if (ProcessIoUringCompletion(AssociatedContext, result, flags, auxiliaryData))
                {
                    _state = State.Complete;
                    Trace("Exit, completed");
                    return IoUringCompletionResult.Completed;
                }

                // Incomplete path (e.g. transient retry): mirror TryComplete state transition handling.
                State newState;
                while (true)
                {
                    State state = _state;
                    Debug.Assert(state is State.Running or State.RunningWithPendingCancellation, $"Unexpected operation state: {(State)state}");

                    // Running -> Waiting (retry later); RunningWithPendingCancellation -> Canceled.
                    newState = (state == State.Running ? State.Waiting : State.Canceled);
                    if (state == Interlocked.CompareExchange(ref _state, newState, state))
                    {
                        break;
                    }
                }

                if (newState == State.Canceled)
                {
                    ProcessCancellation();
                    Trace("Exit, canceled while pending");
                    return IoUringCompletionResult.Canceled;
                }

                Trace("Exit, pending");
                return IoUringCompletionResult.Pending;
            }
+
+ /// Stores recvmsg output lengths from the CQE for post-completion processing.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void SetIoUringCompletionMessageMetadata(int socketAddressLen, int controlBufferLen)
+ {
+ _ioUringCompletionSocketAddressLen = socketAddressLen;
+ _ioUringCompletionControlBufferLen = controlBufferLen;
+ }
+
/// <summary>Releases all preparation resources and returns the operation to its unprepared
/// state (user_data zeroed, completion metadata and submission bookkeeping cleared).</summary>
internal void ClearIoUringUserData()
{
    // Release pins / native allocations before wiping the bookkeeping fields.
    ReleaseIoUringPreparationResources();

    IoUringUserData = 0;
    Volatile.Write(ref _ioUringPreparationReusable, 0);

    _ioUringCompletionControlBufferLen = 0;
    _ioUringCompletionSocketAddressLen = 0;
    _ioUringSlotExhaustionRetryCount = 0;
    _ioUringReceiveSubmissionMode = (int)IoUringReceiveSubmissionMode.None;
}
+
/// <summary>Clears user_data and completion metadata WITHOUT releasing preparation resources,
/// so a pending requeue can reuse the existing pins.</summary>
internal void ResetIoUringUserDataForRequeue()
{
    _ioUringCompletionControlBufferLen = 0;
    _ioUringCompletionSocketAddressLen = 0;
    IoUringUserData = 0;
}
+
/// <summary>Records whether the current receive preparation uses one-shot or multishot mode.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
protected void SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode mode)
{
    int encoded = (int)mode;
    // Volatile so completion-side readers observe the mode chosen at prepare time.
    Volatile.Write(ref _ioUringReceiveSubmissionMode, encoded);
}
+
/// <summary>Marks preparation resources as reusable so the next prepare skips re-pinning.</summary>
internal void MarkIoUringPreparationReusable() =>
    Volatile.Write(ref _ioUringPreparationReusable, 1);
+
/// <summary>Socket address length reported by the kernel in the CQE.</summary>
protected int IoUringCompletionSocketAddressLen
{
    get { return _ioUringCompletionSocketAddressLen; }
}

/// <summary>Control buffer length reported by the kernel in the CQE.</summary>
protected int IoUringCompletionControlBufferLen
{
    get { return _ioUringCompletionControlBufferLen; }
}
+
/// <summary>Pins <paramref name="buffer"/> and returns its raw pointer, recording the pin
/// handle for later release. Returns null for an empty buffer.</summary>
protected unsafe byte* PinIoUringBuffer(Memory<byte> buffer)
{
    // Drop any previous pin before taking a new one.
    ReleasePinnedIoUringBuffer();

    if (buffer.IsEmpty)
    {
        return null;
    }

    MemoryHandle handle = buffer.Pin();
    _ioUringPinnedBuffer = handle;
    Volatile.Write(ref _ioUringPinnedBufferActive, 1);
    return (byte*)handle.Pointer;
}
+
/// <summary>Attempts to pin <paramref name="buffer"/> (or reuse an already-active pin),
/// falling back to the readiness path (returns false, ErrorCode=Success) when the memory
/// is not pinnable or the pin yields no pointer.</summary>
protected unsafe bool TryPinIoUringBuffer(Memory<byte> buffer, out byte* pinnedBuffer)
{
    bool reusingExistingPin = Volatile.Read(ref _ioUringPinnedBufferActive) != 0;

    if (reusingExistingPin)
    {
        pinnedBuffer = (byte*)_ioUringPinnedBuffer.Pointer;
    }
    else
    {
        try
        {
            pinnedBuffer = PinIoUringBuffer(buffer);
        }
        catch (NotSupportedException)
        {
            // Memory backed by a non-pinnable MemoryManager; use the readiness fallback.
            pinnedBuffer = null;
            RecordIoUringNonPinnablePrepareFallback("pin-not-supported", buffer.Length);
            ErrorCode = SocketError.Success;
            return false;
        }
    }

    if (buffer.Length > 0 && pinnedBuffer is null)
    {
        // A non-empty buffer whose pin produced no pointer is unusable for an SQE.
        ReleasePinnedIoUringBuffer();
        RecordIoUringNonPinnablePrepareFallback(
            reusingExistingPin ? "null-reused-pin-pointer" : "null-pin-pointer",
            buffer.Length);
        ErrorCode = SocketError.Success;
        return false;
    }

    return true;
}
+
/// <summary>Transfers ownership of the active pinned buffer to the caller; returns a
/// default handle when no pin is active.</summary>
internal MemoryHandle TransferPinnedBuffer()
{
    bool wasActive = Interlocked.Exchange(ref _ioUringPinnedBufferActive, 0) != 0;
    if (!wasActive)
    {
        return default;
    }

    MemoryHandle transferred = _ioUringPinnedBuffer;
    _ioUringPinnedBuffer = default;
    return transferred;
}
+
/// <summary>
/// Attempts to pin a socket address buffer, reusing an existing pin when one is active.
/// Caller is responsible for setting operation ErrorCode on failure if needed.
/// </summary>
protected static unsafe bool TryPinIoUringSocketAddress(
    Memory<byte> socketAddress,
    ref MemoryHandle pinnedSocketAddress,
    ref int pinnedSocketAddressActive,
    out byte* rawSocketAddress)
{
    rawSocketAddress = null;
    if (socketAddress.IsEmpty)
    {
        // Nothing to pin; callers treat a null pointer + success as "no address".
        return true;
    }

    bool alreadyPinned = Volatile.Read(ref pinnedSocketAddressActive) != 0;
    if (!alreadyPinned)
    {
        try
        {
            pinnedSocketAddress = socketAddress.Pin();
        }
        catch (NotSupportedException)
        {
            // Non-pinnable memory; leave the active flag untouched (it was 0).
            return false;
        }

        Volatile.Write(ref pinnedSocketAddressActive, 1);
    }

    rawSocketAddress = (byte*)pinnedSocketAddress.Pointer;
    if (rawSocketAddress is not null)
    {
        return true;
    }

    // Defensive: a pin that yields no pointer is unusable — tear it down so a later
    // attempt starts from a clean state.
    pinnedSocketAddress.Dispose();
    pinnedSocketAddress = default;
    Volatile.Write(ref pinnedSocketAddressActive, 0);
    return false;
}
+
/// <summary>
/// Pins a socket address buffer for SQE preparation, normalizing pinning failures into a
/// non-terminal fallback signal (false with ErrorCode=Success).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
protected unsafe bool TryPinIoUringSocketAddressForPrepare(
    Memory<byte> socketAddress,
    ref MemoryHandle pinnedSocketAddress,
    ref int pinnedSocketAddressActive,
    out byte* rawSocketAddress)
{
    bool pinned = TryPinIoUringSocketAddress(
        socketAddress,
        ref pinnedSocketAddress,
        ref pinnedSocketAddressActive,
        out rawSocketAddress);

    if (!pinned)
    {
        // Not an error from the caller's perspective; it will fall back to readiness mode.
        ErrorCode = SocketError.Success;
    }

    return pinned;
}
+
/// <summary>Releases an operation-owned pinned socket-address buffer and frees the native
/// message-header allocation. Safe to call multiple times (Interlocked.Exchange makes both
/// releases idempotent).</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
protected static unsafe void ReleaseIoUringSocketAddressAndMessageHeader(
    ref MemoryHandle pinnedSocketAddress,
    ref int pinnedSocketAddressActive,
    ref IntPtr messageHeader)
{
    bool addressWasPinned = Interlocked.Exchange(ref pinnedSocketAddressActive, 0) != 0;
    if (addressWasPinned)
    {
        pinnedSocketAddress.Dispose();
        pinnedSocketAddress = default;
    }

    IntPtr headerToFree = Interlocked.Exchange(ref messageHeader, IntPtr.Zero);
    if (headerToFree != IntPtr.Zero)
    {
        NativeMemory.Free((void*)headerToFree);
    }
}
+
/// <summary>Counts a non-pinnable-buffer prepare fallback and emits a sampled log entry.</summary>
/// <param name="reason">Short machine-readable cause tag (e.g. "pin-not-supported").</param>
/// <param name="bufferLength">Length of the buffer that could not be pinned.</param>
private void RecordIoUringNonPinnablePrepareFallback(string reason, int bufferLength)
{
    if (!AssociatedContext.IsIoUringCompletionModeEnabled())
    {
        return;
    }

    long count = Interlocked.Increment(ref s_ioUringNonPinnablePrepareFallbackCount);

    // Sample: log the 1st, 65th, 129th, ... occurrence to bound log volume.
    if (NetEventSource.Log.IsEnabled() && (count & 0x3F) == 1)
    {
        Log(reason, bufferLength, count);
    }

    // NoInlining keeps the interpolation/allocation off the common (non-logging) path.
    [MethodImpl(MethodImplOptions.NoInlining)]
    void Log(string fallbackReason, int fallbackBufferLength, long fallbackCount) =>
        NetEventSource.Info(
            AssociatedContext,
            $"io_uring prepare fallback due to non-pinnable buffer: reason={fallbackReason}, length={fallbackBufferLength}, count={fallbackCount}");
}
+
/// <summary>Releases the currently pinned buffer handle, if one is active. Idempotent.</summary>
private void ReleasePinnedIoUringBuffer()
{
    bool wasActive = Interlocked.Exchange(ref _ioUringPinnedBufferActive, 0) != 0;
    if (!wasActive)
    {
        return;
    }

    _ioUringPinnedBuffer.Dispose();
    _ioUringPinnedBuffer = default;
}
+
/// <summary>Releases the pinned buffer when the operation shape (single buffer vs buffer
/// list) changes, so a stale pin cannot be reused for the new shape.</summary>
protected void ReleaseIoUringPinnedBufferForShapeTransition()
{
    ReleasePinnedIoUringBuffer();
}
+
/// <summary>Releases all preparation resources: the shared pinned buffer first, then the
/// subclass-specific resources via the <c>Core</c> hook.</summary>
private void ReleaseIoUringPreparationResources()
{
    ReleasePinnedIoUringBuffer();
    ReleaseIoUringPreparationResourcesCore();
}
+
/// <summary>
/// Subclass hook invoked from <c>ReleaseIoUringPreparationResources</c> to release
/// operation-specific preparation resources (socket-address pins, native message headers,
/// pinned buffer-list handles). The base implementation intentionally does nothing.
/// </summary>
protected virtual void ReleaseIoUringPreparationResourcesCore()
{
}
+
/// <summary>Frees the first <paramref name="count"/> GCHandles used for buffer-list pinning.
/// Tolerates count &lt;= 0 (no-op) and clamps to the array length defensively.</summary>
protected static void ReleasePinnedHandles(GCHandle[] pinnedHandles, int count)
{
    int releaseCount = Math.Min(count, pinnedHandles.Length);
    for (int i = 0; i < releaseCount; i++)
    {
        // Free in place so the array element itself is reset (no double-free on re-entry).
        if (pinnedHandles[i].IsAllocated)
        {
            pinnedHandles[i].Free();
        }
    }
}
+
/// <summary>Rents an array from the shared pool for temporary io_uring preparation use;
/// a zero-length request yields the cached empty array (never rented).</summary>
private static T[] RentIoUringArray<T>(int minimumLength)
{
    if (minimumLength == 0)
    {
        return Array.Empty<T>();
    }

    return ArrayPool<T>.Shared.Rent(minimumLength);
}
+
/// <summary>Returns a rented array to the shared pool. The zero-length sentinel from
/// <c>RentIoUringArray</c> was never rented, so it is skipped.</summary>
private static void ReturnIoUringArray<T>(T[] array, bool clearArray = false)
{
    if (array.Length == 0)
    {
        return;
    }

    ArrayPool<T>.Shared.Return(array, clearArray);
}
+
/// <summary>Releases pinned buffer-list handles and returns both rented arrays (handles and
/// iovecs) to the pool. Interlocked.Exchange makes the release idempotent.</summary>
protected static void ReleaseIoUringPinnedHandlesAndIovecs(
    ref GCHandle[]? pinnedHandles,
    ref Interop.Sys.IOVector[]? iovecs,
    ref int pinnedHandleCount)
{
    GCHandle[]? handles = Interlocked.Exchange(ref pinnedHandles, null);
    int handleCount = Interlocked.Exchange(ref pinnedHandleCount, 0);
    Interop.Sys.IOVector[]? vectors = Interlocked.Exchange(ref iovecs, null);

    if (handles is not null)
    {
        ReleasePinnedHandles(handles, handleCount);
        ReturnIoUringArray(handles);
    }

    if (vectors is not null)
    {
        ReturnIoUringArray(vectors);
    }
}
+
/// <summary>
/// Pins a list of buffer segments starting at (startIndex, startOffset) and builds an iovec
/// array for scatter/gather I/O. On failure all handles are freed and the rented arrays are
/// returned to the pool; the out arrays must then not be used by the caller.
/// </summary>
protected static unsafe bool TryPinBufferListForIoUring(
    IList<ArraySegment<byte>> buffers,
    int startIndex,
    int startOffset,
    out GCHandle[] pinnedHandles,
    out Interop.Sys.IOVector[] iovecs,
    out int iovCount,
    out int pinnedHandleCount,
    out SocketError errorCode)
{
    iovCount = 0;
    pinnedHandleCount = 0;

    // Unsigned compare also rejects negative startIndex; startIndex == Count yields an
    // empty (zero-iovec) success below.
    if ((uint)startIndex > (uint)buffers.Count)
    {
        pinnedHandles = Array.Empty<GCHandle>();
        iovecs = Array.Empty<Interop.Sys.IOVector>();
        errorCode = SocketError.InvalidArgument;
        return false;
    }

    int segmentCount = buffers.Count - startIndex;
    pinnedHandles = RentIoUringArray<GCHandle>(segmentCount);
    iovecs = RentIoUringArray<Interop.Sys.IOVector>(segmentCount);

    // Adjacent segments frequently share a backing array; reuse the previous pin then.
    byte[]? previousArray = null;
    GCHandle previousHandle = default;
    int offset = startOffset;

    try
    {
        // offset applies only to the first segment; reset to 0 for every later one.
        for (int i = 0; i < segmentCount; i++, offset = 0)
        {
            ArraySegment<byte> segment = buffers[startIndex + i];
            RangeValidationHelpers.ValidateSegment(segment);

            if ((uint)offset > (uint)segment.Count)
            {
                Cleanup(pinnedHandles, iovecs, pinnedHandleCount);
                errorCode = SocketError.InvalidArgument;
                return false;
            }

            int byteCount = segment.Count - offset;
            byte* basePtr = null;
            if (byteCount != 0)
            {
                byte[] array = segment.Array!;
                GCHandle handle;
                if (ReferenceEquals(array, previousArray))
                {
                    handle = previousHandle;
                }
                else
                {
                    handle = GCHandle.Alloc(array, GCHandleType.Pinned);
                    pinnedHandles[pinnedHandleCount] = handle;
                    pinnedHandleCount++;
                    previousArray = array;
                    previousHandle = handle;
                }

                basePtr = (byte*)handle.AddrOfPinnedObject() + segment.Offset + offset;
            }

            iovecs[i].Base = basePtr;
            iovecs[i].Count = (UIntPtr)byteCount;
            iovCount++;
        }
    }
    catch
    {
        // ValidateSegment (or a list indexer) threw: release everything pinned so far.
        Cleanup(pinnedHandles, iovecs, pinnedHandleCount);
        throw;
    }

    errorCode = SocketError.Success;
    return true;

    static void Cleanup(GCHandle[] handles, Interop.Sys.IOVector[] vectors, int count)
    {
        ReleasePinnedHandles(handles, count);
        ReturnIoUringArray(handles);
        ReturnIoUringArray(vectors);
    }
}
+
/// <summary>Prepares an SQE via the managed direct path. The base implementation reports
/// that direct submission is unsupported; subclasses override to build real SQEs.</summary>
protected virtual IoUringDirectPrepareResult IoUringPrepareDirect(
    SocketAsyncContext context,
    SocketAsyncEngine engine,
    out ulong userData)
{
    _ = context;
    _ = engine;
    userData = 0;
    return IoUringDirectPrepareResult.Unsupported;
}
+
/// <summary>Routes a CQE to the success or error handler based on the sign of the result
/// (negative CQE res values are -errno).</summary>
protected virtual bool ProcessIoUringCompletion(SocketAsyncContext context, int result, uint flags, uint auxiliaryData)
{
    if (result < 0)
    {
        return ProcessIoUringCompletionError(context, result, flags, auxiliaryData);
    }

    return ProcessIoUringCompletionSuccess(context, result, flags, auxiliaryData);
}
+
/// <summary>Processes a successful (non-negative) io_uring completion result. The base
/// implementation just records success; subclasses consume the transferred byte count.</summary>
protected virtual bool ProcessIoUringCompletionSuccess(SocketAsyncContext context, int result, uint flags, uint auxiliaryData)
{
    Debug.Assert(result >= 0, $"Expected non-negative io_uring result, got {result}");
    _ = context;
    _ = flags;
    _ = auxiliaryData;

    ErrorCode = SocketError.Success;
    return true;
}
+
/// <summary>Processes a failed (negative) io_uring completion result by mapping the
/// -errno payload to a <see cref="SocketError"/>. Always terminal in the base class.</summary>
protected virtual bool ProcessIoUringCompletionError(SocketAsyncContext context, int result, uint flags, uint auxiliaryData)
{
    Debug.Assert(result < 0, $"Expected negative io_uring result, got {result}");

    Interop.Error palError = GetIoUringPalError(result);
    ErrorCode = SocketPal.GetSocketErrorForErrorCode(palError);
    return true;
}
+
/// <summary>Whether preparation resources (pins, native headers) should be preserved when
/// the operation is requeued; false by default, overridden by concrete operations.</summary>
internal virtual bool ShouldReuseIoUringPreparationResourcesOnPending
{
    get { return false; }
}
+
/// <summary>Returns whether the (negative) result represents EAGAIN/EWOULDBLOCK, i.e. a
/// transient condition the caller should retry rather than surface.</summary>
protected static bool IsIoUringRetryableError(int result) =>
    result < 0 &&
    GetIoUringPalError(result) is Interop.Error.EAGAIN or Interop.Error.EWOULDBLOCK;
+
/// <summary>Converts a negative io_uring result to a <see cref="SocketError"/>.
/// Returns false (with Success) for retryable EAGAIN/EWOULDBLOCK results,
/// true (with the mapped error) for terminal failures.</summary>
protected static bool ProcessIoUringErrorResult(int result, out SocketError errorCode)
{
    Debug.Assert(result < 0, $"Expected negative io_uring result, got {result}");

    if (IsIoUringRetryableError(result))
    {
        // Transient: the operation will be retried, so no error is reported.
        errorCode = SocketError.Success;
        return false;
    }

    Interop.Error palError = GetIoUringPalError(result);
    errorCode = SocketPal.GetSocketErrorForErrorCode(palError);
    return true;
}
+
/// <summary>Converts a negative io_uring CQE result (raw -errno) to the PAL error space.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
protected static Interop.Error GetIoUringPalError(int result)
{
    Debug.Assert(result < 0, $"Expected negative io_uring result, got {result}");
    // CQE res carries -errno; negate back into platform errno space before PAL mapping.
    return Interop.Sys.ConvertErrorPlatformToPal(-result);
}
+
/// <summary>Returns the epoll event mask to use when falling back from io_uring to
/// readiness notification; None means the operation has no readiness equivalent.</summary>
internal virtual Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents()
{
    return Interop.Sys.SocketEvents.None;
}
+
/// <summary>
/// Copies payload bytes from a provided-buffer ring selection into the operation's target
/// memory. The base implementation declines (returns false): only operation shapes with a
/// materialization target override this.
/// </summary>
internal virtual unsafe bool TryProcessIoUringProvidedBufferCompletion(
    byte* providedBuffer,
    int providedBufferLength,
    int bytesTransferred,
    ref uint auxiliaryData)
{
    // Discards keep the parameters documented in the signature without analyzer noise.
    _ = providedBuffer;
    _ = providedBufferLength;
    _ = bytesTransferred;
    _ = auxiliaryData;
    return false;
}
+ }
+
internal abstract partial class ReadOperation
{
    /// <inheritdoc/>
    // Read-side errors map EAGAIN/EWOULDBLOCK to a retry (false) instead of a terminal error.
    protected override bool ProcessIoUringCompletionError(SocketAsyncContext context, int result, uint flags, uint auxiliaryData)
    {
        return ProcessIoUringErrorResult(result, out ErrorCode);
    }

    /// <inheritdoc/>
    // Defensive fallback only; regular completion mode does not use readiness notification.
    internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents()
    {
        return Interop.Sys.SocketEvents.Read;
    }
}
+
private abstract partial class WriteOperation
{
    /// <inheritdoc/>
    // Write-side errors map EAGAIN/EWOULDBLOCK to a retry (false) instead of a terminal error.
    protected override bool ProcessIoUringCompletionError(SocketAsyncContext context, int result, uint flags, uint auxiliaryData)
    {
        return ProcessIoUringErrorResult(result, out ErrorCode);
    }

    /// <inheritdoc/>
    // Defensive fallback only; regular completion mode does not use readiness notification.
    internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents()
    {
        return Interop.Sys.SocketEvents.Write;
    }
}
+
private abstract partial class SendOperation
{
    /// <inheritdoc/>
    // Advances Offset/Count by the completed byte count; a partial send returns false so
    // the operation is requeued for the remainder.
    protected override bool ProcessIoUringCompletionSuccess(SocketAsyncContext context, int result, uint flags, uint auxiliaryData)
    {
        ErrorCode = SocketError.Success;

        if (result == 0)
        {
            return true;
        }

        Debug.Assert(result > 0, $"Expected positive io_uring send completion size, got {result}");
        Debug.Assert(result <= Count, $"Unexpected io_uring send completion size: result={result}, count={Count}");

        // Clamp defensively even though the kernel should never report more than requested.
        int sent = Math.Min(result, Count);
        BytesTransferred += sent;
        Offset += sent;
        Count -= sent;

        return Count == 0;
    }
}
+
private partial class BufferMemorySendOperation
{
    // Lazily-allocated native msghdr used by the sendmsg-shaped path; freed in
    // ReleaseIoUringPreparationResourcesCore (idempotently, via Interlocked.Exchange).
    private IntPtr _ioUringMessageHeader;
    // Pin over SocketAddress memory while an in-flight sendmsg SQE may reference it.
    private MemoryHandle _ioUringPinnedSocketAddress;
    // 0/1 flag making socket-address pin release idempotent.
    private int _ioUringPinnedSocketAddressActive;

    /// <inheritdoc/>
    // Pins and the native header survive a requeue so a retried submission skips re-pinning.
    internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true;

    /// <inheritdoc/>
    protected override unsafe void ReleaseIoUringPreparationResourcesCore()
    {
        ReleaseIoUringSocketAddressAndMessageHeader(
            ref _ioUringPinnedSocketAddress,
            ref _ioUringPinnedSocketAddressActive,
            ref _ioUringMessageHeader);
    }

    /// <summary>Gets (allocating natively on first use) a message header and fills in the
    /// sendmsg fields that are common to every submission of this operation.</summary>
    private unsafe Interop.Sys.MessageHeader* GetOrCreateIoUringSendMessageHeader(byte* rawSocketAddress)
    {
        Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader;
        if (messageHeader is null)
        {
            // Native allocation: the kernel may read the header asynchronously after this
            // method returns, so it cannot live on the managed stack or movable heap.
            messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader));
            _ioUringMessageHeader = (IntPtr)messageHeader;
        }

        messageHeader->SocketAddress = rawSocketAddress;
        messageHeader->SocketAddressLen = SocketAddress.Length;
        // No ancillary data on the send path.
        messageHeader->ControlBuffer = null;
        messageHeader->ControlBufferLen = 0;
        messageHeader->Flags = SocketFlags.None;
        return messageHeader;
    }

    /// <summary>Configures a message header with zero or one iovec entry; a zero-length
    /// buffer gets an empty iovec list.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static unsafe void ConfigureSingleIov(
        Interop.Sys.MessageHeader* messageHeader,
        byte* rawBuffer,
        int bufferLength,
        Interop.Sys.IOVector* iov)
    {
        if (bufferLength == 0)
        {
            messageHeader->IOVectors = null;
            messageHeader->IOVectorCount = 0;
            return;
        }

        iov->Base = rawBuffer;
        iov->Count = (UIntPtr)bufferLength;
        messageHeader->IOVectors = iov;
        messageHeader->IOVectorCount = 1;
    }

    /// <summary>Builds a sendmsg-shaped preparation request (used when a destination socket
    /// address is present, i.e. SendTo-style sends).</summary>
    private unsafe IoUringDirectPrepareResult IoUringPrepareDirectSendMessage(
        SocketAsyncContext context,
        SocketAsyncEngine engine,
        out ulong userData)
    {
        userData = 0;
        if (!TryPinIoUringSocketAddressForPrepare(
            SocketAddress,
            ref _ioUringPinnedSocketAddress,
            ref _ioUringPinnedSocketAddressActive,
            out byte* rawSocketAddress))
        {
            // Non-pinnable address memory: fall back (ErrorCode already set to Success).
            return IoUringDirectPrepareResult.PrepareFailed;
        }

        if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer))
        {
            return IoUringDirectPrepareResult.PrepareFailed;
        }

        if (rawBuffer is not null)
        {
            // Honor the operation's current send offset (advances on partial sends).
            rawBuffer += Offset;
        }

        Interop.Sys.MessageHeader* messageHeader = GetOrCreateIoUringSendMessageHeader(rawSocketAddress);
        // NOTE(review): sendIov is stack-allocated but referenced from the native header;
        // this assumes the engine copies/consumes the iovec during the TryPrepare call
        // below — confirm against TryPrepareIoUringDirectSendMessageWithZeroCopyFallback.
        Interop.Sys.IOVector sendIov;
        ConfigureSingleIov(messageHeader, rawBuffer, Count, &sendIov);

        IoUringDirectPrepareResult sendMessagePrepareResult = engine.TryPrepareIoUringDirectSendMessageWithZeroCopyFallback(
            context._socket,
            messageHeader,
            Count,
            Flags,
            out userData,
            out SocketError sendMessageErrorCode);
        ErrorCode = sendMessageErrorCode;
        return sendMessagePrepareResult;
    }

    /// <inheritdoc/>
    // Connected sends (no socket address) use the plain send path with an optional
    // zero-copy upgrade; SendTo-style sends go through the sendmsg path above.
    protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
        SocketAsyncContext context,
        SocketAsyncEngine engine,
        out ulong userData)
    {
        userData = 0;
        if (SocketAddress.Length == 0)
        {
            if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer))
            {
                return IoUringDirectPrepareResult.PrepareFailed;
            }

            if (rawBuffer is not null)
            {
                rawBuffer += Offset;
            }

            IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectSendWithZeroCopyFallback(
                context._socket,
                rawBuffer,
                Count,
                Flags,
                out bool usedZeroCopy,
                out userData,
                out SocketError errorCode);
            ErrorCode = errorCode;
            if (usedZeroCopy && prepareResult == IoUringDirectPrepareResult.Prepared)
            {
                // Zero-copy sends keep referencing the buffer until the kernel's second
                // (notification) CQE; hand the pin to the engine so it outlives this
                // operation's normal release.
                engine.TransferIoUringZeroCopyPinHold(userData, TransferPinnedBuffer());
            }

            return prepareResult;
        }

        return IoUringPrepareDirectSendMessage(context, engine, out userData);
    }
}
+
private sealed partial class BufferListSendOperation
{
    // Rented GCHandle array pinning each distinct backing array of the buffer list.
    private GCHandle[]? _ioUringPinnedBufferHandles;
    // Rented iovec array mirroring the buffer list from the current send position.
    private Interop.Sys.IOVector[]? _ioUringIovecs;
    // Number of live handles in _ioUringPinnedBufferHandles (may be < iovec count when
    // adjacent segments share a backing array).
    private int _ioUringPinnedHandleCount;
    // Cache key for pin reuse: (-1 sentinels mean "nothing prepared").
    private int _ioUringPreparedBufferCount = -1;
    private int _ioUringPreparedStartIndex = -1;
    private int _ioUringPreparedStartOffset = -1;
    private int _ioUringPreparedIovCount;

    /// <inheritdoc/>
    // Pins survive a requeue so an unchanged buffer list is not re-pinned.
    internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true;

    /// <inheritdoc/>
    protected override void ReleaseIoUringPreparationResourcesCore()
    {
        ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
        // Reset the reuse cache so the next prepare re-pins from scratch.
        _ioUringPreparedBufferCount = -1;
        _ioUringPreparedStartIndex = -1;
        _ioUringPreparedStartOffset = -1;
        _ioUringPreparedIovCount = 0;
    }

    /// <summary>Pins buffer segments starting at (startIndex, startOffset) and builds the
    /// iovec array, reusing the previous pin set when the position is unchanged.</summary>
    private bool TryPinIoUringBuffers(
        IList<ArraySegment<byte>> buffers,
        int startIndex,
        int startOffset,
        out int iovCount)
    {
        // Fast path: identical list shape and position as the previous prepare.
        if (_ioUringPinnedBufferHandles is not null &&
            _ioUringIovecs is not null &&
            _ioUringPreparedBufferCount == buffers.Count &&
            _ioUringPreparedStartIndex == startIndex &&
            _ioUringPreparedStartOffset == startOffset &&
            _ioUringPreparedIovCount <= _ioUringIovecs.Length)
        {
            iovCount = _ioUringPreparedIovCount;
            return true;
        }

        // Release any existing pinned handles and rented arrays before creating new ones.
        // This handles the partial-send case where BufferIndex/Offset advanced, causing the
        // reuse check above to fail while old resources are still held.
        ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);

        if (!TryPinBufferListForIoUring(
            buffers,
            startIndex,
            startOffset,
            out GCHandle[] pinnedHandles,
            out Interop.Sys.IOVector[] iovecs,
            out iovCount,
            out int pinnedHandleCount,
            out SocketError errorCode))
        {
            ErrorCode = errorCode;
            return false;
        }

        _ioUringPinnedBufferHandles = pinnedHandles;
        _ioUringIovecs = iovecs;
        _ioUringPinnedHandleCount = pinnedHandleCount;
        _ioUringPreparedBufferCount = buffers.Count;
        _ioUringPreparedStartIndex = startIndex;
        _ioUringPreparedStartOffset = startOffset;
        _ioUringPreparedIovCount = iovCount;
        return true;
    }

    /// <summary>Advances BufferIndex/Offset past <paramref name="bytesSent"/> bytes after a
    /// (possibly partial) send; returns true when the whole list has been sent.</summary>
    private bool AdvanceSendBufferPosition(int bytesSent)
    {
        IList<ArraySegment<byte>>? buffers = Buffers;
        if (buffers is null || bytesSent <= 0)
        {
            // Nothing to advance; "complete" only if there is no list or we are past its end.
            return buffers is null || BufferIndex >= buffers.Count;
        }

        int remaining = bytesSent;
        int index = BufferIndex;
        int offset = Offset;

        while (remaining > 0 && index < buffers.Count)
        {
            int available = buffers[index].Count - offset;
            Debug.Assert(available >= 0, "Unexpected negative buffer availability during io_uring send completion.");

            if (available > remaining)
            {
                // Send ended mid-segment: stay on this segment at the advanced offset.
                offset += remaining;
                break;
            }

            // Max guards against a (asserted-impossible) negative availability.
            remaining -= Math.Max(available, 0);
            index++;
            offset = 0;
        }

        BufferIndex = index;
        Offset = offset;
        return index >= buffers.Count;
    }

    /// <inheritdoc/>
    // Builds a sendmsg SQE over the pinned iovec array for the current list position.
    protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
        SocketAsyncContext context,
        SocketAsyncEngine engine,
        out ulong userData)
    {
        userData = 0;
        // A send cannot coexist with an armed persistent multishot recv on this context;
        // request its cancellation first.
        if (context.IsPersistentMultishotRecvArmed())
        {
            context.RequestPersistentMultishotRecvCancel();
        }

        IList<ArraySegment<byte>>? buffers = Buffers;
        if (buffers is null)
        {
            // Success + PrepareFailed = silently fall back to the non-direct path.
            ErrorCode = SocketError.Success;
            return IoUringDirectPrepareResult.PrepareFailed;
        }

        if ((uint)BufferIndex > (uint)buffers.Count)
        {
            ErrorCode = SocketError.Success;
            return IoUringDirectPrepareResult.PrepareFailed;
        }

        if (!TryPinIoUringBuffers(buffers, BufferIndex, Offset, out int iovCount))
        {
            return IoUringDirectPrepareResult.PrepareFailed;
        }

        byte* rawSocketAddress = null;
        if (SocketAddress.Length != 0 && !TryPinIoUringBuffer(SocketAddress, out rawSocketAddress))
        {
            return IoUringDirectPrepareResult.PrepareFailed;
        }

        // Stack header is acceptable here: the engine consumes it inside the fixed scope
        // below before this frame unwinds.
        Interop.Sys.MessageHeader messageHeader;
        messageHeader.SocketAddress = rawSocketAddress;
        messageHeader.SocketAddressLen = SocketAddress.Length;
        messageHeader.ControlBuffer = null;
        messageHeader.ControlBufferLen = 0;
        messageHeader.Flags = SocketFlags.None;

        Interop.Sys.IOVector[] iovecs = _ioUringIovecs!;
        if (iovCount != 0)
        {
            fixed (Interop.Sys.IOVector* iovecsPtr = &iovecs[0])
            {
                messageHeader.IOVectors = iovecsPtr;
                messageHeader.IOVectorCount = iovCount;
                // Buffer-list sends can be many small segments (e.g. 4KB chunks). Use
                // aggregate payload size for zero-copy eligibility, not per-segment size.
                long totalPayloadBytes = 0;
                for (int i = 0; i < iovCount; i++)
                {
                    totalPayloadBytes += (long)(nuint)iovecs[i].Count;
                    if (totalPayloadBytes >= int.MaxValue)
                    {
                        // Saturate: only eligibility thresholds consume this value.
                        totalPayloadBytes = int.MaxValue;
                        break;
                    }
                }

                IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectSendMessageWithZeroCopyFallback(
                    context._socket,
                    &messageHeader,
                    (int)totalPayloadBytes,
                    Flags,
                    out userData,
                    out SocketError errorCode);
                ErrorCode = errorCode;
                return prepareResult;
            }
        }

        // Zero iovecs (all-empty list): submit an empty sendmsg to preserve send semantics.
        messageHeader.IOVectors = null;
        messageHeader.IOVectorCount = 0;
        IoUringDirectPrepareResult zeroIovPrepareResult = engine.TryPrepareIoUringDirectSendMessageWithZeroCopyFallback(
            context._socket,
            &messageHeader,
            payloadLength: 0,
            Flags,
            out userData,
            out SocketError zeroIovErrorCode);
        ErrorCode = zeroIovErrorCode;
        return zeroIovPrepareResult;
    }

    /// <inheritdoc/>
    // Advances the list position by the completed byte count; false requeues the remainder.
    protected override bool ProcessIoUringCompletionSuccess(SocketAsyncContext context, int result, uint flags, uint auxiliaryData)
    {
        if (result == 0)
        {
            ErrorCode = SocketError.Success;
            return true;
        }

        Debug.Assert(result > 0, $"Expected positive io_uring send completion size, got {result}");
        BytesTransferred += result;
        bool complete = AdvanceSendBufferPosition(result);
        ErrorCode = SocketError.Success;
        return complete;
    }
}
+
+ private sealed partial class BufferMemoryReceiveOperation
+ {
+ private IntPtr _ioUringMessageHeader;
+ private MemoryHandle _ioUringPinnedSocketAddress;
+ private int _ioUringPinnedSocketAddressActive;
+
+ ///
+ internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true;
+
+ ///
+ protected override unsafe void ReleaseIoUringPreparationResourcesCore()
+ {
+ ReleaseIoUringSocketAddressAndMessageHeader(
+ ref _ioUringPinnedSocketAddress,
+ ref _ioUringPinnedSocketAddressActive,
+ ref _ioUringMessageHeader);
+ }
+
+ /// Gets a message header buffer and sets the common recvmsg fields.
+ private unsafe Interop.Sys.MessageHeader* GetOrCreateIoUringReceiveMessageHeader(byte* rawSocketAddress)
+ {
+ Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader;
+ if (messageHeader is null)
+ {
+ messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader));
+ _ioUringMessageHeader = (IntPtr)messageHeader;
+ }
+
+ InitializeReceiveMessageHeader(messageHeader, rawSocketAddress);
+ return messageHeader;
+ }
+
+ /// Initializes recvmsg header fields shared by direct preparation variants.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe void InitializeReceiveMessageHeader(Interop.Sys.MessageHeader* messageHeader, byte* rawSocketAddress)
+ {
+ messageHeader->SocketAddress = rawSocketAddress;
+ messageHeader->SocketAddressLen = SocketAddress.Length;
+ messageHeader->ControlBuffer = null;
+ messageHeader->ControlBufferLen = 0;
+ messageHeader->Flags = SocketFlags.None;
+ }
+
+ /// Configures a message header with a single iovec entry.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe void ConfigureSingleIov(
+ Interop.Sys.MessageHeader* messageHeader,
+ byte* rawBuffer,
+ int bufferLength,
+ Interop.Sys.IOVector* iov)
+ {
+ // Keep a single iovec even for zero-length receives so recvmsg preserves
+ // completion-mode readiness probe behavior for zero-byte operations.
+ iov->Base = rawBuffer;
+ iov->Count = (UIntPtr)bufferLength;
+ messageHeader->IOVectors = iov;
+ messageHeader->IOVectorCount = 1;
+ }
+
+ /// Builds a connected or receive-from recvmsg operation.
+ private unsafe IoUringDirectPrepareResult IoUringPrepareDirectReceiveMessage(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ if (!TryPinIoUringSocketAddressForPrepare(
+ SocketAddress,
+ ref _ioUringPinnedSocketAddress,
+ ref _ioUringPinnedSocketAddressActive,
+ out byte* rawSocketAddress))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ Interop.Sys.MessageHeader* messageHeader = GetOrCreateIoUringReceiveMessageHeader(rawSocketAddress);
+ Interop.Sys.IOVector receiveIov;
+ ConfigureSingleIov(messageHeader, rawBuffer, Buffer.Length, &receiveIov);
+
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
+ context._socket,
+ messageHeader,
+ Flags,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
+ return prepareResult;
+ }
+
+ ///
+ /// Returns whether this operation shape is eligible for multishot recv submission.
+ /// Eligible: connected TCP receive (no socket address, no recvmsg flags) with non-empty buffer.
+ /// Ineligible: zero-byte probes, recvmsg-based receive paths (SetReceivedFlags/socket address).
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool IsEligibleForIoUringMultishotRecv()
+ {
+ if (SetReceivedFlags || SocketAddress.Length != 0)
+ {
+ return false;
+ }
+
+ return Buffer.Length != 0;
+ }
+
+ ///
+ protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ if (SetReceivedFlags || SocketAddress.Length != 0)
+ {
+ if (context.IsPersistentMultishotRecvArmed())
+ {
+ context.RequestPersistentMultishotRecvCancel();
+ }
+
+ SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.OneShot);
+ IoUringDirectPrepareResult receiveMessagePrepareResult =
+ IoUringPrepareDirectReceiveMessage(context, engine, out userData);
+ if (receiveMessagePrepareResult != IoUringDirectPrepareResult.Prepared || ErrorCode != SocketError.Success)
+ {
+ SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.None);
+ }
+
+ return receiveMessagePrepareResult;
+ }
+
+ bool allowMultishotRecv = IsEligibleForIoUringMultishotRecv() && engine.SupportsMultishotRecv;
+ if (!allowMultishotRecv && context.IsPersistentMultishotRecvArmed())
+ {
+ context.RequestPersistentMultishotRecvCancel();
+ }
+
+ SetIoUringReceiveSubmissionMode(
+ allowMultishotRecv ? IoUringReceiveSubmissionMode.Multishot : IoUringReceiveSubmissionMode.OneShot);
+
+ // Persistent multishot receive: if one is already armed, attach this operation to
+ // that existing user_data instead of submitting a new recv SQE.
+ if (allowMultishotRecv && context.IsPersistentMultishotRecvArmed())
+ {
+ ulong armedUserData = context.PersistentMultishotRecvUserData;
+ if (armedUserData != 0 &&
+ engine.TryReplaceIoUringTrackedOperation(armedUserData, this))
+ {
+ SocketsTelemetry.Log.IoUringPersistentMultishotRecvReuse();
+ userData = armedUserData;
+ ErrorCode = SocketError.Success;
+ return IoUringDirectPrepareResult.Prepared;
+ }
+
+ // Stale armed-state; clear and submit a fresh SQE below.
+ context.ClearPersistentMultishotRecvArmed();
+ }
+
+ if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer))
+ {
+ ErrorCode = SocketError.Success;
+ SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.None);
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectRecv(
+ context._socket,
+ rawBuffer,
+ Buffer.Length,
+ Flags,
+ allowMultishotRecv,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
+ if (allowMultishotRecv &&
+ prepareResult == IoUringDirectPrepareResult.Prepared &&
+ errorCode == SocketError.Success)
+ {
+ context.SetPersistentMultishotRecvArmed(userData);
+ }
+
+ if (prepareResult != IoUringDirectPrepareResult.Prepared || errorCode != SocketError.Success)
+ {
+ SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.None);
+ }
+
+ return prepareResult;
+ }
+
+ /// <inheritdoc/>
+ protected override bool ProcessIoUringCompletionSuccess(SocketAsyncContext context, int result, uint flags, uint auxiliaryData)
+ {
+ // result carries the byte count of the successful recv completion.
+ BytesTransferred = result;
+ // auxiliaryData carries the received-flags value only when the caller asked for it.
+ ReceivedFlags = SetReceivedFlags ? (SocketFlags)(int)auxiliaryData : SocketFlags.None;
+
+ if (SocketAddress.Length != 0)
+ {
+ // Clamp the completion-reported peer-address length into [0, SocketAddress.Length]
+ // so an out-of-range value cannot cause an invalid slice below.
+ int socketAddressLen = IoUringCompletionSocketAddressLen;
+ if (socketAddressLen < 0)
+ {
+ socketAddressLen = 0;
+ }
+
+ if ((uint)socketAddressLen > (uint)SocketAddress.Length)
+ {
+ socketAddressLen = SocketAddress.Length;
+ }
+
+ SocketAddress = SocketAddress.Slice(0, socketAddressLen);
+ }
+ ErrorCode = SocketError.Success;
+ return true;
+ }
+
+ /// <inheritdoc/>
+ /// <remarks>
+ /// Attempts to satisfy this receive directly from a kernel-provided buffer by copying
+ /// the received bytes into the user's buffer. Returns false when the operation needs
+ /// the non-provided-buffer path (flags or peer address requested, or lengths are
+ /// inconsistent); returns true when the completion was fully consumed here.
+ /// </remarks>
+ internal override unsafe bool TryProcessIoUringProvidedBufferCompletion(
+ byte* providedBuffer,
+ int providedBufferLength,
+ int bytesTransferred,
+ ref uint auxiliaryData)
+ {
+ _ = auxiliaryData;
+
+ // Nothing to copy for EOF/zero-byte completions; the provided buffer can be recycled.
+ if (bytesTransferred <= 0)
+ {
+ return true;
+ }
+
+ // Flags or peer-address reporting cannot be satisfied from a provided buffer.
+ if (SetReceivedFlags || SocketAddress.Length != 0)
+ {
+ return false;
+ }
+
+ // Reject inconsistent lengths rather than risk an out-of-bounds copy.
+ if ((uint)bytesTransferred > (uint)providedBufferLength ||
+ (uint)bytesTransferred > (uint)Buffer.Length)
+ {
+ return false;
+ }
+
+ // BUGFIX: restore the span's element type; 'new ReadOnlySpan(ptr, len)' does not compile.
+ new ReadOnlySpan<byte>(providedBuffer, bytesTransferred).CopyTo(Buffer.Span);
+ return true;
+ }
+ }
+
+ private sealed partial class BufferListReceiveOperation
+ {
+ // Pinned GC handles and the iovec array describing the pinned buffer-list segments.
+ private GCHandle[]? _ioUringPinnedBufferHandles;
+ private Interop.Sys.IOVector[]? _ioUringIovecs;
+ private int _ioUringPinnedHandleCount;
+ // Native msghdr storage, reused across resubmissions until resources are released.
+ private IntPtr _ioUringMessageHeader;
+ private int _ioUringPreparedIovCount;
+ // Buffer count the cached pin/iovec state was built for; -1 means no cached state.
+ private int _ioUringPreparedBufferCount = -1;
+
+ /// <inheritdoc/>
+ internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true;
+
+ /// <inheritdoc/>
+ protected override unsafe void ReleaseIoUringPreparationResourcesCore()
+ {
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
+ _ioUringPreparedIovCount = 0;
+ _ioUringPreparedBufferCount = -1;
+
+ // Interlocked.Exchange guards against a double NativeMemory.Free if release races.
+ IntPtr messageHeader = Interlocked.Exchange(ref _ioUringMessageHeader, IntPtr.Zero);
+ if (messageHeader != IntPtr.Zero)
+ {
+ NativeMemory.Free((void*)messageHeader);
+ }
+ }
+
+ /// <summary>Pins all buffer segments and builds the iovec array.</summary>
+ /// <remarks>
+ /// BUGFIX: restores the stripped generic type argument — the parameter type was
+ /// 'IList>' which does not compile; 'IList&lt;ArraySegment&lt;byte&gt;&gt;' matches the
+ /// Buffers field used by the readiness path. Reuses previously built pin/iovec
+ /// state when the buffer count matches.
+ /// </remarks>
+ private bool TryPinIoUringBuffers(IList<ArraySegment<byte>> buffers, out int iovCount)
+ {
+ // Fast path: cached state from a prior preparation of the same buffer-list shape.
+ if (_ioUringPinnedBufferHandles is not null &&
+ _ioUringIovecs is not null &&
+ _ioUringPreparedIovCount != 0 &&
+ _ioUringPreparedIovCount <= _ioUringIovecs.Length &&
+ _ioUringPreparedBufferCount == buffers.Count)
+ {
+ iovCount = _ioUringPreparedIovCount;
+ return true;
+ }
+
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
+
+ if (!TryPinBufferListForIoUring(
+ buffers,
+ startIndex: 0,
+ startOffset: 0,
+ out GCHandle[] pinnedHandles,
+ out Interop.Sys.IOVector[] iovecs,
+ out iovCount,
+ out int pinnedHandleCount,
+ out SocketError errorCode))
+ {
+ ErrorCode = errorCode;
+ return false;
+ }
+
+ _ioUringPinnedBufferHandles = pinnedHandles;
+ _ioUringIovecs = iovecs;
+ _ioUringPinnedHandleCount = pinnedHandleCount;
+ _ioUringPreparedIovCount = iovCount;
+ _ioUringPreparedBufferCount = buffers.Count;
+ return true;
+ }
+
+ /// <inheritdoc/>
+ protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ IList<ArraySegment<byte>>? buffers = Buffers;
+ if (buffers is null)
+ {
+ // No buffer list: report PrepareFailed with Success so the caller falls back.
+ ErrorCode = SocketError.Success;
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ if (!TryPinIoUringBuffers(buffers, out int iovCount))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ byte* rawSocketAddress = null;
+ if (SocketAddress.Length != 0 && !TryPinIoUringBuffer(SocketAddress, out rawSocketAddress))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ // Lazily allocate the native msghdr; it is reused across resubmissions and freed
+ // in ReleaseIoUringPreparationResourcesCore.
+ Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader;
+ if (messageHeader is null)
+ {
+ messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader));
+ _ioUringMessageHeader = (IntPtr)messageHeader;
+ }
+
+ messageHeader->SocketAddress = rawSocketAddress;
+ messageHeader->SocketAddressLen = SocketAddress.Length;
+ messageHeader->ControlBuffer = null;
+ messageHeader->ControlBufferLen = 0;
+ messageHeader->Flags = SocketFlags.None;
+
+ Interop.Sys.IOVector[] iovecs = _ioUringIovecs!;
+ if (iovCount != 0)
+ {
+ // NOTE(review): iovecsPtr is only valid for the duration of this fixed block,
+ // but recvmsg-style SQEs have the kernel read msg_iov after submission. This
+ // assumes TryPrepareIoUringDirectReceiveMessage copies the iovec/msghdr
+ // contents into ring-owned storage at prepare time — confirm.
+ fixed (Interop.Sys.IOVector* iovecsPtr = &iovecs[0])
+ {
+ messageHeader->IOVectors = iovecsPtr;
+ messageHeader->IOVectorCount = iovCount;
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
+ context._socket,
+ messageHeader,
+ Flags,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
+ return prepareResult;
+ }
+ }
+
+ // Zero iovecs (e.g. empty buffer list): submit a recvmsg with no data vectors.
+ messageHeader->IOVectors = null;
+ messageHeader->IOVectorCount = 0;
+ IoUringDirectPrepareResult zeroIovPrepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
+ context._socket,
+ messageHeader,
+ Flags,
+ out userData,
+ out SocketError zeroIovErrorCode);
+ ErrorCode = zeroIovErrorCode;
+ return zeroIovPrepareResult;
+ }
+
+ /// <inheritdoc/>
+ protected override unsafe bool ProcessIoUringCompletionSuccess(SocketAsyncContext context, int result, uint flags, uint auxiliaryData)
+ {
+ BytesTransferred = result;
+ ReceivedFlags = (SocketFlags)(int)auxiliaryData;
+ ErrorCode = SocketError.Success;
+
+ if (_ioUringMessageHeader != IntPtr.Zero && SocketAddress.Length != 0)
+ {
+ // Clamp the completion-reported peer-address length into [0, SocketAddress.Length].
+ int socketAddressLen = IoUringCompletionSocketAddressLen;
+ if (socketAddressLen < 0)
+ {
+ socketAddressLen = 0;
+ }
+
+ if ((uint)socketAddressLen > (uint)SocketAddress.Length)
+ {
+ socketAddressLen = SocketAddress.Length;
+ }
+
+ SocketAddress = SocketAddress.Slice(0, socketAddressLen);
+ }
+
+ return true;
+ }
+ }
+
+ private sealed partial class ReceiveMessageFromOperation
+ {
+ // Pinned GC handles and the iovec array for the buffer-list (multi-segment) shape.
+ private GCHandle[]? _ioUringPinnedBufferHandles;
+ private Interop.Sys.IOVector[]? _ioUringIovecs;
+ private int _ioUringPinnedHandleCount;
+ private int _ioUringPreparedIovCount;
+ // Buffer-list count the cached pin state was built for; -1 means no cached state.
+ private int _ioUringPreparedBufferListCount = -1;
+ // Native msghdr and control (cmsg) buffer, reused across resubmissions.
+ private IntPtr _ioUringMessageHeader;
+ private IntPtr _ioUringControlBuffer;
+ private int _ioUringControlBufferLength;
+ // Pinned peer-address memory; 'active' flag tracks whether the handle must be disposed.
+ private MemoryHandle _ioUringPinnedSocketAddress;
+ private int _ioUringPinnedSocketAddressActive;
+
+ /// <inheritdoc/>
+ internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true;
+
+ /// <inheritdoc/>
+ protected override unsafe void ReleaseIoUringPreparationResourcesCore()
+ {
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
+ _ioUringPreparedIovCount = 0;
+ _ioUringPreparedBufferListCount = -1;
+
+ // Interlocked.Exchange guards against a double NativeMemory.Free if release races.
+ IntPtr controlBuffer = Interlocked.Exchange(ref _ioUringControlBuffer, IntPtr.Zero);
+ if (controlBuffer != IntPtr.Zero)
+ {
+ NativeMemory.Free((void*)controlBuffer);
+ }
+ _ioUringControlBufferLength = 0;
+
+ ReleaseIoUringSocketAddressAndMessageHeader(
+ ref _ioUringPinnedSocketAddress,
+ ref _ioUringPinnedSocketAddressActive,
+ ref _ioUringMessageHeader);
+ }
+
+ /// <summary>Pins buffer segments and builds the iovec array for recvmsg.</summary>
+ /// <remarks>
+ /// BUGFIX: restores the stripped generic type argument — the parameter type was
+ /// 'IList>' which does not compile; 'IList&lt;ArraySegment&lt;byte&gt;&gt;' matches the
+ /// Buffers field. Reuses previously built pin/iovec state when the count matches.
+ /// </remarks>
+ private bool TryPinIoUringBuffers(IList<ArraySegment<byte>> buffers, out int iovCount)
+ {
+ // Fast path: cached state from a prior preparation of the same buffer-list shape.
+ if (_ioUringPinnedBufferHandles is not null &&
+ _ioUringIovecs is not null &&
+ _ioUringPreparedIovCount <= _ioUringIovecs.Length &&
+ _ioUringPreparedBufferListCount == buffers.Count)
+ {
+ iovCount = _ioUringPreparedIovCount;
+ return true;
+ }
+
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
+
+ if (!TryPinBufferListForIoUring(
+ buffers,
+ startIndex: 0,
+ startOffset: 0,
+ out GCHandle[] pinnedHandles,
+ out Interop.Sys.IOVector[] iovecs,
+ out iovCount,
+ out int pinnedHandleCount,
+ out SocketError errorCode))
+ {
+ ErrorCode = errorCode;
+ return false;
+ }
+
+ _ioUringPinnedBufferHandles = pinnedHandles;
+ _ioUringIovecs = iovecs;
+ _ioUringPinnedHandleCount = pinnedHandleCount;
+ _ioUringPreparedIovCount = iovCount;
+ _ioUringPreparedBufferListCount = buffers.Count;
+ return true;
+ }
+
+ /// <inheritdoc/>
+ protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ // recvmsg is incompatible with a persistent multishot recv on this context.
+ if (context.IsPersistentMultishotRecvArmed())
+ {
+ context.RequestPersistentMultishotRecvCancel();
+ }
+
+ IList<ArraySegment<byte>>? buffers = Buffers;
+ byte* rawBuffer = null;
+ int iovCount;
+ if (buffers is not null)
+ {
+ // Switching from single-buffer to buffer-list shape: drop the old pin first.
+ ReleaseIoUringPinnedBufferForShapeTransition();
+ if (!TryPinIoUringBuffers(buffers, out iovCount))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+ }
+ else
+ {
+ if (!TryPinIoUringBuffer(Buffer, out rawBuffer))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ // Switching from buffer-list to single-buffer shape: drop stale list state.
+ if (_ioUringPinnedBufferHandles is not null || _ioUringIovecs is not null)
+ {
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
+ _ioUringPreparedIovCount = 0;
+ _ioUringPreparedBufferListCount = -1;
+ }
+
+ iovCount = 1;
+ }
+
+ if (!TryPinIoUringSocketAddressForPrepare(
+ SocketAddress,
+ ref _ioUringPinnedSocketAddress,
+ ref _ioUringPinnedSocketAddressActive,
+ out byte* rawSocketAddress))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ // Lazily allocate the native msghdr; reused across resubmissions.
+ Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader;
+ if (messageHeader is null)
+ {
+ messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader));
+ _ioUringMessageHeader = (IntPtr)messageHeader;
+ }
+
+ messageHeader->SocketAddress = rawSocketAddress;
+ messageHeader->SocketAddressLen = SocketAddress.Length;
+ messageHeader->Flags = SocketFlags.None;
+
+ // Size the control (cmsg) buffer for packet-info ancillary data; negative means failure.
+ int controlBufferLen = Interop.Sys.GetControlMessageBufferSize(Convert.ToInt32(IsIPv4), Convert.ToInt32(IsIPv6));
+ if (controlBufferLen < 0)
+ {
+ ErrorCode = SocketError.Success;
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ if (controlBufferLen != 0)
+ {
+ // (Re)allocate the native control buffer only when missing or wrongly sized.
+ if (_ioUringControlBuffer == IntPtr.Zero || _ioUringControlBufferLength != controlBufferLen)
+ {
+ IntPtr controlBuffer = Interlocked.Exchange(ref _ioUringControlBuffer, IntPtr.Zero);
+ if (controlBuffer != IntPtr.Zero)
+ {
+ NativeMemory.Free((void*)controlBuffer);
+ }
+
+ void* rawControlBuffer = NativeMemory.Alloc((nuint)controlBufferLen);
+ _ioUringControlBuffer = (IntPtr)rawControlBuffer;
+ _ioUringControlBufferLength = controlBufferLen;
+ }
+
+ messageHeader->ControlBuffer = (byte*)_ioUringControlBuffer;
+ messageHeader->ControlBufferLen = controlBufferLen;
+ }
+ else
+ {
+ IntPtr controlBuffer = Interlocked.Exchange(ref _ioUringControlBuffer, IntPtr.Zero);
+ if (controlBuffer != IntPtr.Zero)
+ {
+ NativeMemory.Free((void*)controlBuffer);
+ }
+
+ _ioUringControlBufferLength = 0;
+ messageHeader->ControlBuffer = null;
+ messageHeader->ControlBufferLen = 0;
+ }
+
+ if (buffers is not null)
+ {
+ Interop.Sys.IOVector[] iovecs = _ioUringIovecs!;
+ if (iovCount != 0)
+ {
+ // NOTE(review): iovecsPtr is only valid inside this fixed block, but the
+ // kernel reads msg_iov after submission; this assumes the engine copies
+ // the iovec/msghdr contents at prepare time — confirm.
+ fixed (Interop.Sys.IOVector* iovecsPtr = &iovecs[0])
+ {
+ messageHeader->IOVectors = iovecsPtr;
+ messageHeader->IOVectorCount = iovCount;
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
+ context._socket,
+ messageHeader,
+ Flags,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
+ return prepareResult;
+ }
+ }
+
+ messageHeader->IOVectors = null;
+ messageHeader->IOVectorCount = 0;
+ IoUringDirectPrepareResult zeroIovPrepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
+ context._socket,
+ messageHeader,
+ Flags,
+ out userData,
+ out SocketError zeroIovErrorCode);
+ ErrorCode = zeroIovErrorCode;
+ return zeroIovPrepareResult;
+ }
+
+ // Single-buffer shape: describe the pinned buffer with one iovec.
+ // NOTE(review): 'iov' is a stack local whose address is stored into the native
+ // msghdr used by an asynchronous submission; this assumes the engine copies the
+ // iovec out before this frame is popped — confirm.
+ Interop.Sys.IOVector iov;
+ iov.Base = rawBuffer;
+ iov.Count = (UIntPtr)Buffer.Length;
+ messageHeader->IOVectors = &iov;
+ messageHeader->IOVectorCount = 1;
+ IoUringDirectPrepareResult singleBufferPrepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
+ context._socket,
+ messageHeader,
+ Flags,
+ out userData,
+ out SocketError singleBufferErrorCode);
+ ErrorCode = singleBufferErrorCode;
+ return singleBufferPrepareResult;
+ }
+
+ /// <inheritdoc/>
+ protected override unsafe bool ProcessIoUringCompletionSuccess(SocketAsyncContext context, int result, uint flags, uint auxiliaryData)
+ {
+ BytesTransferred = result;
+ ReceivedFlags = (SocketFlags)(int)auxiliaryData;
+ ErrorCode = SocketError.Success;
+ IPPacketInformation = default;
+
+ if (_ioUringMessageHeader != IntPtr.Zero)
+ {
+ Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader;
+ // Clamp the completion-reported peer-address length into [0, capacity].
+ int socketAddressCapacity = SocketAddress.Length;
+ int socketAddressLen = IoUringCompletionSocketAddressLen;
+ if (socketAddressLen < 0)
+ {
+ socketAddressLen = 0;
+ }
+
+ if ((uint)socketAddressLen > (uint)socketAddressCapacity)
+ {
+ socketAddressLen = socketAddressCapacity;
+ }
+
+ // No address reported but one was requested: hand back a zeroed full-size address.
+ if (socketAddressLen == 0 && socketAddressCapacity != 0)
+ {
+ socketAddressLen = socketAddressCapacity;
+ SocketAddress.Span.Clear();
+ }
+
+ // Clamp the reported control (cmsg) length into [0, allocated capacity].
+ int controlBufferCapacity = messageHeader->ControlBufferLen;
+ int controlBufferLen = IoUringCompletionControlBufferLen;
+ if (controlBufferLen < 0)
+ {
+ controlBufferLen = 0;
+ }
+
+ if ((uint)controlBufferLen > (uint)controlBufferCapacity)
+ {
+ controlBufferLen = controlBufferCapacity;
+ }
+
+ // Write the clamped lengths/flags back so cmsg parsing sees consistent values.
+ messageHeader->SocketAddressLen = socketAddressLen;
+ messageHeader->ControlBufferLen = controlBufferLen;
+ messageHeader->Flags = ReceivedFlags;
+
+ SocketAddress = SocketAddress.Slice(0, socketAddressLen);
+
+ IPPacketInformation = SocketPal.GetIoUringIPPacketInformation(messageHeader, IsIPv4, IsIPv6);
+ }
+
+ return true;
+ }
+
+ /// <inheritdoc/>
+ protected override bool ProcessIoUringCompletionError(SocketAsyncContext context, int result, uint flags, uint auxiliaryData)
+ {
+ if (!ProcessIoUringErrorResult(result, out ErrorCode))
+ {
+ return false;
+ }
+
+ // No packet information is available on a failed recvmsg.
+ IPPacketInformation = default;
+ return true;
+ }
+ }
+
+ internal sealed partial class AcceptOperation
+ {
+ /// <inheritdoc/>
+ internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() =>
+ Interop.Sys.SocketEvents.Read;
+
+ /// <inheritdoc/>
+ /// <remarks>
+ /// Prefers a multishot accept SQE when the engine supports it and this context has
+ /// not armed one yet; otherwise submits a one-shot accept SQE.
+ /// </remarks>
+ protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ AcceptSocketAddressLength = SocketAddress.Length;
+ if (!TryPinIoUringBuffer(SocketAddress, out byte* rawSocketAddress))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ // CAS 0 -> 2 claims the "arming in progress" slot; only the winner tries multishot.
+ if (engine.SupportsMultishotAccept &&
+ Interlocked.CompareExchange(ref context._multishotAcceptArmed, 2, 0) == 0)
+ {
+ context.EnsureMultishotAcceptQueueInitialized();
+ IoUringDirectPrepareResult multishotPrepareResult = engine.TryPrepareIoUringDirectMultishotAccept(
+ context._socket,
+ rawSocketAddress,
+ SocketAddress.Length,
+ out userData,
+ out SocketError multishotErrorCode);
+ if (multishotPrepareResult == IoUringDirectPrepareResult.Prepared)
+ {
+ // Publish the user_data before marking fully armed (1) so observers of the
+ // armed flag always see a valid user_data.
+ Volatile.Write(ref context._multishotAcceptUserData, userData);
+ Volatile.Write(ref context._multishotAcceptArmed, 1);
+ ErrorCode = multishotErrorCode;
+ return multishotPrepareResult;
+ }
+
+ // Multishot arming failed: release the claim and fall through to one-shot.
+ context.DisarmMultishotAccept();
+ }
+
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectAccept(
+ context._socket,
+ rawSocketAddress,
+ SocketAddress.Length,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
+ return prepareResult;
+ }
+
+ /// <inheritdoc/>
+ protected override bool ProcessIoUringCompletionSuccess(SocketAsyncContext context, int result, uint flags, uint auxiliaryData)
+ {
+ // result is the accepted file descriptor; auxiliaryData is the reported address length.
+ AcceptedFileDescriptor = (IntPtr)result;
+ ErrorCode = SocketError.Success;
+ // Keep parity with readiness path: always honor reported address length, including 0.
+ AcceptSocketAddressLength = auxiliaryData > (uint)SocketAddress.Length ? SocketAddress.Length : (int)auxiliaryData;
+ SocketAddress = SocketAddress.Slice(0, AcceptSocketAddressLength);
+ return true;
+ }
+
+ /// <inheritdoc/>
+ protected override bool ProcessIoUringCompletionError(SocketAsyncContext context, int result, uint flags, uint auxiliaryData)
+ {
+ // Match the failed-accept convention: no file descriptor on error.
+ AcceptedFileDescriptor = (IntPtr)(-1);
+ return base.ProcessIoUringCompletionError(context, result, flags, auxiliaryData);
+ }
+ }
+
+ private sealed partial class ConnectOperation
+ {
+ /// <inheritdoc/>
+ internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() =>
+ Interop.Sys.SocketEvents.Write;
+
+ /// <inheritdoc/>
+ protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ if (!TryPinIoUringBuffer(SocketAddress, out byte* rawSocketAddress))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectConnect(
+ context._socket,
+ rawSocketAddress,
+ SocketAddress.Length,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
+ return prepareResult;
+ }
+
+ /// <inheritdoc/>
+ protected override bool ProcessIoUringCompletionError(SocketAsyncContext context, int result, uint flags, uint auxiliaryData)
+ {
+ Interop.Error error = GetIoUringPalError(result);
+ if (error == Interop.Error.EINPROGRESS)
+ {
+ // Connect is still in flight; this is not a terminal completion.
+ ErrorCode = SocketError.Success;
+ return false;
+ }
+
+ if (!base.ProcessIoUringCompletionError(context, result, flags, auxiliaryData))
+ {
+ return false;
+ }
+
+ // Record the terminal connect failure on the socket, matching the readiness path.
+ context._socket.RegisterConnectResult(ErrorCode);
+ return true;
+ }
+
+ /// <inheritdoc/>
+ /// <remarks>
+ /// On successful connect, any data supplied with Connect is flushed via a follow-up
+ /// SendToAsync; callback ownership transfers to that send when it goes async.
+ /// </remarks>
+ protected override bool ProcessIoUringCompletionSuccess(SocketAsyncContext context, int result, uint flags, uint auxiliaryData)
+ {
+ ErrorCode = SocketError.Success;
+ context._socket.RegisterConnectResult(ErrorCode);
+
+ if (Buffer.Length > 0)
+ {
+ // BUGFIX: restore the callback delegate's stripped type arguments —
+ // 'Action, SocketFlags, SocketError>?' does not compile; the SendOperation
+ // callback shape is (bytesTransferred, socketAddress, receivedFlags, errorCode).
+ Action<int, Memory<byte>, SocketFlags, SocketError>? callback = Callback;
+ Debug.Assert(callback is not null);
+ SocketError error = context.SendToAsync(Buffer, 0, Buffer.Length, SocketFlags.None, default, ref BytesTransferred, callback!, default);
+ if (error == SocketError.IOPending)
+ {
+ // Callback ownership moved to the async send operation.
+ Callback = null;
+ Buffer = default;
+ }
+ else
+ {
+ if (error != SocketError.Success)
+ {
+ ErrorCode = error;
+ context._socket.RegisterConnectResult(ErrorCode);
+ }
+
+ // Follow-up send completed synchronously (success/error), so invoke
+ // Connect callback from this operation path.
+ Buffer = default;
+ }
+ }
+
+ return true;
+ }
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs
index 4e2e117984084c..94d6838e1a890d 100644
--- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs
@@ -43,10 +43,10 @@ internal sealed partial class SocketAsyncContext
private BufferListReceiveOperation? _cachedBufferListReceiveOperation;
private BufferMemorySendOperation? _cachedBufferMemorySendOperation;
private BufferListSendOperation? _cachedBufferListSendOperation;
-
private void ReturnOperation(AcceptOperation operation)
{
operation.Reset();
+ operation.AcceptSocketAddressLength = 0;
operation.Callback = null;
operation.SocketAddress = default;
Volatile.Write(ref _cachedAcceptOperation, operation); // benign race condition
@@ -83,6 +83,7 @@ private void ReturnOperation(BufferListSendOperation operation)
{
operation.Reset();
operation.Buffers = null;
+ operation.SetBufferPosition(bufferIndex: 0, offset: 0);
operation.Callback = null;
operation.SocketAddress = default;
Volatile.Write(ref _cachedBufferListSendOperation, operation); // benign race condition
@@ -108,7 +109,14 @@ private BufferListSendOperation RentBufferListSendOperation() =>
Interlocked.Exchange(ref _cachedBufferListSendOperation, null) ??
new BufferListSendOperation(this);
- private abstract class AsyncOperation : IThreadPoolWorkItem
+ // Partial method hooks for io_uring completion-mode staging (Linux-only).
+ // No-op on non-Linux; implemented in SocketAsyncContext.IoUring.Linux.cs.
+ static partial void LinuxTryStageIoUringOperation(AsyncOperation operation);
+ partial void LinuxTryDequeuePreAcceptedConnection(AcceptOperation operation, ref bool dequeued);
+ partial void LinuxTryConsumeBufferedPersistentMultishotRecvData(Memory destination, ref bool consumed, ref int bytesTransferred);
+ partial void LinuxOnStopAndAbort();
+
+ internal abstract partial class AsyncOperation : IThreadPoolWorkItem
{
private enum State
{
@@ -141,6 +149,7 @@ public AsyncOperation(SocketAsyncContext context)
public void Reset()
{
+ ResetIoUringState();
_state = State.Waiting;
Event = null;
Next = this;
@@ -202,6 +211,16 @@ public OperationResult TryComplete(SocketAsyncContext context)
}
public bool TryCancel()
+ {
+ return TryCancelCore(requestIoUringCancellation: true);
+ }
+
+ /// <summary>
+ /// Cancels without requesting kernel-side io_uring cancellation; used on teardown
+ /// paths where the ring's pending work is being discarded anyway.
+ /// </summary>
+ internal bool TryCancelForTeardown()
+ {
+ return TryCancelCore(requestIoUringCancellation: false);
+ }
+
+ private bool TryCancelCore(bool requestIoUringCancellation)
{
Trace("Enter");
@@ -232,6 +251,9 @@ public bool TryCancel()
return false;
}
+ // Best effort: if completion-mode io_uring work was already submitted, request kernel-side cancellation now.
+ // Partial method: no-op on non-Linux; implemented in SocketAsyncContext.IoUring.Linux.cs.
+ LinuxRequestIoUringCancellationIfNeeded(requestIoUringCancellation);
ProcessCancellation();
// Note, we leave the operation in the OperationQueue.
@@ -245,6 +267,7 @@ public void ProcessCancellation()
Debug.Assert(_state == State.Canceled);
+ LinuxUntrackIoUringOperation();
ErrorCode = SocketError.OperationAborted;
ManualResetEventSlim? e = Event;
@@ -305,17 +328,29 @@ void IThreadPoolWorkItem.Execute()
// We could also add an abstract method that the base interface implementation
// invokes, but that adds an extra virtual dispatch.
Debug.Fail("Expected derived type to implement IThreadPoolWorkItem");
- throw new InvalidOperationException();
+ ThrowExpectedDerivedTypeToImplementThreadPoolWorkItem();
}
+ // Cold-path throw helper keeps the calling method small; never returns.
+ [DoesNotReturn]
+ [StackTraceHidden]
+ private static void ThrowExpectedDerivedTypeToImplementThreadPoolWorkItem() =>
+ throw new InvalidOperationException();
+
// Called when op is not in the queue yet, so can't be otherwise executing
public void DoAbort()
{
+ LinuxUntrackIoUringOperation();
ErrorCode = SocketError.OperationAborted;
}
protected abstract bool DoTryComplete(SocketAsyncContext context);
+ partial void ResetIoUringState();
+ partial void LinuxRequestIoUringCancellationIfNeeded(bool requestIoUringCancellation);
+ partial void LinuxUntrackIoUringOperation();
+
+ internal virtual bool ShouldDispatchCallback => true;
+
public abstract void InvokeCallback(bool allowPooling);
[Conditional("SOCKETASYNCCONTEXT_TRACE")]
@@ -333,21 +368,21 @@ public void TraceWithContext(SocketAsyncContext context, string message, [Caller
// These two abstract classes differentiate the operations that go in the
// read queue vs the ones that go in the write queue.
- private abstract class ReadOperation : AsyncOperation, IThreadPoolWorkItem
+ internal abstract partial class ReadOperation : AsyncOperation, IThreadPoolWorkItem
{
public ReadOperation(SocketAsyncContext context) : base(context) { }
void IThreadPoolWorkItem.Execute() => AssociatedContext.ProcessAsyncReadOperation(this);
}
- private abstract class WriteOperation : AsyncOperation, IThreadPoolWorkItem
+ private abstract partial class WriteOperation : AsyncOperation, IThreadPoolWorkItem
{
public WriteOperation(SocketAsyncContext context) : base(context) { }
void IThreadPoolWorkItem.Execute() => AssociatedContext.ProcessAsyncWriteOperation(this);
}
- private abstract class SendOperation : WriteOperation
+ private abstract partial class SendOperation : WriteOperation
{
public SocketFlags Flags;
public int BytesTransferred;
@@ -360,9 +395,10 @@ public SendOperation(SocketAsyncContext context) : base(context) { }
public override void InvokeCallback(bool allowPooling) =>
Callback!(BytesTransferred, SocketAddress, SocketFlags.None, ErrorCode);
+
}
- private class BufferMemorySendOperation : SendOperation
+ private partial class BufferMemorySendOperation : SendOperation
{
public Memory Buffer;
@@ -390,7 +426,7 @@ public override void InvokeCallback(bool allowPooling)
}
}
- private sealed class BufferListSendOperation : SendOperation
+ private sealed partial class BufferListSendOperation : SendOperation
{
public IList>? Buffers;
public int BufferIndex;
@@ -402,6 +438,12 @@ protected override bool DoTryComplete(SocketAsyncContext context)
return SocketPal.TryCompleteSendTo(context._socket, default(ReadOnlySpan), Buffers, ref BufferIndex, ref Offset, ref Count, Flags, SocketAddress.Span, ref BytesTransferred, out ErrorCode);
}
+ /// <summary>Sets the partial-send cursor (buffer index and intra-buffer offset).</summary>
+ internal void SetBufferPosition(int bufferIndex, int offset)
+ {
+ BufferIndex = bufferIndex;
+ Offset = offset;
+ }
public override void InvokeCallback(bool allowPooling)
{
var cb = Callback!;
@@ -446,7 +488,7 @@ public override void InvokeCallback(bool allowPooling) =>
Callback!(BytesTransferred, SocketAddress, ReceivedFlags, ErrorCode);
}
- private sealed class BufferMemoryReceiveOperation : ReceiveOperation
+ private sealed partial class BufferMemoryReceiveOperation : ReceiveOperation
{
public Memory Buffer;
public bool SetReceivedFlags;
@@ -455,6 +497,19 @@ public BufferMemoryReceiveOperation(SocketAsyncContext context) : base(context)
protected override bool DoTryComplete(SocketAsyncContext context)
{
+ bool consumedBufferedData = false;
+ int bufferedBytes = 0;
+ context.LinuxTryConsumeBufferedPersistentMultishotRecvData(Buffer, ref consumedBufferedData, ref bufferedBytes);
+ if (!SetReceivedFlags &&
+ SocketAddress.Length == 0 &&
+ consumedBufferedData)
+ {
+ BytesTransferred = bufferedBytes;
+ ReceivedFlags = SocketFlags.None;
+ ErrorCode = SocketError.Success;
+ return true;
+ }
+
// Zero byte read is performed to know when data is available.
// We don't have to call receive, our caller is interested in the event.
if (Buffer.Length == 0 && Flags == SocketFlags.None && SocketAddress.Length == 0)
@@ -502,7 +557,7 @@ public override void InvokeCallback(bool allowPooling)
}
}
- private sealed class BufferListReceiveOperation : ReceiveOperation
+ private sealed partial class BufferListReceiveOperation : ReceiveOperation
{
public IList>? Buffers;
@@ -553,7 +608,7 @@ protected override bool DoTryComplete(SocketAsyncContext context)
}
}
- private sealed class ReceiveMessageFromOperation : ReadOperation
+ private sealed partial class ReceiveMessageFromOperation : ReadOperation
{
public Memory Buffer;
public SocketFlags Flags;
@@ -613,9 +668,10 @@ public override void InvokeCallback(bool allowPooling) =>
Callback!(BytesTransferred, SocketAddress, ReceivedFlags, IPPacketInformation, ErrorCode);
}
- private sealed class AcceptOperation : ReadOperation
+ internal sealed partial class AcceptOperation : ReadOperation
{
public IntPtr AcceptedFileDescriptor;
+ public int AcceptSocketAddressLength;
public AcceptOperation(SocketAsyncContext context) : base(context) { }
@@ -623,11 +679,19 @@ public AcceptOperation(SocketAsyncContext context) : base(context) { }
protected override bool DoTryComplete(SocketAsyncContext context)
{
+ bool dequeuedPreAcceptedConnection = false;
+ context.LinuxTryDequeuePreAcceptedConnection(this, ref dequeuedPreAcceptedConnection);
+ if (dequeuedPreAcceptedConnection)
+ {
+ return true;
+ }
+
bool completed = SocketPal.TryCompleteAccept(context._socket, SocketAddress, out int socketAddressLen, out AcceptedFileDescriptor, out ErrorCode);
+ AcceptSocketAddressLength = socketAddressLen;
Debug.Assert(ErrorCode == SocketError.Success || AcceptedFileDescriptor == (IntPtr)(-1), $"Unexpected values: ErrorCode={ErrorCode}, AcceptedFileDescriptor={AcceptedFileDescriptor}");
if (ErrorCode == SocketError.Success)
{
- SocketAddress = SocketAddress.Slice(0, socketAddressLen);
+ SocketAddress = SocketAddress.Slice(0, AcceptSocketAddressLength);
}
return completed;
}
@@ -648,7 +712,7 @@ public override void InvokeCallback(bool allowPooling)
}
}
- private sealed class ConnectOperation : BufferMemorySendOperation
+ private sealed partial class ConnectOperation : BufferMemorySendOperation
{
public ConnectOperation(SocketAsyncContext context) : base(context) { }
@@ -659,28 +723,47 @@ protected override bool DoTryComplete(SocketAsyncContext context)
if (result && ErrorCode == SocketError.Success && Buffer.Length > 0)
{
- SocketError error = context.SendToAsync(Buffer, 0, Buffer.Length, SocketFlags.None, Memory.Empty, ref BytesTransferred, Callback!, default);
- if (error != SocketError.Success && error != SocketError.IOPending)
+ Action, SocketFlags, SocketError>? callback = Callback;
+ Debug.Assert(callback != null);
+ SocketError error = context.SendToAsync(Buffer, 0, Buffer.Length, SocketFlags.None, Memory.Empty, ref BytesTransferred, callback!, default);
+ if (error == SocketError.IOPending)
{
- context._socket.RegisterConnectResult(ErrorCode);
+ // Callback ownership moved to the async send operation.
+ Callback = null;
+ Buffer = default;
+ }
+ else
+ {
+ if (error != SocketError.Success)
+ {
+ ErrorCode = error;
+ context._socket.RegisterConnectResult(ErrorCode);
+ }
+
+ // Follow-up send completed synchronously (success/error), so invoke
+ // Connect callback from this operation path.
+ Buffer = default;
}
}
return result;
}
+ internal override bool ShouldDispatchCallback => Buffer.Length == 0 && Callback is not null;
+
public override void InvokeCallback(bool allowPooling)
{
- var cb = Callback!;
+ Action, SocketFlags, SocketError>? cb = Callback;
int bt = BytesTransferred;
Memory sa = SocketAddress;
SocketError ec = ErrorCode;
Memory buffer = Buffer;
- if (buffer.Length == 0)
+ if (cb != null && (buffer.Length == 0 || ec == SocketError.OperationAborted))
{
// Invoke callback only when we are completely done.
// In case data were provided for Connect we may or may not send them all.
- // If we did not we will need follow-up with Send operation
+ // If we did not we will need follow-up with Send operation.
+ // On cancellation, always invoke — the send was never started.
cb(bt, sa, SocketFlags.None, ec);
}
}
@@ -890,6 +973,9 @@ public bool StartAsyncOperation(SocketAsyncContext context, TOperation operation
operation.CancellationRegistration = cancellationToken.UnsafeRegister(s => ((TOperation)s!).TryCancel(), operation);
}
+ // Completion-mode staging: partial method is no-op on non-Linux.
+ LinuxTryStageIoUringOperation(operation);
+
return true;
case QueueState.Stopped:
@@ -898,7 +984,7 @@ public bool StartAsyncOperation(SocketAsyncContext context, TOperation operation
break;
default:
- Environment.FailFast("unexpected queue state");
+ FailFastUnexpectedQueueState(_state);
break;
}
}
@@ -939,7 +1025,7 @@ static void HandleFailedRegistration(SocketAsyncContext context, TOperation oper
}
else
{
- throw new InternalException(error);
+ ThrowInternalException(error);
}
}
}
@@ -986,7 +1072,7 @@ static void HandleFailedRegistration(SocketAsyncContext context, TOperation oper
return null;
default:
- Environment.FailFast("unexpected queue state");
+ FailFastUnexpectedQueueState(_state);
return null;
}
}
@@ -1022,7 +1108,10 @@ internal void ProcessAsyncOperation(TOperation op)
// request for a previous operation could affect a subsequent one)
// and here we know the operation has completed.
op.CancellationRegistration.Dispose();
- op.InvokeCallback(allowPooling: true);
+ if (op.ShouldDispatchCallback)
+ {
+ op.InvokeCallback(allowPooling: true);
+ }
}
}
@@ -1129,6 +1218,59 @@ public OperationResult ProcessQueuedOperation(TOperation op)
return result;
}
            /// <summary>
            /// Removes <paramref name="operation"/> from the queue after it completed out of band
            /// (io_uring completion-mode path), without running the normal dequeue/dispatch flow.
            /// Returns false if the queue is empty, stopped, or the operation is no longer queued.
            /// </summary>
            public bool TryRemoveCompletedOperation(SocketAsyncContext context, TOperation operation)
            {
                using (Lock())
                {
                    if (_tail == null || _state == QueueState.Stopped)
                    {
                        return false;
                    }

                    // The queue is circular: _tail is the last element and _tail.Next is the head.
                    // Walk from the head looking for the target; wrapping back to _tail without a
                    // match means the operation is not in the queue.
                    AsyncOperation? previous = _tail;
                    AsyncOperation? current = _tail.Next;
                    while (!ReferenceEquals(current, operation))
                    {
                        if (ReferenceEquals(current, _tail))
                        {
                            return false;
                        }

                        previous = current;
                        current = current!.Next;
                    }

                    Debug.Assert(previous != null && current != null);
                    bool removedHead = ReferenceEquals(current, _tail.Next);
                    bool removedTail = ReferenceEquals(current, _tail);

                    if (removedHead && removedTail)
                    {
                        // Sole element: reset the queue to its empty/Ready state. Bumping the
                        // sequence number invalidates any observed sequence numbers held by racers.
                        _tail = null;
                        _isNextOperationSynchronous = false;
                        _state = QueueState.Ready;
                        _sequenceNumber++;
                        Trace(context, $"Removed completed {IdOf(operation)} (queue empty)");
                        return true;
                    }

                    // Unlink from the circular list; if we removed the tail, the previous node
                    // becomes the new tail.
                    previous!.Next = current!.Next;
                    if (removedTail)
                    {
                        _tail = (TOperation)previous;
                    }

                    if (removedHead)
                    {
                        // The head changed; refresh the "next operation is synchronous" hint,
                        // which callers use to decide inline vs. thread-pool processing.
                        Debug.Assert(_tail != null);
                        _isNextOperationSynchronous = _tail.Next.Event != null;
                    }

                    Trace(context, $"Removed completed {IdOf(operation)}");
                    return true;
                }
            }
+
public void CancelAndContinueProcessing(TOperation op)
{
// Note, only sync operations use this method.
@@ -1244,6 +1386,17 @@ public bool StopAndAbort(SocketAsyncContext context)
return aborted;
}
            /// <summary>
            /// Throw helper kept out of the caller's hot path so the throw site does not
            /// prevent inlining; hidden from stack traces.
            /// </summary>
            [DoesNotReturn]
            [StackTraceHidden]
            private static void ThrowInternalException(Interop.Error error) =>
                throw new InternalException(error);
+
            /// <summary>
            /// Terminates the process on an impossible queue state. NoInlining keeps the
            /// FailFast call (and its string interpolation) out of callers' code paths.
            /// </summary>
            [DoesNotReturn]
            [StackTraceHidden]
            [MethodImpl(MethodImplOptions.NoInlining)]
            private static void FailFastUnexpectedQueueState(QueueState state) =>
                Environment.FailFast($"unexpected queue state: {state}");
+
[Conditional("SOCKETASYNCCONTEXT_TRACE")]
public void Trace(SocketAsyncContext context, string message, [CallerMemberName] string? memberName = null)
{
@@ -1328,6 +1481,7 @@ public bool StopAndAbort()
// Drain queues
aborted |= _sendQueue.StopAndAbort(this);
aborted |= _receiveQueue.StopAndAbort(this);
+ LinuxOnStopAndAbort();
// We don't need to synchronize with Register.
// This method is called when the handle gets released.
@@ -1360,7 +1514,7 @@ public void SetHandleNonBlocking()
{
if (Interop.Sys.Fcntl.SetIsNonBlocking(_socket, 1) != 0)
{
- throw new SocketException((int)SocketPal.GetSocketErrorForErrorCode(Interop.Sys.GetLastError()));
+ ThrowSocketExceptionFromLastError();
}
_isHandleNonBlocking = true;
@@ -1369,11 +1523,36 @@ public void SetHandleNonBlocking()
public bool IsHandleNonBlocking => _isHandleNonBlocking;
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void ThrowIfThreadsAreNotSupported()
+ {
+ if (!Socket.OSSupportsThreads)
+ {
+ ThrowPlatformNotSupportedForMissingThreadSupport();
+ }
+ }
+
        /// <summary>
        /// Shared precondition check for the blocking (sync) operation entry points:
        /// verifies thread support and (debug-only) that the timeout is -1 (infinite) or positive.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void ValidateSyncOperationPreconditions(int timeout)
        {
            ThrowIfThreadsAreNotSupported();
            Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
        }
+
        /// <summary>Cold throw helper for platforms without thread support; hidden from stack traces.</summary>
        [DoesNotReturn]
        [StackTraceHidden]
        private static void ThrowPlatformNotSupportedForMissingThreadSupport() =>
            throw new PlatformNotSupportedException();
+
        /// <summary>
        /// Throws a <see cref="SocketException"/> mapped from the current thread's last errno.
        /// Must be called immediately after the failing PAL call, before errno can be clobbered.
        /// </summary>
        [DoesNotReturn]
        [StackTraceHidden]
        private static void ThrowSocketExceptionFromLastError() =>
            throw new SocketException((int)SocketPal.GetSocketErrorForErrorCode(Interop.Sys.GetLastError()));
+
private void PerformSyncOperation(ref OperationQueue queue, TOperation operation, int timeout, int observedSequenceNumber)
where TOperation : AsyncOperation
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
using (var e = new ManualResetEventSlim(false, 0))
{
@@ -1509,7 +1688,7 @@ public SocketError AcceptAsync(Memory socketAddress, out int socketAddress
public SocketError Connect(Memory socketAddress)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
+ ThrowIfThreadsAreNotSupported();
Debug.Assert(socketAddress.Length > 0, $"Unexpected socketAddressLen: {socketAddress.Length}");
// Connect is different than the usual "readiness" pattern of other operations.
@@ -1603,9 +1782,7 @@ public SocketError ReceiveAsync(Memory buffer, SocketFlags flags, out int
public SocketError ReceiveFrom(Memory buffer, ref SocketFlags flags, Memory socketAddress, out int socketAddressLen, int timeout, out int bytesReceived)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
SocketFlags receivedFlags;
SocketError errorCode;
@@ -1636,7 +1813,7 @@ public SocketError ReceiveFrom(Memory buffer, ref SocketFlags flags, Memor
public unsafe SocketError ReceiveFrom(Span buffer, ref SocketFlags flags, Memory socketAddress, out int socketAddressLen, int timeout, out int bytesReceived)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
+ ValidateSyncOperationPreconditions(timeout);
SocketFlags receivedFlags;
SocketError errorCode;
@@ -1748,9 +1925,7 @@ public SocketError ReceiveAsync(IList> buffers, SocketFlags f
public SocketError ReceiveFrom(IList> buffers, ref SocketFlags flags, Memory socketAddress, out int socketAddressLen, int timeout, out int bytesReceived)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
SocketFlags receivedFlags;
SocketError errorCode;
@@ -1817,9 +1992,7 @@ public SocketError ReceiveFromAsync(IList> buffers, SocketFla
public SocketError ReceiveMessageFrom(
Memory buffer, ref SocketFlags flags, Memory socketAddress, out int socketAddressLen, bool isIPv4, bool isIPv6, int timeout, out IPPacketInformation ipPacketInformation, out int bytesReceived)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
SocketFlags receivedFlags;
SocketError errorCode;
@@ -1854,9 +2027,7 @@ public SocketError ReceiveMessageFrom(
public unsafe SocketError ReceiveMessageFrom(
Span buffer, ref SocketFlags flags, Memory socketAddress, out int socketAddressLen, bool isIPv4, bool isIPv6, int timeout, out IPPacketInformation ipPacketInformation, out int bytesReceived)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
SocketFlags receivedFlags;
SocketError errorCode;
@@ -1946,9 +2117,7 @@ public SocketError SendAsync(Memory buffer, int offset, int count, SocketF
public SocketError SendTo(byte[] buffer, int offset, int count, SocketFlags flags, Memory socketAddress, int timeout, out int bytesSent)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
bytesSent = 0;
SocketError errorCode;
@@ -1978,9 +2147,7 @@ public SocketError SendTo(byte[] buffer, int offset, int count, SocketFlags flag
public unsafe SocketError SendTo(ReadOnlySpan buffer, SocketFlags flags, Memory socketAddress, int timeout, out int bytesSent)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
bytesSent = 0;
SocketError errorCode;
@@ -2057,9 +2224,7 @@ public SocketError SendAsync(IList> buffers, SocketFlags flag
public SocketError SendTo(IList> buffers, SocketFlags flags, Memory socketAddress, int timeout, out int bytesSent)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
bytesSent = 0;
int bufferIndex = 0;
@@ -2127,9 +2292,7 @@ public SocketError SendToAsync(IList> buffers, SocketFlags fl
public SocketError SendFile(SafeFileHandle fileHandle, long offset, long count, int timeout, out long bytesSent)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
bytesSent = 0;
SocketError errorCode;
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Linux.cs
new file mode 100644
index 00000000000000..1232968a0433d6
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Linux.cs
@@ -0,0 +1,5664 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+ internal sealed unsafe partial class SocketAsyncEngine
+ {
        /// <summary>
        /// Lock-free slot-based registry mapping io_uring user_data values to managed operations.
        /// user_data encodes [tag | generation | slot index]; the generation guards against a CQE
        /// for a prior incarnation of a reused slot being matched to a newer operation.
        /// All mutation is CAS-based on the per-slot Operation reference.
        /// </summary>
        private sealed class IoUringOperationRegistry
        {
            /// <summary>Result of attempting to remove a tracked operation by user_data.</summary>
            internal enum RemoveResult
            {
                Removed,
                NotFound,
                Mismatch
            }

            // The generation field sits directly above the slot-index bits in user_data.
            private const int GenerationShift = IoUringConstants.SlotIndexBits;

            // One slot per possible in-flight completion. Operation is the CAS'd ownership field;
            // Generation disambiguates reuse of the same slot index across operations.
            private struct RegistrySlot
            {
                public SocketAsyncContext.AsyncOperation? Operation;
                public uint Generation;
            }

            private readonly RegistrySlot[] _slots;
            private int _count;

            /// <summary>Initializes the registry with the specified number of completion slots.</summary>
            internal IoUringOperationRegistry(int slotCapacity)
            {
                ArgumentOutOfRangeException.ThrowIfNegativeOrZero(slotCapacity);

                _slots = new RegistrySlot[slotCapacity];
            }

            /// <summary>Returns true when no operations are currently tracked.</summary>
            internal bool IsEmpty => Volatile.Read(ref _count) == 0;
            /// <summary>Returns the current number of tracked operations.</summary>
            internal int Count => Volatile.Read(ref _count);

            /// <summary>
            /// Registers an operation by its user_data; returns false on an undecodable
            /// user_data or when the slot is already occupied (collision).
            /// </summary>
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            internal bool TryTrack(SocketAsyncContext.AsyncOperation operation)
            {
                ulong userData = operation.IoUringUserData;
                if (!TryDecodeUserData(userData, out int slotIndex, out uint generation))
                {
                    return false;
                }

                ref RegistrySlot slot = ref _slots[slotIndex];
                if (Interlocked.CompareExchange(ref slot.Operation, operation, null) is not null)
                {
                    return false;
                }

                // The generation write is ordered after the operation write. A concurrent reader
                // (TryTake) that sees the new operation but a stale generation will correctly reject
                // the take, since the generation mismatch means the CQE references a prior slot
                // incarnation. This is safe because such rejection is treated as a benign late completion.
                Volatile.Write(ref slot.Generation, generation);
                Interlocked.Increment(ref _count);
                AssertIoUringLifecycleTransition(
                    IoUringOperationLifecycleState.Prepared,
                    IoUringOperationLifecycleState.Submitted);
                return true;
            }

            /// <summary>
            /// Atomically removes and returns the operation matching both the user_data's
            /// slot index and generation; false means empty slot or generation mismatch.
            /// </summary>
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            internal bool TryTake(ulong userData, out SocketAsyncContext.AsyncOperation? operation)
            {
                operation = null;
                if (!TryDecodeUserData(userData, out int slotIndex, out uint generation))
                {
                    return false;
                }

                ref RegistrySlot slot = ref _slots[slotIndex];
                while (true)
                {
                    SocketAsyncContext.AsyncOperation? currentOperation = Volatile.Read(ref slot.Operation);
                    if (currentOperation is null)
                    {
                        return false;
                    }

                    if (Volatile.Read(ref slot.Generation) != generation)
                    {
                        return false;
                    }

                    // CAS failure means another thread took/replaced the slot; re-read and retry.
                    if (Interlocked.CompareExchange(ref slot.Operation, null, currentOperation) != currentOperation)
                    {
                        continue;
                    }

                    Interlocked.Decrement(ref _count);
                    operation = currentOperation;
                    return true;
                }
            }

            /// <summary>
            /// Re-attaches a completion owner after dispatch-side deferral (for example, SEND_ZC waiting
            /// on its NOTIF CQE). Fails if the slot has been reused in the meantime.
            /// </summary>
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            internal bool TryReattach(ulong userData, SocketAsyncContext.AsyncOperation operation)
            {
                if (!TryDecodeUserData(userData, out int slotIndex, out uint generation))
                {
                    return false;
                }

                ref RegistrySlot slot = ref _slots[slotIndex];
                if (Interlocked.CompareExchange(ref slot.Operation, operation, null) is not null)
                {
                    return false;
                }

                // Same publish ordering as TryTrack: Operation first, then Generation.
                Volatile.Write(ref slot.Generation, generation);
                Interlocked.Increment(ref _count);
                return true;
            }

            /// <summary>
            /// Removes a tracked operation (cancellation path), optionally verifying it matches an
            /// expected reference. Mismatch means the slot now holds a different operation.
            /// </summary>
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            internal RemoveResult TryUntrack(
                ulong userData,
                SocketAsyncContext.AsyncOperation? expectedOperation,
                out SocketAsyncContext.AsyncOperation? removedOperation)
            {
                removedOperation = null;
                if (!TryDecodeUserData(userData, out int slotIndex, out uint generation))
                {
                    return RemoveResult.NotFound;
                }

                ref RegistrySlot slot = ref _slots[slotIndex];
                while (true)
                {
                    SocketAsyncContext.AsyncOperation? currentOperation = Volatile.Read(ref slot.Operation);
                    if (currentOperation is null)
                    {
                        return RemoveResult.NotFound;
                    }

                    if (Volatile.Read(ref slot.Generation) != generation)
                    {
                        return RemoveResult.NotFound;
                    }

                    if (expectedOperation is not null && !ReferenceEquals(currentOperation, expectedOperation))
                    {
                        return RemoveResult.Mismatch;
                    }

                    if (Interlocked.CompareExchange(ref slot.Operation, null, currentOperation) != currentOperation)
                    {
                        continue;
                    }

                    Interlocked.Decrement(ref _count);
                    removedOperation = currentOperation;
                    AssertIoUringLifecycleTransition(
                        IoUringOperationLifecycleState.Submitted,
                        IoUringOperationLifecycleState.Canceled);
                    return RemoveResult.Removed;
                }
            }

            /// <summary>Returns whether an operation with the given user_data and generation is currently tracked.</summary>
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            internal bool Contains(ulong userData)
            {
                if (!TryDecodeUserData(userData, out int slotIndex, out uint generation))
                {
                    return false;
                }

                ref RegistrySlot slot = ref _slots[slotIndex];
                return Volatile.Read(ref slot.Operation) is not null &&
                       Volatile.Read(ref slot.Generation) == generation;
            }

            /// <summary>Returns the tracked operation for the given user_data without untracking it.</summary>
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            internal bool TryGet(ulong userData, out SocketAsyncContext.AsyncOperation? operation)
            {
                operation = null;
                if (!TryDecodeUserData(userData, out int slotIndex, out uint generation))
                {
                    return false;
                }

                ref RegistrySlot slot = ref _slots[slotIndex];
                SocketAsyncContext.AsyncOperation? currentOperation = Volatile.Read(ref slot.Operation);
                if (currentOperation is null)
                {
                    return false;
                }

                if (Volatile.Read(ref slot.Generation) != generation)
                {
                    return false;
                }

                // NOTE(review): between the reads above and this return the slot may be taken by a
                // racer; callers must treat the result as a snapshot, not ownership.
                operation = currentOperation;
                return true;
            }

            /// <summary>
            /// Atomically replaces the tracked operation for the given user_data.
            /// Used by persistent multishot receive to attach the next managed operation
            /// to an already-armed kernel multishot request.
            /// </summary>
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            internal bool TryReplace(ulong userData, SocketAsyncContext.AsyncOperation newOperation)
            {
                if (!TryDecodeUserData(userData, out int slotIndex, out uint generation))
                {
                    return false;
                }

                ref RegistrySlot slot = ref _slots[slotIndex];
                while (true)
                {
                    SocketAsyncContext.AsyncOperation? currentOperation = Volatile.Read(ref slot.Operation);
                    if (currentOperation is null)
                    {
                        return false;
                    }

                    if (Volatile.Read(ref slot.Generation) != generation)
                    {
                        return false;
                    }

                    if (Interlocked.CompareExchange(ref slot.Operation, newOperation, currentOperation) == currentOperation)
                    {
                        return true;
                    }
                }
            }

            /// <summary>
            /// Removes and yields all tracked operations during teardown.
            /// Lazy iterator: slots are exchanged only as the caller enumerates.
            /// </summary>
            internal IEnumerable DrainAllTrackedOperations()
            {
                for (int i = 0; i < _slots.Length; i++)
                {
                    SocketAsyncContext.AsyncOperation? operation = Interlocked.Exchange(ref _slots[i].Operation, null);
                    if (operation is not null)
                    {
                        Interlocked.Decrement(ref _count);
                        AssertIoUringLifecycleTransition(
                            IoUringOperationLifecycleState.Submitted,
                            IoUringOperationLifecycleState.Detached);
                        yield return operation;
                    }
                }
            }

            /// <summary>
            /// Extracts the slot index and generation from an encoded user_data value.
            /// 0 is reserved (never a valid encoded value); an out-of-range index is rejected.
            /// </summary>
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            private bool TryDecodeUserData(ulong userData, out int slotIndex, out uint generation)
            {
                if (userData == 0)
                {
                    slotIndex = 0;
                    generation = 0;
                    return false;
                }

                slotIndex = (int)(userData & IoUringConstants.SlotIndexMask);
                if ((uint)slotIndex >= (uint)_slots.Length)
                {
                    generation = 0;
                    return false;
                }

                generation = (uint)((userData >> GenerationShift) & IoUringConstants.GenerationMask);
                return true;
            }
        }
+
        /// <summary>
        /// Indicates which io_uring dispatch mode is active for this engine instance.
        /// Disabled means the engine falls back to the readiness (epoll-style) path.
        /// </summary>
        private enum LinuxIoUringMode : byte
        {
            Disabled = 0,
            CompletionMode = 1
        }
+
        /// <summary>
        /// Distinguishes cancellation requests issued during normal runtime from those
        /// issued during engine teardown.
        /// </summary>
        private enum IoUringCancellationOrigin : byte
        {
            Runtime = 0,
            Teardown = 1
        }
+
        /// <summary>
        /// Tracks the lifecycle of an io_uring operation for debug assertions on valid
        /// state transitions (see AssertIoUringLifecycleTransition call sites).
        /// </summary>
        private enum IoUringOperationLifecycleState : byte
        {
            Queued = 0,     // accepted but no SQE built yet
            Prepared = 1,   // SQE built, not yet visible to the kernel
            Submitted = 2,  // tracked in the registry, kernel owns it
            Completed = 3,  // CQE consumed and dispatched
            Canceled = 4,   // removed via the cancellation path
            Detached = 5    // drained at teardown without a CQE
        }
+
+ /// Immutable snapshot of negotiated io_uring capabilities for this engine instance.
+ private readonly struct LinuxIoUringCapabilities
+ {
+ /// Whether the engine's port was created as an io_uring instance.
+ internal bool IsIoUringPort { get; }
+ /// The active io_uring dispatch mode.
+ internal LinuxIoUringMode Mode { get; }
+ /// Whether multishot recv can be used by this engine instance.
+ internal bool SupportsMultishotRecv { get; }
+ /// Whether multishot accept can be used by this engine instance.
+ internal bool SupportsMultishotAccept { get; }
+ /// Whether zero-copy send is enabled for this engine instance.
+ internal bool SupportsZeroCopySend { get; }
+ /// Whether SQPOLL mode is enabled for this engine instance.
+ internal bool SqPollEnabled { get; }
+
+ /// Whether the engine is operating in full completion mode.
+ internal bool IsCompletionMode =>
+ Mode == LinuxIoUringMode.CompletionMode;
+
+ /// Creates a capabilities snapshot with the given port type and mode.
+ internal LinuxIoUringCapabilities(
+ bool isIoUringPort,
+ LinuxIoUringMode mode,
+ bool supportsMultishotRecv,
+ bool supportsMultishotAccept,
+ bool supportsZeroCopySend,
+ bool sqPollEnabled)
+ {
+ IsIoUringPort = isIoUringPort;
+ Mode = mode;
+ SupportsMultishotRecv = supportsMultishotRecv;
+ SupportsMultishotAccept = supportsMultishotAccept;
+ SupportsZeroCopySend = supportsZeroCopySend;
+ SqPollEnabled = sqPollEnabled;
+ }
+ }
+
        /// <summary>
        /// Mirrors kernel struct io_uring_sqe (64 bytes), written to the SQ ring for submission.
        /// Field offsets follow include/uapi/linux/io_uring.h; several kernel unions are
        /// represented by their most-used member.
        /// </summary>
        [StructLayout(LayoutKind.Explicit, Size = 64)]
        internal struct IoUringSqe
        {
            [FieldOffset(0)]
            internal byte Opcode;        // IORING_OP_* value
            [FieldOffset(1)]
            internal byte Flags;         // IOSQE_* flags
            [FieldOffset(2)]
            internal ushort Ioprio;      // per-opcode flags for send/recv/accept
            [FieldOffset(4)]
            internal int Fd;             // target fd, or registered-file index with IOSQE_FIXED_FILE
            [FieldOffset(8)]
            internal ulong Off;          // union: off / addr2
            [FieldOffset(16)]
            internal ulong Addr;         // union: addr / splice_off_in
            [FieldOffset(24)]
            internal uint Len;
            [FieldOffset(28)]
            internal uint RwFlags;       // union: rw_flags / msg_flags / accept_flags / ...
            [FieldOffset(32)]
            internal ulong UserData;     // echoed back verbatim in the CQE
            [FieldOffset(40)]
            internal ushort BufIndex;    // union: buf_index / buf_group
            [FieldOffset(42)]
            internal ushort Personality;
            [FieldOffset(44)]
            internal int SpliceFdIn;
            [FieldOffset(48)]
            internal ulong Addr3;
        }
+
        /// <summary>
        /// Mirrors kernel struct io_uring_probe_op (8 bytes per entry in the probe ops array).
        /// Flags carries IO_URING_OP_SUPPORTED (see IoUringConstants.ProbeOpFlagSupported).
        /// </summary>
        [StructLayout(LayoutKind.Explicit, Size = 8)]
        private struct IoUringProbeOp
        {
            [FieldOffset(0)] internal byte Op;
            [FieldOffset(1)] internal byte Resv;
            [FieldOffset(2)] internal ushort Flags;
            // 4 bytes reserved at offset 4
        }
+
        /// <summary>
        /// Mirrors kernel struct io_uring_probe (16-byte header preceding the variable-length
        /// ops array returned by IORING_REGISTER_PROBE).
        /// </summary>
        [StructLayout(LayoutKind.Explicit, Size = 16)]
        private struct IoUringProbeHeader
        {
            [FieldOffset(0)] internal byte LastOp;  // highest opcode the kernel knows about
            [FieldOffset(1)] internal byte OpsLen;  // number of entries in the trailing ops array
            // 14 bytes reserved at offset 2
        }
+
+ ///
+ /// Kernel ABI opcode constants as a static class (not an enum) to avoid byte-cast noise
+ /// at every SQE write site, since the SQE Opcode field is typed as byte.
+ ///
+ private static class IoUringOpcodes
+ {
+ internal const byte ReadFixed = 4;
+ internal const byte Send = 26;
+ internal const byte Recv = 27;
+ internal const byte SendMsg = 9;
+ internal const byte RecvMsg = 10;
+ internal const byte Accept = 13;
+ internal const byte Connect = 16;
+ internal const byte SendZc = 53;
+ internal const byte SendMsgZc = 54;
+ internal const byte AsyncCancel = 14;
+ internal const byte PollAdd = 6;
+ internal const byte PollRemove = 7;
+ }
+
+ ///
+ /// Centralizes io_uring ABI constants that mirror the native definitions in pal_io_uring.c.
+ /// These are used by managed code that directly interacts with the io_uring submission
+ /// and completion rings (e.g., direct SQE writes via mmap'd ring access).
+ ///
+ private static class IoUringConstants
+ {
+ // Setup flags (io_uring_setup params.flags)
+ internal const uint SetupCqSize = 1u << 3;
+ internal const uint SetupSqPoll = 1u << 5;
+ internal const uint SetupSubmitAll = 1u << 7;
+ internal const uint SetupCoopTaskrun = 1u << 8;
+ internal const uint SetupSingleIssuer = 1u << 12;
+ internal const uint SetupDeferTaskrun = 1u << 13;
+ internal const uint SetupNoSqArray = 1u << 16;
+
+ // Feature flags (io_uring_params.features)
+ internal const uint FeatureSingleMmap = 1u << 0;
+ internal const uint FeatureExtArg = 1u << 8;
+
+ // Enter flags (io_uring_enter flags parameter)
+ internal const uint EnterGetevents = 1u << 0;
+ internal const uint EnterSqWakeup = 1u << 1;
+ internal const uint EnterExtArg = 1u << 3;
+ internal const uint EnterRegisteredRing = 1u << 4;
+
+ // SQ ring flags (sq_ring->flags)
+ internal const uint SqNeedWakeup = 1u << 0;
+
+ // Register opcodes
+ internal const uint RegisterBuffers = 0;
+ internal const uint UnregisterBuffers = 1;
+ internal const uint RegisterFiles = 2;
+ internal const uint UnregisterFiles = 3;
+ internal const uint RegisterFilesUpdate = 6;
+ internal const uint RegisterProbe = 8;
+ internal const uint RegisterRingFds = 20;
+ internal const uint UnregisterRingFds = 21;
+ internal const uint RegisterPbufRing = 22;
+ internal const uint UnregisterPbufRing = 23;
+
+ // Register helper values
+ internal const uint RegisterOffsetAuto = 0xFFFFFFFFU;
+
+ // Probe op flags
+ internal const uint ProbeOpFlagSupported = 1u << 0;
+
+ // Poll flags
+ internal const uint PollAddFlagMulti = 1u << 0;
+
+ // CQE flags
+ internal const uint CqeFBuffer = 1u << 0; // IORING_CQE_F_BUFFER (buffer id in upper bits)
+ internal const uint CqeFMore = 1u << 1; // IORING_CQE_F_MORE (multishot)
+ internal const uint CqeFNotif = 1u << 2; // IORING_CQE_F_NOTIF (zero-copy notification)
+ internal const int CqeBufferShift = 16; // IORING_CQE_BUFFER_SHIFT
+
+ // Recv ioprio flags
+ internal const ushort RecvMultishot = 1 << 1; // IORING_RECV_MULTISHOT
+ // Accept ioprio flags
+ internal const ushort AcceptMultishot = 1 << 0; // IORING_ACCEPT_MULTISHOT
+
+ // SQE flags
+ internal const byte SqeFixedFile = 1 << 0; // IOSQE_FIXED_FILE
+ internal const byte SqeBufferSelect = 1 << 5; // IOSQE_BUFFER_SELECT
+
+ // Sizing
+ internal const uint QueueEntries = 1024;
+ internal const uint CqEntriesFactor = 4;
+ internal const uint MaxCqeDrainBatch = 128;
+ internal const long BoundedWaitTimeoutNanos = 50L * 1000 * 1000; // 50ms
+
+ // Registration sizing
+ internal const uint RegistrationBucketCountMin = 2048;
+ internal const uint RegistrationBucketCountFactor = 8;
+ internal const uint RegistrationBucketCountMax = 32768;
+ internal const uint RegisteredFileSlotCountFactor = 4;
+
+ // Completion operation pool sizing
+ internal const int CompletionOperationPoolCapacityFactor = 2;
+
+ // mmap offsets (from kernel UAPI: IORING_OFF_SQ_RING, IORING_OFF_CQ_RING, IORING_OFF_SQES)
+ internal const ulong OffSqRing = 0;
+ internal const ulong OffCqRing = 0x8000000;
+ internal const ulong OffSqes = 0x10000000;
+
+ // Minimum kernel version for io_uring engine
+ internal const int MinKernelMajor = 6;
+ internal const int MinKernelMinor = 1;
+
+ // Zero-copy send size threshold (payloads below this use regular send).
+ internal const int ZeroCopySendThreshold = 16384; // 16KB
+
+ // User data tag values (encoded in upper bits of user_data)
+ internal const byte TagNone = 0;
+ internal const byte TagPollReadiness = 1;
+ internal const byte TagReservedCompletion = 2;
+ internal const byte TagWakeupSignal = 3;
+
+ // Message inline capacities (avoid heap allocation on common small payloads)
+ internal const int MessageInlineIovCount = 4;
+ internal const int MessageInlineSocketAddressCapacity = 128; // sizeof(sockaddr_storage)
+ internal const int MessageInlineControlBufferCapacity = 128;
+
+ // Internal discriminator for io_uring vs epoll fallback detection
+ internal const int NotSocketEventPort = int.MinValue + 1;
+
+ // Completion slot encoding
+ internal const int SlotIndexBits = 24;
+ internal const ulong SlotIndexMask = (1UL << SlotIndexBits) - 1UL;
+ internal const uint GenerationMask = uint.MaxValue;
+
+ // Test hook opcode masks (mirrors IoUringTestOpcodeMask in pal_io_uring.c)
+ internal const byte TestOpcodeMaskNone = 0;
+ internal const byte TestOpcodeMaskSend = 1 << 0;
+ internal const byte TestOpcodeMaskRecv = 1 << 1;
+ internal const byte TestOpcodeMaskSendMsg = 1 << 2;
+ internal const byte TestOpcodeMaskRecvMsg = 1 << 3;
+ internal const byte TestOpcodeMaskAccept = 1 << 4;
+ internal const byte TestOpcodeMaskConnect = 1 << 5;
+ internal const byte TestOpcodeMaskSendZc = 1 << 6;
+ internal const byte TestOpcodeMaskSendMsgZc = 1 << 7;
+ }
+
        /// <summary>
        /// Captures the results of io_uring_setup(2): the ring fd, the kernel's negotiated
        /// params, and derived feature flags used later by the engine.
        /// </summary>
        private struct IoUringSetupResult
        {
            internal int RingFd;
            internal Interop.Sys.IoUringParams Params;
            internal uint NegotiatedFlags;    // setup flags the kernel actually accepted
            internal bool UsesExtArg;         // IORING_FEAT_EXT_ARG available (bounded waits)
            internal bool SqPollNegotiated;   // SQPOLL requested and granted
        }
+
        /// <summary>
        /// Discriminates completion slot metadata shape for operation-specific
        /// post-completion processing (see IoUringCompletionSlotStorage).
        /// </summary>
        private enum IoUringCompletionOperationKind : byte
        {
            None = 0,
            Accept = 1,   // slot carries a native socklen_t pointer for accept writeback
            Message = 2,  // slot carries native msghdr/iovec state for sendmsg/recvmsg
        }
+
        /// <summary>
        /// Hot per-slot metadata used on every CQE dispatch.
        /// Keep this minimal; native pointer-heavy state lives in <see cref="IoUringCompletionSlotStorage"/>.
        /// </summary>
        private struct IoUringCompletionSlot
        {
            public uint Generation;                    // matches the generation encoded in user_data
            public IoUringCompletionOperationKind Kind;
#if DEBUG
            // Test hook: forces a specific CQE result for fault injection.
            public bool HasTestForcedResult;
            public int TestForcedResult;
#endif
            public bool IsZeroCopySend;                // SEND_ZC/SENDMSG_ZC: expect a NOTIF CQE
            public bool ZeroCopyNotificationPending;   // completion seen, NOTIF still outstanding
            public bool UsesFixedRecvBuffer;
            public ushort FixedRecvBufferId;
            public int FreeListNext; // -1 = end of free list
        }
+
        /// <summary>
        /// Cold per-slot native metadata: pointers and message writeback state needed only for
        /// operation-specific completion processing. All pointers reference native allocations
        /// owned by the engine; lifetime management is not visible in this type.
        /// </summary>
        private struct IoUringCompletionSlotStorage
        {
            // Accept metadata
            public unsafe int* NativeSocketAddressLengthPtr; // socklen_t* for accept
            // Message metadata (pointers to native-alloc'd msghdr/iovec)
            public IntPtr NativeMsgHdrPtr;
            public IntPtr NativeMessageStorage; // heap-allocated contiguous block (or null for inline)
            public bool MessageIsReceive;
            // Message metadata - deep-copied native msghdr constituents (point into NativeMessageStorage block)
            public unsafe Interop.Sys.IOVector* NativeIOVectors;
            public unsafe byte* NativeSocketAddress;
            public unsafe byte* NativeControlBuffer;
            // RecvMsg output capture - pointers back to managed MessageHeader buffers for writeback
            public unsafe byte* ReceiveOutputSocketAddress;
            public unsafe byte* ReceiveOutputControlBuffer;
            public int ReceiveSocketAddressCapacity;
            public int ReceiveControlBufferCapacity;
        }
+
        /// <summary>
        /// Mirrors the kernel's struct msghdr layout for direct SQE submission, so that
        /// io_uring sendmsg/recvmsg opcodes can consume a managed-built msghdr directly.
        /// Must only be used on 64-bit Linux where sizeof(msghdr) == 56.
        /// </summary>
        [StructLayout(LayoutKind.Sequential)]
        private unsafe struct NativeMsghdr
        {
            public void* msg_name;
            public uint msg_namelen;
            // On x64, 4 bytes of padding are inserted by sequential layout before the next pointer.
            public Interop.Sys.IOVector* msg_iov;
            public nuint msg_iovlen;
            public void* msg_control;
            public nuint msg_controllen;
            public int msg_flags;
        }
+
        /// <summary>
        /// Tracks per-socket event registration state for the readiness path, including the
        /// registered-file index and the currently armed poll events.
        /// </summary>
        private sealed class SocketEventRegistration
        {
            public int Socket;                     // native fd
            public int RegisteredFileIndex = -1;   // index in the ring's registered-files table; -1 = not registered
            public Interop.Sys.SocketEvents Events;
            public uint PollEvents;                // POLL* mask currently armed via POLL_ADD
            public UIntPtr Data;                   // caller-supplied token echoed on events
            public ulong RequestId;                // user_data of the in-flight poll request
        }
+
+ /// Cross-thread request to modify a socket's event registration, completed by the event loop thread.
+ private sealed class RegistrationChangeRequest : IDisposable
+ {
+ public int Socket;
+ public Interop.Sys.SocketEvents NewEvents;
+ public UIntPtr Data;
+ public volatile Interop.Error Error;
+ public volatile bool Completed;
+ public ManualResetEventSlim CompletionEvent = new ManualResetEventSlim(false);
+
+ public void Dispose() => CompletionEvent.Dispose();
+ }
+
+ // --- Tuning constants for the managed io_uring submission/completion paths ---
+ private const int IoUringDiagnosticsPollInterval = 64;
+ // NOTE(review): looks like a sample-every-64th mask (value & 0x3F == 0) — confirm at use site.
+ private const long DiagnosticSampleMask = 0x3F;
+ // Per-pass drain caps bound the work done by a single submission cycle under load.
+ private const int MaxIoUringPrepareQueueDrainPerSubmit = 256;
+ private const int MaxIoUringCancelQueueDrainPerSubmit = 256;
+ private const int MaxSlotExhaustionRetries = 3;
+ private const int MaxIoUringSqeAcquireSubmitAttempts = 16;
+ // user_data layout: tag byte in bits 56..63, payload in the low 56 bits (see EncodeIoUringUserData).
+ private const ulong IoUringUserDataPayloadMask = 0x00FF_FFFF_FFFF_FFFFUL;
+ private const int IoUringUserDataTagShift = 56;
+ // Process-wide queue capacities and diagnostic counters (shared by all engine instances).
+ private static readonly int s_ioUringPrepareQueueCapacity = GetIoUringPrepareQueueCapacity();
+ private static readonly int s_ioUringCancellationQueueCapacity = s_ioUringPrepareQueueCapacity;
+ private static long s_ioUringPollReadinessCqeCount;
+ private static long s_ioUringPendingRetryQueuedToPrepareQueueCount;
+ private static long s_ioUringPublishedNonPinnablePrepareFallbackCount;
+ private static int s_ioUringPublishingNonPinnablePrepareFallback;
+ // Per-engine prepare/cancel queues, their lengths, and overflow diagnostics.
+ // NOTE(review): the "_ioUringPublished*" fields appear to be snapshots of the internal
+ // counters for external observation — confirm against the publishing code path.
+ private MpscQueue? _ioUringPrepareQueue;
+ private MpscQueue? _ioUringCancelQueue;
+ private long _ioUringPrepareQueueLength;
+ private long _ioUringCancelQueueLength;
+ private long _ioUringPrepareQueueOverflowCount;
+ private long _ioUringCancelQueueOverflowCount;
+ private long _ioUringPrepareQueueOverflowFallbackCount;
+ private long _ioUringCompletionSlotExhaustionCount;
+ private long _ioUringCompletionSlotDrainRecoveryCount;
+ private long _ioUringPublishedPrepareQueueLength;
+ private long _ioUringBenignLateCompletionCount;
+ private long _ioUringCompletionRequeueFailureCount;
+ private long _ioUringUntrackMismatchCount;
+ private long _ioUringPublishedPrepareQueueOverflowCount;
+ private long _ioUringPublishedPrepareQueueOverflowFallbackCount;
+ private long _ioUringPublishedCompletionRequeueFailureCount;
+ private long _ioUringPublishedCompletionSlotExhaustionCount;
+ private long _ioUringPublishedCompletionSlotDrainRecoveryCount;
+ private int _ioUringDiagnosticsPollCountdown;
+ private bool _ioUringAdvancedFeatureStateLogged;
+ // Event-loop coordination and teardown state flags.
+ private int _ioUringWakeupRequested;
+ private int _ioUringPortClosedForTeardown;
+ private int _ioUringTeardownInitiated;
+ private int _ioUringSlotCapacity;
+ private bool _completionSlotDrainInProgress;
+ // Managed submission-queue bookkeeping for the direct SQE path.
+ private uint _ioUringManagedPendingSubmissions;
+ private uint _ioUringManagedSqTail;
+ private bool _ioUringManagedSqTailLoaded;
+ private Interop.Sys.IoUringSqRingInfo _ioUringSqRingInfo;
+ private bool _ioUringDirectSqeEnabled;
+
+ // Per-opcode support flags, populated by ProbeIoUringOpcodeSupport.
+ private bool _supportsOpSend;
+ private bool _supportsOpReadFixed;
+ private bool _supportsOpRecv;
+ private bool _supportsOpSendMsg;
+ private bool _supportsOpRecvMsg;
+ private bool _supportsOpAccept;
+ private bool _supportsOpConnect;
+ private bool _supportsOpSendZc;
+ private bool _supportsOpSendMsgZc;
+ private bool _supportsOpAsyncCancel;
+ private bool _supportsOpPollAdd;
+ private bool _supportsMultishotRecv;
+ private bool _supportsMultishotAccept;
+ private bool _supportsProvidedBufferRings;
+ private bool _zeroCopySendEnabled;
+
+ // Managed ring state (populated by TryMmapRings, replaces native-provided IoUringSqRingInfo)
+ private unsafe Interop.Sys.IoUringCqe* _managedCqeBase;
+ private unsafe uint* _managedCqTailPtr;
+ private unsafe uint* _managedCqHeadPtr;
+ private uint _managedCqMask;
+ private uint _managedCqEntries;
+ private unsafe uint* _managedCqOverflowPtr;
+ // Last kernel CQ overflow count observed; used to compute telemetry deltas.
+ private uint _managedObservedCqOverflow;
+ private unsafe byte* _managedSqRingPtr;
+ private unsafe byte* _managedCqRingPtr;
+ private unsafe uint* _managedSqFlagsPtr;
+ private ulong _managedSqRingSize;
+ private ulong _managedCqRingSize;
+ private ulong _managedSqesSize;
+ private bool _managedUsesSingleMmap;
+ private int _managedRingFd;
+ private bool _managedUsesExtArg;
+ private bool _managedUsesNoSqArray;
+ private uint _managedNegotiatedFlags;
+ private bool _sqPollEnabled;
+ // Event-loop-local cache of the CQ head; published to the shared ring by AdvanceCqHead.
+ private uint _managedCachedCqHead;
+ private bool _ioUringInitialized;
+ private bool _managedCqDrainEnabled;
+ private int _managedWakeupEventFd = -1; // eventfd used to wake the ring; -1 = not created
+ private IoUringProvidedBufferRing? _ioUringProvidedBufferRing;
+ private bool _ioUringBuffersRegistered;
+ private ushort _ioUringProvidedBufferGroupId;
+ // Poll-readiness registration indexes (readiness mode only; see note below).
+ private Dictionary? _registrationsBySocket;
+ private Dictionary? _registrationsByRequestId;
+ private ulong _nextRequestId;
+ // Note: _registrationChangeQueue is only allocated when RequiresPollReadiness() is true,
+ // which is not the case in pure completion mode. It is not a target for MPSC migration
+ // until/unless a non-completion-mode io_uring path is reactivated.
+ private ConcurrentQueue? _registrationChangeQueue;
+ // Completion slot pool: per-slot metadata, native storage, and retained zero-copy pins.
+ private IoUringCompletionSlot[]? _completionSlots;
+ private IoUringCompletionSlotStorage[]? _completionSlotStorage;
+ private System.Buffers.MemoryHandle[]? _zeroCopyPinHolds;
+ private int _completionSlotFreeListHead = -1; // -1 = pool exhausted
+ private int _completionSlotsInUse;
+
+#if DEBUG
+ // Test hook state: forced completion result injection (mirrors native pal_io_uring.c test hooks).
+ private byte _testForceEagainOnceMask;
+ private byte _testForceEcanceledOnceMask;
+#endif
+
+ // Registered-file table state
+ private int[]? _registeredFiles; // slot -> fd mapping (-1 = empty)
+ private uint[]? _registeredFileFreeSlots; // stack of free slot indices
+ private uint _registeredFileFreeSlotCount;
+ private int _registeredFileHotSocket = -1;
+ private int _registeredFileHotIndex = -1;
+ private bool _usesRegisteredFiles;
+
+ private LinuxIoUringCapabilities _ioUringCapabilities;
+ /// <summary>Whether this engine instance is using io_uring completion mode.</summary>
+ internal bool IsIoUringCompletionModeEnabled => _ioUringCapabilities.IsCompletionMode;
+ /// <summary>Whether managed direct SQE submission is enabled.</summary>
+ internal bool IsIoUringDirectSqeEnabled => _ioUringDirectSqeEnabled;
+ /// <summary>Whether a connected send payload is eligible for the SEND_ZC path.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal bool ShouldTryIoUringDirectSendZeroCopy(int payloadLength) =>
+ IsIoUringZeroCopySendEligible(payloadLength, requiresSendMessageOpcode: false);
+ /// <summary>Whether a message-based send payload is eligible for the SENDMSG_ZC path.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal bool ShouldTryIoUringDirectSendMessageZeroCopy(int payloadLength) =>
+ IsIoUringZeroCopySendEligible(payloadLength, requiresSendMessageOpcode: true);
+ private IoUringOperationRegistry? _ioUringOperationRegistry;
+
+ /// <summary>
+ /// Centralized zero-copy send policy: the process-level opt-in must be set, the total
+ /// payload must meet the size threshold, and the kernel must support the required
+ /// zero-copy opcode. The threshold applies to aggregate payload bytes, so buffer-list
+ /// workloads (e.g. 4KB segments) qualify once their combined size crosses the cutoff.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool IsIoUringZeroCopySendEligible(int payloadLength, bool requiresSendMessageOpcode)
+ {
+     if (!_zeroCopySendEnabled)
+     {
+         return false;
+     }
+
+     if (payloadLength < IoUringConstants.ZeroCopySendThreshold)
+     {
+         return false;
+     }
+
+     return requiresSendMessageOpcode ? _supportsOpSendMsgZc : _supportsOpSendZc;
+ }
+
+ /// <summary>Reads the process-wide count of poll-readiness CQEs observed by managed completion drains.</summary>
+ internal static long GetIoUringPollReadinessCqeCount()
+ {
+     // Interlocked.Read guarantees an atomic 64-bit read on 32-bit platforms too.
+     return Interlocked.Read(ref s_ioUringPollReadinessCqeCount);
+ }
+
+ /// <summary>
+ /// Reads the process-wide count of pending completions that had to requeue through the
+ /// prepare queue after inline completion-mode re-prepare was not used.
+ /// </summary>
+ internal static long GetIoUringPendingRetryQueuedToPrepareQueueCount()
+ {
+     // Interlocked.Read guarantees an atomic 64-bit read on 32-bit platforms too.
+     return Interlocked.Read(ref s_ioUringPendingRetryQueuedToPrepareQueueCount);
+ }
+
+ /// <summary>
+ /// Computes the prepare-queue capacity: a DEBUG-only environment override when present,
+ /// otherwise a capacity scaled from the event buffer size with a floor of 512.
+ /// </summary>
+ private static int GetIoUringPrepareQueueCapacity()
+ {
+#if DEBUG
+     string? configuredValue = Environment.GetEnvironmentVariable(
+         "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PREPARE_QUEUE_CAPACITY");
+     if (configuredValue is not null &&
+         int.TryParse(configuredValue, out int configuredCapacity) &&
+         configuredCapacity > 0)
+     {
+         return configuredCapacity;
+     }
+#endif
+
+     // Raised default to reduce fallback frequency under bursty load:
+     // scale with the event buffer when it is large enough, never below 512.
+     if (s_eventBufferCount >= 32)
+     {
+         return Math.Max(checked(s_eventBufferCount * 4), 512);
+     }
+
+     return 512;
+ }
+
+ /// <summary>Creates a capabilities snapshot based on whether the port is io_uring.
+ /// Advanced features (multishot, zero-copy, SQPOLL) start disabled; they are enabled
+ /// elsewhere after probing.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static LinuxIoUringCapabilities ResolveLinuxIoUringCapabilities(bool isIoUringPort) =>
+ new LinuxIoUringCapabilities(
+ isIoUringPort,
+ isIoUringPort ? LinuxIoUringMode.CompletionMode : LinuxIoUringMode.Disabled,
+ supportsMultishotRecv: false,
+ supportsMultishotAccept: false,
+ supportsZeroCopySend: false,
+ sqPollEnabled: false);
+
+ /// <summary>Encodes a tag byte (bits 56..63) and a 56-bit payload into a CQE user_data value.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static ulong EncodeIoUringUserData(byte tag, ulong payload)
+ {
+     ulong tagBits = (ulong)tag << IoUringUserDataTagShift;
+     ulong payloadBits = payload & IoUringUserDataPayloadMask;
+     return tagBits | payloadBits;
+ }
+
+ /// <summary>Reads the next CQE from the completion ring without advancing the head.
+ /// Returns false when the ring is empty (the cached head has caught up with the kernel tail).</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe bool TryPeekNextCqe(out Interop.Sys.IoUringCqe* cqe)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "TryPeekNextCqe must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ cqe = null;
+ // Volatile read of the kernel-published tail pairs with the kernel's store-release.
+ uint cqTail = Volatile.Read(ref *_managedCqTailPtr);
+ if (_managedCachedCqHead == cqTail) return false;
+ uint index = _managedCachedCqHead & _managedCqMask;
+ cqe = _managedCqeBase + index;
+ return true;
+ }
+
+ /// <summary>Advances the CQ head pointer by <paramref name="count"/> entries, publishing the
+ /// consumed slots back to the kernel via a volatile store.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe void AdvanceCqHead(uint count)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "AdvanceCqHead must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ _managedCachedCqHead += count;
+ Volatile.Write(ref *_managedCqHeadPtr, _managedCachedCqHead);
+ }
+
+ /// <summary>
+ /// Drains up to IoUringConstants.MaxCqeDrainBatch CQEs from the mmap'd completion ring and
+ /// dispatches each based on the user_data tag byte:
+ /// TagReservedCompletion entries are dispatched through the handler (including SEND_ZC NOTIF
+ /// and multishot variants), TagWakeupSignal entries are handled inline, and TagPollReadiness
+ /// entries are routed to the readiness-mode fallback. Returns true when at least one CQE
+ /// was drained.
+ /// </summary>
+ private unsafe bool DrainCqeRingBatch(SocketEventHandler handler)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "DrainCqeRingBatch must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ ObserveManagedCqOverflowCounter();
+ int drained = 0;
+ bool drainedAnyCqe = false;
+ bool enqueuedFallbackEvent = false;
+
+ while (drained < (int)IoUringConstants.MaxCqeDrainBatch
+ && TryPeekNextCqe(out Interop.Sys.IoUringCqe* cqe))
+ {
+ drainedAnyCqe = true;
+ // Copy the CQE fields before advancing the head: once AdvanceCqHead publishes the
+ // slot back to the kernel, its contents may be overwritten.
+ ulong userData = cqe->UserData;
+ int result = cqe->Result;
+ uint flags = cqe->Flags;
+ AdvanceCqHead(1);
+
+ byte tag = (byte)(userData >> IoUringUserDataTagShift);
+ ulong payload = userData & IoUringUserDataPayloadMask;
+
+ switch (tag)
+ {
+ case IoUringConstants.TagReservedCompletion:
+ // CQE_F_NOTIF marks the second (notification) CQE of a zero-copy send pair.
+ if ((flags & IoUringConstants.CqeFNotif) != 0)
+ {
+ if (HandleZeroCopyNotification(payload))
+ {
+ handler.DispatchZeroCopyIoUringNotification(payload);
+ }
+
+ break;
+ }
+
+ // CQE_F_MORE means further CQEs follow for this request; treat as multishot only
+ // when the slot's operation kind actually negotiated a multishot opcode.
+ bool isMultishotCompletion = false;
+ if ((flags & IoUringConstants.CqeFMore) != 0)
+ {
+ IoUringCompletionSlot[]? completionSlots = _completionSlots;
+ int slotIndex = DecodeCompletionSlotIndex(payload);
+ if (completionSlots is not null &&
+ (uint)slotIndex < (uint)completionSlots.Length)
+ {
+ IoUringCompletionOperationKind kind = completionSlots[slotIndex].Kind;
+ isMultishotCompletion =
+ (kind == IoUringCompletionOperationKind.Message && _ioUringCapabilities.SupportsMultishotRecv) ||
+ (kind == IoUringCompletionOperationKind.Accept && _ioUringCapabilities.SupportsMultishotAccept);
+ }
+ }
+ // Apply forced test results and copy operation-specific outputs out of the slot
+ // before dispatch (the slot may be freed inside this call for one-shot ops).
+ ResolveReservedCompletionSlotMetadata(
+ payload,
+ isMultishotCompletion,
+ ref result,
+ out int completionSocketAddressLen,
+ out int completionControlBufferLen,
+ out uint completionAuxiliaryData,
+ out bool hasFixedRecvBuffer,
+ out ushort fixedRecvBufferId);
+
+ if (isMultishotCompletion)
+ {
+ handler.DispatchMultishotIoUringCompletion(
+ payload,
+ result,
+ flags,
+ completionSocketAddressLen,
+ completionControlBufferLen,
+ completionAuxiliaryData,
+ hasFixedRecvBuffer,
+ fixedRecvBufferId,
+ ref enqueuedFallbackEvent);
+ }
+ else
+ {
+ handler.DispatchSingleIoUringCompletion(
+ payload,
+ result,
+ flags,
+ completionSocketAddressLen,
+ completionControlBufferLen,
+ completionAuxiliaryData,
+ hasFixedRecvBuffer,
+ fixedRecvBufferId,
+ ref enqueuedFallbackEvent);
+ }
+ break;
+ case IoUringConstants.TagWakeupSignal:
+ HandleManagedWakeupSignal(result);
+ break;
+ case IoUringConstants.TagPollReadiness:
+ HandlePollReadinessCqe(payload, result, flags);
+ break;
+ default:
+ break; // Unknown tag - silently ignore.
+ }
+
+ drained++;
+ }
+
+ // Some dispatches enqueue work instead of completing inline; make sure a worker runs it.
+ if (enqueuedFallbackEvent)
+ {
+ EnsureWorkerScheduled();
+ }
+
+ return drainedAnyCqe;
+ }
+
+ /// <summary>
+ /// Resolves metadata for a reserved completion by applying forced test results and copying
+ /// operation-specific completion outputs (accept/recvmsg) from native storage. For one-shot
+ /// completions this also decides the slot's fate: free it immediately, or keep it alive
+ /// until the SEND_ZC NOTIF CQE arrives.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void ResolveReservedCompletionSlotMetadata(
+ ulong payload,
+ bool isMultishotCompletion,
+ ref int result,
+ out int completionSocketAddressLen,
+ out int completionControlBufferLen,
+ out uint completionAuxiliaryData,
+ out bool hasFixedRecvBuffer,
+ out ushort fixedRecvBufferId)
+ {
+ completionSocketAddressLen = 0;
+ completionControlBufferLen = 0;
+ completionAuxiliaryData = 0;
+ hasFixedRecvBuffer = false;
+ fixedRecvBufferId = 0;
+
+ // Out-of-range slot index: nothing to resolve (outputs stay zeroed).
+ int slotIndex = DecodeCompletionSlotIndex(payload);
+ if ((uint)slotIndex >= (uint)_completionSlots!.Length)
+ {
+ return;
+ }
+
+ ref IoUringCompletionSlot slot = ref _completionSlots[slotIndex];
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
+
+ // DEBUG-only test hook: may overwrite 'result' with a forced EAGAIN/ECANCELED.
+ ResolveDebugTestForcedResult(ref slot, ref result);
+
+ // Hand the provided-buffer id to the caller and disarm it on the slot so a later
+ // FreeCompletionSlot does not recycle the buffer a second time.
+ if (slot.UsesFixedRecvBuffer)
+ {
+ hasFixedRecvBuffer = true;
+ fixedRecvBufferId = slot.FixedRecvBufferId;
+ slot.UsesFixedRecvBuffer = false;
+ slot.FixedRecvBufferId = 0;
+ Debug.Assert(!isMultishotCompletion, "Fixed-buffer receive completions are expected to be one-shot.");
+ }
+
+ if (slot.Kind == IoUringCompletionOperationKind.Accept &&
+ slotStorage.NativeSocketAddressLengthPtr is not null)
+ {
+ // Accept: the kernel wrote the peer address length into native storage.
+ int nativeSocketAddressLength = *slotStorage.NativeSocketAddressLengthPtr;
+ completionAuxiliaryData = nativeSocketAddressLength >= 0 ? (uint)nativeSocketAddressLength : 0u;
+ if (isMultishotCompletion)
+ {
+ // Multishot accept reuses the storage: restore capacity for the next CQE.
+ int socketAddressCapacity = slotStorage.ReceiveSocketAddressCapacity;
+ *slotStorage.NativeSocketAddressLengthPtr = socketAddressCapacity >= 0 ? socketAddressCapacity : 0;
+ }
+ }
+ else if (slot.Kind == IoUringCompletionOperationKind.Message)
+ {
+ CopyMessageCompletionOutputs(
+ slotIndex,
+ out completionSocketAddressLen,
+ out completionControlBufferLen,
+ out completionAuxiliaryData);
+ }
+
+ // Multishot slots stay allocated; one-shot slots are freed now unless a SEND_ZC
+ // NOTIF CQE is still outstanding for them.
+ if (!isMultishotCompletion)
+ {
+ if (!slot.IsZeroCopySend)
+ {
+ FreeCompletionSlot(slotIndex);
+ }
+ else if (result < 0)
+ {
+ // Error completion path may not produce a NOTIF CQE.
+ FreeCompletionSlot(slotIndex);
+ }
+ else if (!slot.ZeroCopyNotificationPending)
+ {
+ // First CQE for zero-copy send: keep slot alive until NOTIF CQE arrives.
+ slot.ZeroCopyNotificationPending = true;
+ AssertZeroCopyNotificationPendingForPayload(payload);
+ }
+ }
+ }
+
+ /// <summary>
+ /// Handles a SEND_ZC NOTIF CQE: when the payload maps to a slot that is still armed for
+ /// zero-copy notification, disarms it, releases the retained completion slot, and
+ /// returns true. Returns false for stale or non-zero-copy payloads.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool HandleZeroCopyNotification(ulong payload)
+ {
+     IoUringCompletionSlot[]? slots = _completionSlots;
+     if (slots is null)
+     {
+         return false;
+     }
+
+     int index = DecodeCompletionSlotIndex(payload);
+     if ((uint)index >= (uint)slots.Length)
+     {
+         return false;
+     }
+
+     ref IoUringCompletionSlot slot = ref slots[index];
+     if (slot.IsZeroCopySend && slot.ZeroCopyNotificationPending)
+     {
+         slot.IsZeroCopySend = false;
+         slot.ZeroCopyNotificationPending = false;
+         FreeCompletionSlot(index);
+         return true;
+     }
+
+     return false;
+ }
+
+ /// <summary>Returns true when the completion slot encoded in <paramref name="userData"/>
+ /// is a zero-copy send still waiting on its SEND_ZC NOTIF CQE.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool IsZeroCopyNotificationPending(ulong userData)
+ {
+     IoUringCompletionSlot[]? slots = _completionSlots;
+     if (slots is null)
+     {
+         return false;
+     }
+
+     int index = DecodeCompletionSlotIndex(userData & IoUringUserDataPayloadMask);
+     return (uint)index < (uint)slots.Length
+         && slots[index].IsZeroCopySend
+         && slots[index].ZeroCopyNotificationPending;
+ }
+
+ /// <summary>Debug assertion that a reserved completion payload remains armed for SEND_ZC
+ /// NOTIF. Re-encodes the payload with the reserved tag and checks the slot state.</summary>
+ [Conditional("DEBUG")]
+ private void AssertZeroCopyNotificationPendingForPayload(ulong payload)
+ {
+ ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload);
+ Debug.Assert(
+ IsZeroCopyNotificationPending(userData),
+ "SEND_ZC first CQE must leave the completion slot pending until NOTIF CQE arrives.");
+ }
+
+ /// <summary>Debug assertion that SEND_ZC completion dispatch is deferred until NOTIF arrives:
+ /// the operation must still carry the original user_data and the slot must still be armed.</summary>
+ [Conditional("DEBUG")]
+ private void AssertZeroCopyDeferredCompletionState(ulong userData, SocketAsyncContext.AsyncOperation operation)
+ {
+ Debug.Assert(
+ operation.IoUringUserData == userData,
+ "Deferred SEND_ZC completion must retain the original user_data until NOTIF CQE dispatch.");
+ Debug.Assert(
+ IsZeroCopyNotificationPending(userData),
+ "Deferred SEND_ZC completion requires an armed NOTIF state.");
+ }
+
+ /// <summary>Observes kernel CQ overflow count deltas and emits telemetry/logs when the
+ /// counter has advanced since the last observation.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe void ObserveManagedCqOverflowCounter()
+ {
+ if (_managedCqOverflowPtr is null)
+ {
+ return;
+ }
+
+ uint observedOverflow = Volatile.Read(ref *_managedCqOverflowPtr);
+ uint previousOverflow = _managedObservedCqOverflow;
+ // NOTE(review): a uint wraparound of the kernel counter would be missed by this <= check —
+ // presumed unreachable in practice given overflow frequencies; confirm if that matters.
+ if (observedOverflow <= previousOverflow)
+ {
+ return;
+ }
+
+ uint delta = observedOverflow - previousOverflow;
+ _managedObservedCqOverflow = observedOverflow;
+ SocketsTelemetry.Log.IoUringCqOverflow(delta);
+
+ if (NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringCqOverflow(observedOverflow, delta);
+ }
+ }
+
+ /// <summary>Handles a poll-readiness CQE. In pure completion mode such CQEs are not
+ /// expected (asserted in DEBUG); when one arrives anyway it is counted for diagnostics
+ /// and still dispatched so no readiness signal is lost.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void HandlePollReadinessCqe(ulong payload, int result, uint flags)
+ {
+ Debug.Assert(!_ioUringInitialized || !_ioUringCapabilities.IsCompletionMode,
+ "Unexpected poll readiness CQE in pure io_uring completion mode");
+ if (_ioUringCapabilities.IsCompletionMode)
+ {
+ RecordIoUringPollReadinessCqe();
+ }
+
+ DispatchManagedPollReadinessCqe(payload, result, flags);
+ }
+
+ /// <summary>
+ /// Handles a wakeup signal CQE by consuming the eventfd counter so the next wakeup write
+ /// is observable. EAGAIN from the read is benign (the counter was already drained).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe void HandleManagedWakeupSignal(int cqeResult)
+ {
+ if (cqeResult >= 0 && _managedWakeupEventFd >= 0)
+ {
+ ulong value;
+ Interop.Error readError = Interop.Sys.IoUringShimReadEventFd(_managedWakeupEventFd, &value);
+ if (readError != Interop.Error.SUCCESS &&
+ readError != Interop.Error.EAGAIN &&
+ NetEventSource.Log.IsEnabled())
+ {
+ LogWakeupReadFailure(this, readError);
+ }
+ }
+
+ // NoInlining keeps the (rare) logging path out of the caller's hot path.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static void LogWakeupReadFailure(SocketAsyncEngine engine, Interop.Error readErrorCode)
+ {
+ NetEventSource.Error(engine, $"io_uring wakeup eventfd read failed: error={readErrorCode}");
+ }
+ }
+
+ // Poll event constants matching Linux UAPI definitions (linux/poll.h).
+ // POLLIN = 0x0001, POLLOUT = 0x0004, POLLERR = 0x0008, POLLHUP = 0x0010, POLLRDHUP = 0x2000
+ private const uint PollIn = 0x0001;
+ private const uint PollOut = 0x0004;
+ private const uint PollErr = 0x0008;
+ private const uint PollHup = 0x0010;
+ private const uint PollRdHup = 0x2000;
+
+ /// <summary>Converts <see cref="Interop.Sys.SocketEvents"/> to kernel poll event flags.
+ /// POLLERR and POLLHUP are always included; the kernel reports them unconditionally.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static uint GetIoUringPollEvents(Interop.Sys.SocketEvents events)
+ {
+     uint pollEvents = PollErr | PollHup;
+     if ((events & Interop.Sys.SocketEvents.Read) != 0)
+     {
+         pollEvents |= PollIn;
+     }
+     if ((events & Interop.Sys.SocketEvents.Write) != 0)
+     {
+         pollEvents |= PollOut;
+     }
+     if ((events & Interop.Sys.SocketEvents.ReadClose) != 0)
+     {
+         pollEvents |= PollRdHup;
+     }
+     return pollEvents;
+ }
+
+ /// <summary>Converts a kernel poll result bitmask to <see cref="Interop.Sys.SocketEvents"/>.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static Interop.Sys.SocketEvents GetSocketEventsFromPollResult(int result)
+ {
+     uint pollBits = (uint)result;
+     // Treat POLLHUP as both read + write ready (hangup means both directions signaled).
+     if ((pollBits & PollHup) != 0)
+     {
+         pollBits &= ~PollHup;
+         pollBits |= PollIn | PollOut;
+     }
+
+     Interop.Sys.SocketEvents events = Interop.Sys.SocketEvents.None;
+     if ((pollBits & PollIn) != 0)
+     {
+         events |= Interop.Sys.SocketEvents.Read;
+     }
+     if ((pollBits & PollOut) != 0)
+     {
+         events |= Interop.Sys.SocketEvents.Write;
+     }
+     if ((pollBits & PollRdHup) != 0)
+     {
+         events |= Interop.Sys.SocketEvents.ReadClose;
+     }
+     if ((pollBits & PollErr) != 0)
+     {
+         events |= Interop.Sys.SocketEvents.Error;
+     }
+     return events;
+ }
+
+ /// <summary>Looks up the event registration for a socket file descriptor, or null when
+ /// the index does not exist or has no entry for the socket.</summary>
+ private SocketEventRegistration? FindRegistrationBySocket(int socket)
+ {
+     return _registrationsBySocket is not null &&
+         _registrationsBySocket.TryGetValue(socket, out SocketEventRegistration? reg)
+         ? reg
+         : null;
+ }
+
+ /// <summary>Looks up the event registration for a poll request ID, or null when the
+ /// index does not exist or has no entry for the ID.</summary>
+ private SocketEventRegistration? FindRegistrationByRequestId(ulong requestId)
+ {
+     return _registrationsByRequestId is not null &&
+         _registrationsByRequestId.TryGetValue(requestId, out SocketEventRegistration? reg)
+         ? reg
+         : null;
+ }
+
+ /// <summary>Returns the existing registration for the socket, creating and indexing a
+ /// new one when none exists.</summary>
+ private SocketEventRegistration FindOrCreateRegistrationBySocket(int socket)
+ {
+     SocketEventRegistration? registration = FindRegistrationBySocket(socket);
+     if (registration is null)
+     {
+         registration = new SocketEventRegistration { Socket = socket };
+         _registrationsBySocket!.Add(socket, registration);
+     }
+
+     return registration;
+ }
+
+ /// <summary>
+ /// Assigns a unique, nonzero request ID and indexes the registration by it.
+ /// IDs are kept within <see cref="IoUringUserDataPayloadMask"/> and never 0, matching the
+ /// generation scheme in WritePollAddSqe: CQE dispatch looks registrations up by the masked
+ /// user_data payload, so an unmasked ID would stop matching once the counter exceeded the
+ /// 56-bit payload space.
+ /// </summary>
+ private void AssignRegistrationRequestId(SocketEventRegistration reg)
+ {
+     // Skip 0 (reserved for "no request") and wrap within the payload space so the
+     // tag byte of user_data is never clobbered.
+     do { _nextRequestId++; }
+     while ((_nextRequestId & IoUringUserDataPayloadMask) == 0);
+     ulong requestId = _nextRequestId & IoUringUserDataPayloadMask;
+
+     reg.RequestId = requestId;
+     _registrationsByRequestId!.Add(requestId, reg);
+ }
+
+ /// <summary>Removes the registration from the request ID index and resets its ID to 0
+ /// ("no outstanding request"). No-op when no ID is assigned.</summary>
+ private void ClearRegistrationRequestId(SocketEventRegistration reg)
+ {
+     ulong requestId = reg.RequestId;
+     if (requestId == 0)
+     {
+         return;
+     }
+
+     _registrationsByRequestId?.Remove(requestId);
+     reg.RequestId = 0;
+ }
+
+ /// <summary>Fully removes a socket's event registration from all indexes: releases any
+ /// registered-file slot first, then the request-ID index, then the socket index.</summary>
+ private void RemoveRegistration(SocketEventRegistration reg)
+ {
+ TryUnregisterRegisteredFileForRegistration(reg);
+ ClearRegistrationRequestId(reg);
+ _registrationsBySocket?.Remove(reg.Socket);
+ }
+
+ // Raw Linux errno values as returned by the kernel in CQE results (negative).
+ // These differ from Interop.Error which uses a PAL-specific numbering scheme.
+ // Values per asm-generic/errno(-base).h.
+ private const int ErrnoECANCELED = 125;
+ private const int ErrnoEBADF = 9;
+ private const int ErrnoENOENT = 2;
+ private const int ErrnoEINVAL = 22;
+
+ /// <summary>
+ /// Dispatches a poll readiness CQE by looking up the registration and raising the
+ /// appropriate socket events. The CQE result contains the poll events returned by the
+ /// kernel (or a negative errno on error). CQE_F_MORE means the multishot poll is still
+ /// armed; certain errnos force the registration to be removed entirely.
+ /// </summary>
+ private void DispatchManagedPollReadinessCqe(ulong requestIdPayload, int cqeResult, uint cqeFlags)
+ {
+ // Stale request ID (already cleared/removed): nothing to deliver.
+ SocketEventRegistration? reg = FindRegistrationByRequestId(requestIdPayload);
+ if (reg is null) return;
+
+ UIntPtr registrationData = reg.Data;
+ bool removeRegistration = false;
+ bool pollStillArmed = false;
+
+ Interop.Sys.SocketEvents events = Interop.Sys.SocketEvents.None;
+ if (cqeResult >= 0)
+ {
+ events = GetSocketEventsFromPollResult(cqeResult);
+ if ((cqeFlags & IoUringConstants.CqeFMore) != 0)
+ {
+ pollStillArmed = true;
+ }
+ }
+ else if (cqeResult != -ErrnoECANCELED && cqeResult != -ErrnoENOENT)
+ {
+ // ECANCELED/ENOENT are benign (explicit cancellation or already-gone request);
+ // any other errno is surfaced as an error event.
+ events = Interop.Sys.SocketEvents.Error;
+ }
+
+ // Certain errors require removing the registration entirely.
+ if (cqeResult == -ErrnoEBADF ||
+ cqeResult == -ErrnoENOENT ||
+ cqeResult == -ErrnoEINVAL)
+ {
+ removeRegistration = true;
+ }
+
+ // Once the poll is no longer armed, its request ID must not match future CQEs.
+ if (!pollStillArmed || removeRegistration)
+ {
+ ClearRegistrationRequestId(reg);
+ }
+
+ if (events != Interop.Sys.SocketEvents.None)
+ {
+ // Deliver the event through the registered context lookup table.
+ // The Data field holds the index into s_registeredContexts, matching
+ // how native poll events flow through HandleSocketEvents.
+ SocketAsyncContext? context = s_registeredContexts[(int)(nuint)registrationData];
+
+ if (context is not null)
+ {
+ if (context.PreferInlineCompletions)
+ {
+ context.HandleEventsInline(events);
+ }
+ else
+ {
+ Interop.Sys.SocketEvents filteredEvents = context.HandleSyncEventsSpeculatively(events);
+
+ if (filteredEvents != Interop.Sys.SocketEvents.None)
+ {
+ _eventQueue.Enqueue(new SocketIOEvent(context, filteredEvents));
+ }
+ }
+ }
+ }
+
+ if (removeRegistration)
+ {
+ RemoveRegistration(reg);
+ }
+ }
+
+ /// <summary>Applies an event registration change for io_uring poll-based readiness.
+ /// On the event loop thread the change is applied directly; other threads enqueue a
+ /// request and block until the event loop processes it. In pure completion mode the
+ /// change is acknowledged as a no-op.</summary>
+ partial void LinuxTryChangeSocketEventRegistration(IntPtr socketHandle, Interop.Sys.SocketEvents currentEvents, Interop.Sys.SocketEvents newEvents, int data, ref Interop.Error error, ref bool handled)
+ {
+ // Not an io_uring port (or not yet initialized): let another backend handle it.
+ if (!_ioUringInitialized) return;
+ if (_ioUringCapabilities.IsCompletionMode)
+ {
+ // In pure completion mode, readiness registration is intentionally disabled.
+ handled = true;
+ error = Interop.Error.SUCCESS;
+ return;
+ }
+
+ handled = true;
+
+ int socket = (int)(nint)socketHandle;
+
+ // Check if we're on the event loop thread - apply directly
+ int eventLoopThreadId = Volatile.Read(ref _eventLoopManagedThreadId);
+ if (eventLoopThreadId != 0 && eventLoopThreadId == Environment.CurrentManagedThreadId)
+ {
+ error = ApplyManagedRegistrationChange(socket, newEvents, (UIntPtr)(nuint)(uint)data);
+ return;
+ }
+
+ // Off event loop thread: enqueue and wait
+ var request = new RegistrationChangeRequest
+ {
+ Socket = socket,
+ NewEvents = newEvents,
+ Data = (UIntPtr)(nuint)(uint)data,
+ };
+
+ _registrationChangeQueue!.Enqueue(request);
+ WakeEventLoop();
+
+ // NOTE(review): Dispose() runs as soon as Wait() returns, while the event loop thread
+ // may still be inside CompletionEvent.Set(); ManualResetEventSlim does not document
+ // Set/Dispose as safe to race — confirm this cannot trip in practice.
+ request.CompletionEvent.Wait();
+ error = request.Error;
+ request.Dispose();
+ }
+
+ /// <summary>
+ /// Indicates whether this io_uring mode still relies on poll-readiness registration
+ /// (an io_uring port that is not running in pure completion mode).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool RequiresPollReadiness()
+ {
+     return _ioUringCapabilities.IsIoUringPort && !_ioUringCapabilities.IsCompletionMode;
+ }
+
+ /// <summary>
+ /// Applies a socket event registration change directly on the event loop thread:
+ /// cancels any outstanding POLL_ADD, then removes, creates, or re-arms the registration
+ /// as requested. Ports the native ApplySocketEventRegistrationChange from pal_io_uring.c.
+ /// </summary>
+ private unsafe Interop.Error ApplyManagedRegistrationChange(int socket, Interop.Sys.SocketEvents newEvents, UIntPtr data)
+ {
+ SocketEventRegistration? registration = FindRegistrationBySocket(socket);
+
+ // If existing registration has an active poll, remove it first
+ if (registration is not null && registration.RequestId != 0)
+ {
+ Interop.Error removeError = WritePollRemoveSqe(registration.RequestId);
+ if (removeError != Interop.Error.SUCCESS)
+ return removeError;
+ ClearRegistrationRequestId(registration);
+ }
+
+ // If deregistering (newEvents == None), just remove
+ if (newEvents == Interop.Sys.SocketEvents.None)
+ {
+ if (registration is not null)
+ RemoveRegistration(registration);
+ return Interop.Error.SUCCESS;
+ }
+
+ // Create or update registration
+ registration ??= FindOrCreateRegistrationBySocket(socket);
+
+ registration.Events = newEvents;
+ registration.PollEvents = GetIoUringPollEvents(newEvents);
+ registration.Data = data;
+
+ // Write POLL_ADD SQE; on failure the half-updated registration is removed so a stale
+ // entry cannot linger without an armed poll.
+ Interop.Error addError = WritePollAddSqe(registration);
+ if (addError != Interop.Error.SUCCESS)
+ {
+ RemoveRegistration(registration);
+ return addError;
+ }
+
+ return Interop.Error.SUCCESS;
+ }
+
+ /// <summary>
+ /// Writes a POLL_ADD SQE for the given registration, using multishot mode, and assigns a
+ /// fresh request ID used as the user_data payload for matching CQEs back to the
+ /// registration. Ports the native QueueIoUringPollAdd from pal_io_uring.c.
+ /// </summary>
+ private unsafe Interop.Error WritePollAddSqe(SocketEventRegistration registration)
+ {
+ Debug.Assert(registration.RequestId == 0);
+
+ if (!TryAcquireManagedSqeWithRetry(out IoUringSqe* sqe, out Interop.Error submitError))
+ return submitError;
+
+ // Generate monotonic request ID (wrapping within payload space)
+ do { _nextRequestId++; }
+ while ((_nextRequestId & IoUringUserDataPayloadMask) == 0);
+ ulong requestId = _nextRequestId & IoUringUserDataPayloadMask;
+
+ // Prefer a registered-file index for the fd when available (sets IOSQE_FIXED_FILE in flags).
+ TryAssignRegisteredFileForRegistration(registration, out int pollSqeFd, out byte pollSqeFlags);
+ sqe->Opcode = IoUringOpcodes.PollAdd;
+ sqe->Fd = pollSqeFd;
+ sqe->Flags = pollSqeFlags;
+ sqe->Len = IoUringConstants.PollAddFlagMulti; // IORING_POLL_ADD_MULTI
+ sqe->RwFlags = registration.PollEvents; // poll_events in rw_flags union
+ sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagPollReadiness, requestId);
+
+ registration.RequestId = requestId;
+ _registrationsByRequestId!.Add(requestId, registration);
+
+ return Interop.Error.SUCCESS;
+ }
+
+ /// <summary>
+ /// Writes a POLL_REMOVE SQE to cancel an outstanding POLL_ADD identified by
+ /// <paramref name="requestId"/>. The target is addressed by the original POLL_ADD's
+ /// user_data (re-encoded into the Addr field). No-op for requestId 0.
+ /// Ports the native QueueIoUringPollRemove from pal_io_uring.c.
+ /// </summary>
+ private unsafe Interop.Error WritePollRemoveSqe(ulong requestId)
+ {
+ if (requestId == 0)
+ return Interop.Error.SUCCESS;
+
+ if (!TryAcquireManagedSqeWithRetry(out IoUringSqe* sqe, out Interop.Error submitError))
+ return submitError;
+
+ sqe->Opcode = IoUringOpcodes.PollRemove;
+ sqe->Addr = EncodeIoUringUserData(IoUringConstants.TagPollReadiness, requestId);
+ sqe->UserData = 0; // CQE for POLL_REMOVE is untracked
+
+ return Interop.Error.SUCCESS;
+ }
+
+ /// <summary>
+ /// Drains pending registration change requests enqueued by off-event-loop threads.
+ /// Each request is applied, then the waiting thread is signaled via its completion event.
+ /// Ports the native ProcessPendingRegistrationChangeRequests from pal_io_uring.c.
+ /// </summary>
+ private void ProcessPendingRegistrationChanges()
+ {
+ ConcurrentQueue? queue = _registrationChangeQueue;
+ if (queue is null || queue.IsEmpty)
+ return;
+
+ while (queue.TryDequeue(out RegistrationChangeRequest? request))
+ {
+ // Publish the result before Set() so the waiter reads a completed request.
+ request.Error = ApplyManagedRegistrationChange(request.Socket, request.NewEvents, request.Data);
+ request.Completed = true;
+ request.CompletionEvent.Set();
+ }
+ }
+
+ /// <summary>
+ /// Allocates the completion slot arrays (slot metadata, native storage, zero-copy pin
+ /// holds) and threads every slot onto the free list. Generations start at 1 so a zeroed
+ /// user_data can never match a live slot.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void InitializeCompletionSlotPool(int capacity)
+ {
+     // A non-positive capacity would previously fail obscurely at the tail-slot write;
+     // make the precondition explicit.
+     Debug.Assert(capacity > 0, "Completion slot pool capacity must be positive.");
+
+     _completionSlots = new IoUringCompletionSlot[capacity];
+     _completionSlotStorage = new IoUringCompletionSlotStorage[capacity];
+     _zeroCopyPinHolds = new System.Buffers.MemoryHandle[capacity];
+
+     // Build the free list linking all slots; the last slot terminates with -1.
+     for (int i = 0; i < capacity; i++)
+     {
+         _completionSlots[i].Generation = 1;
+         _completionSlots[i].FreeListNext = i + 1 < capacity ? i + 1 : -1;
+     }
+
+     _completionSlotFreeListHead = 0;
+     _completionSlotsInUse = 0;
+ }
+
+ /// <summary>
+ /// Allocates a completion slot from the free list, resetting its per-operation state.
+ /// Returns the slot index, or -1 if the pool is exhausted (backpressure signal).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private int AllocateCompletionSlot()
+ {
+ Debug.Assert(_completionSlotStorage is not null);
+ int index = _completionSlotFreeListHead;
+ if (index < 0)
+ return -1; // Pool exhausted
+
+ // Unlink from the free list and clear all operation state for reuse.
+ ref IoUringCompletionSlot slot = ref _completionSlots![index];
+ _completionSlotFreeListHead = slot.FreeListNext;
+ slot.FreeListNext = -1;
+ slot.Kind = IoUringCompletionOperationKind.None;
+ ResetDebugTestForcedResult(ref slot);
+ slot.IsZeroCopySend = false;
+ slot.ZeroCopyNotificationPending = false;
+ slot.UsesFixedRecvBuffer = false;
+ slot.FixedRecvBufferId = 0;
+ _completionSlotsInUse++;
+ return index;
+ }
+
+ /// <summary>
+ /// Returns a completion slot to the free list: releases any retained zero-copy pin,
+ /// recycles a provided receive buffer, frees per-kind native storage, and increments the
+ /// slot generation so any stale user_data referencing this index no longer matches.
+ /// </summary>
+ private unsafe void FreeCompletionSlot(int index)
+ {
+ Debug.Assert(index >= 0 && index < _completionSlots!.Length);
+ Debug.Assert(_completionSlotStorage is not null);
+
+ ReleaseZeroCopyPinHold(index);
+ ref IoUringCompletionSlot slot = ref _completionSlots![index];
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![index];
+
+ // Return a provided (fixed) receive buffer to the buffer ring if one is still attached.
+ if (slot.UsesFixedRecvBuffer)
+ {
+ IoUringProvidedBufferRing? providedBufferRing = _ioUringProvidedBufferRing;
+ if (providedBufferRing is not null)
+ {
+ providedBufferRing.TryRecycleBufferFromCompletion(slot.FixedRecvBufferId);
+ }
+
+ slot.UsesFixedRecvBuffer = false;
+ slot.FixedRecvBufferId = 0;
+ }
+
+ // Free any native message storage
+ if (slot.Kind == IoUringCompletionOperationKind.Message)
+ {
+ FreeMessageStorage(index);
+ }
+ else if (slot.Kind == IoUringCompletionOperationKind.Accept)
+ {
+ if (slotStorage.NativeSocketAddressLengthPtr != null)
+ {
+ NativeMemory.Free(slotStorage.NativeSocketAddressLengthPtr);
+ slotStorage.NativeSocketAddressLengthPtr = null;
+ }
+ }
+
+ // Bump the generation, skipping 0 (0 would match a zeroed/stale user_data).
+ slot.Generation++;
+ if (slot.Generation == 0)
+ {
+ slot.Generation = 1;
+ }
+ slot.Kind = IoUringCompletionOperationKind.None;
+ ResetDebugTestForcedResult(ref slot);
+ slot.IsZeroCopySend = false;
+ slot.ZeroCopyNotificationPending = false;
+ slot.UsesFixedRecvBuffer = false;
+ slot.FixedRecvBufferId = 0;
+ slot.FreeListNext = _completionSlotFreeListHead;
+ _completionSlotFreeListHead = index;
+ _completionSlotsInUse--;
+ }
+
/// <summary>
/// Disposes and clears the zero-copy pin-hold retained for the given completion
/// slot, if the registry exists and the index is in range; otherwise does nothing.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void ReleaseZeroCopyPinHold(int slotIndex)
{
    System.Buffers.MemoryHandle[]? pinHolds = _zeroCopyPinHolds;
    if (pinHolds is not null && (uint)slotIndex < (uint)pinHolds.Length)
    {
        ref System.Buffers.MemoryHandle hold = ref pinHolds[slotIndex];
        hold.Dispose();
        hold = default;
    }
}
+
/// <summary>
/// Transfers operation-owned pin state into the engine's zero-copy pin-hold registry,
/// keyed by the completion slot decoded from <paramref name="userData"/>. On any
/// validation failure the incoming pin is disposed and an internal exception is thrown.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal void TransferIoUringZeroCopyPinHold(ulong userData, System.Buffers.MemoryHandle pinHold)
{
    System.Buffers.MemoryHandle[]? pinHolds = _zeroCopyPinHolds;
    if (pinHolds is null)
    {
        // Registry never allocated: dispose the pin so the buffer is unpinned.
        pinHold.Dispose();
        ThrowInternalException(Interop.Error.EINVAL);
        return;
    }

    int slotIndex = DecodeCompletionSlotIndex(userData & IoUringUserDataPayloadMask);
    if ((uint)slotIndex >= (uint)pinHolds.Length)
    {
        pinHold.Dispose();
        ThrowInternalException(Interop.Error.EINVAL);
        return;
    }

    Debug.Assert(_completionSlots is not null);
    ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
    // Only zero-copy sends may own a pin-hold.
    // NOTE(review): the slot generation embedded in userData is not compared against
    // slot.Generation here — verify a stale user_data cannot reach this path.
    if (!slot.IsZeroCopySend)
    {
        pinHold.Dispose();
        ThrowInternalException(Interop.Error.EINVAL);
        return;
    }

    // Replace any previously-held pin for this slot before storing the new one.
    pinHolds[slotIndex].Dispose();
    pinHolds[slotIndex] = pinHold;
}
+
/// <summary>
/// Allocates a single contiguous native memory block containing the kernel-consumable
/// struct msghdr, IOVector array, socket address, and control buffer for a
/// sendmsg/recvmsg io_uring operation. The layout within the block is:
/// [NativeMsghdr | IOVectors | SocketAddress | ControlBuffer].
/// For sendmsg, socket address and control buffer data are deep-copied from the
/// managed <see cref="Interop.Sys.MessageHeader"/>. For recvmsg, output pointers
/// are saved so completion can copy kernel-written data back to managed buffers.
/// </summary>
private unsafe void AllocateMessageStorage(int slotIndex, Interop.Sys.MessageHeader* messageHeader, bool isReceive)
{
    Debug.Assert(sizeof(NativeMsghdr) == 56, $"NativeMsghdr size mismatch with kernel struct msghdr: expected 56, got {sizeof(NativeMsghdr)}");
    ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];

    int iovCount = messageHeader->IOVectorCount;
    int sockAddrLen = messageHeader->SocketAddressLen;
    int controlBufLen = messageHeader->ControlBufferLen;
    Debug.Assert(iovCount >= 0, $"Expected non-negative iovCount, got {iovCount}");
    Debug.Assert(sockAddrLen >= 0, $"Expected non-negative socket address length, got {sockAddrLen}");
    Debug.Assert(controlBufLen >= 0, $"Expected non-negative control buffer length, got {controlBufLen}");

    nuint hdrSize = (nuint)sizeof(NativeMsghdr);
    nuint iovSize = (nuint)iovCount * (nuint)sizeof(Interop.Sys.IOVector);
    nuint sockAddrSize = (nuint)sockAddrLen;
    nuint controlBufSize = (nuint)controlBufLen;
    nuint totalSize = hdrSize + iovSize + sockAddrSize + controlBufSize;

    byte* storage = (byte*)NativeMemory.AllocZeroed(totalSize);
    slotStorage.NativeMessageStorage = (IntPtr)storage;

    // Partition the contiguous block
    NativeMsghdr* hdr = (NativeMsghdr*)storage;
    Interop.Sys.IOVector* iovDst = (Interop.Sys.IOVector*)(storage + hdrSize);
    byte* sockAddrDst = storage + hdrSize + iovSize;
    byte* controlBufDst = storage + hdrSize + iovSize + sockAddrSize;

    slotStorage.NativeMsgHdrPtr = (IntPtr)hdr;
    slotStorage.NativeIOVectors = iovCount > 0 ? iovDst : null;
    slotStorage.NativeSocketAddress = sockAddrLen > 0 ? sockAddrDst : null;
    slotStorage.NativeControlBuffer = controlBufLen > 0 ? controlBufDst : null;

    // FIX: record the direction on the storage. CopyMessageCompletionOutputs gates all
    // of its copy-back work on MessageIsReceive and FreeMessageStorage resets it, but
    // nothing in this allocation path set it; without this, recvmsg completions would
    // never copy the kernel-written address/control data back (unless a caller set it
    // separately). Setting it here is idempotent if a caller also does.
    slotStorage.MessageIsReceive = isReceive;

    // Deep-copy IOVectors (base/count pairs pointing to caller's pinned buffers)
    if (iovCount > 0 && messageHeader->IOVectors != null)
    {
        nuint iovBytes = (nuint)iovCount * (nuint)sizeof(Interop.Sys.IOVector);
        Buffer.MemoryCopy(messageHeader->IOVectors, iovDst, iovBytes, iovBytes);
    }

    // For sendmsg: deep-copy socket address and control buffer data into native copies.
    // For recvmsg: the kernel will write into these buffers; we copy back at completion.
    if (sockAddrLen > 0 && messageHeader->SocketAddress != null)
    {
        if (!isReceive)
        {
            Buffer.MemoryCopy(messageHeader->SocketAddress, sockAddrDst, sockAddrSize, sockAddrSize);
        }
    }

    if (controlBufLen > 0 && messageHeader->ControlBuffer != null)
    {
        if (!isReceive)
        {
            Buffer.MemoryCopy(messageHeader->ControlBuffer, controlBufDst, controlBufSize, controlBufSize);
        }
    }

    // Build the kernel-consumable msghdr
    hdr->msg_name = sockAddrLen > 0 ? sockAddrDst : null;
    hdr->msg_namelen = (uint)sockAddrLen;
    hdr->msg_iov = iovCount > 0 ? iovDst : null;
    hdr->msg_iovlen = (nuint)iovCount;
    hdr->msg_control = controlBufLen > 0 ? controlBufDst : null;
    hdr->msg_controllen = (nuint)controlBufLen;
    hdr->msg_flags = 0;

    // For recvmsg: save pointers back to the managed MessageHeader's buffers
    // so CopyMessageCompletionOutputs can write back kernel results at completion time.
    if (isReceive)
    {
        slotStorage.ReceiveOutputSocketAddress = messageHeader->SocketAddress;
        slotStorage.ReceiveOutputControlBuffer = messageHeader->ControlBuffer;
        slotStorage.ReceiveSocketAddressCapacity = sockAddrLen;
        slotStorage.ReceiveControlBufferCapacity = controlBufLen;
    }
}
+
/// <summary>
/// Frees the contiguous native block allocated by AllocateMessageStorage and
/// clears every derived pointer/capacity field on the slot storage.
/// </summary>
private unsafe void FreeMessageStorage(int slotIndex)
{
    ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];

    IntPtr storage = slotStorage.NativeMessageStorage;
    if (storage != IntPtr.Zero)
    {
        NativeMemory.Free((void*)storage);
        slotStorage.NativeMessageStorage = IntPtr.Zero;
    }

    // The remaining pointers aliased sub-ranges of the block freed above,
    // so they are cleared unconditionally rather than freed.
    slotStorage.NativeMsgHdrPtr = IntPtr.Zero;
    slotStorage.NativeIOVectors = null;
    slotStorage.NativeSocketAddress = null;
    slotStorage.NativeControlBuffer = null;
    slotStorage.ReceiveOutputSocketAddress = null;
    slotStorage.ReceiveOutputControlBuffer = null;
    slotStorage.ReceiveSocketAddressCapacity = 0;
    slotStorage.ReceiveControlBufferCapacity = 0;
    slotStorage.MessageIsReceive = false;
}
+
/// <summary>
/// After a recvmsg CQE completes, copies the kernel-written socket address and
/// control buffer data from the native msghdr back to the managed MessageHeader's
/// output buffers. For sendmsg completions this is a no-op.
/// Returns the actual socket address length, control buffer length, and msg_flags written by the kernel.
/// </summary>
private unsafe void CopyMessageCompletionOutputs(
    int slotIndex,
    out int socketAddressLen,
    out int controlBufferLen,
    out uint messageFlags)
{
    ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
    socketAddressLen = 0;
    controlBufferLen = 0;
    messageFlags = 0;

    // Send-side completions have nothing to copy back.
    if (!slotStorage.MessageIsReceive)
        return;

    NativeMsghdr* hdr = (NativeMsghdr*)slotStorage.NativeMsgHdrPtr;
    if (hdr == null)
        return;

    // Kernel updates these in-place during recvmsg.
    socketAddressLen = (int)hdr->msg_namelen;
    controlBufferLen = (int)hdr->msg_controllen;
    messageFlags = (uint)hdr->msg_flags;

    // Copy socket address from native buffer back to managed output buffer
    // (clamped to the managed buffer's capacity).
    if (slotStorage.ReceiveOutputSocketAddress != null && slotStorage.NativeSocketAddress != null &&
        slotStorage.ReceiveSocketAddressCapacity > 0 && socketAddressLen > 0)
    {
        int copyLen = Math.Min(slotStorage.ReceiveSocketAddressCapacity, socketAddressLen);
        Buffer.MemoryCopy(slotStorage.NativeSocketAddress, slotStorage.ReceiveOutputSocketAddress, copyLen, copyLen);
    }

    // Copy control buffer from native buffer back to managed output buffer
    // (clamped to the managed buffer's capacity).
    if (slotStorage.ReceiveOutputControlBuffer != null && slotStorage.NativeControlBuffer != null &&
        slotStorage.ReceiveControlBufferCapacity > 0 && controlBufferLen > 0)
    {
        int copyLen = Math.Min(slotStorage.ReceiveControlBufferCapacity, controlBufferLen);
        Buffer.MemoryCopy(slotStorage.NativeControlBuffer, slotStorage.ReceiveOutputControlBuffer, copyLen, copyLen);
    }
}
+
/// <summary>
/// Extracts the completion slot index (the low bits selected by
/// <c>IoUringConstants.SlotIndexMask</c>) from a user_data payload value.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int DecodeCompletionSlotIndex(ulong payload) =>
    (int)(payload & IoUringConstants.SlotIndexMask);
+
/// <summary>
/// Builds a ReservedCompletion-tagged user_data value from a slot index and
/// generation: generation occupies the bits above the slot-index field.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ulong EncodeCompletionSlotUserData(int slotIndex, uint generation)
{
    ulong generationBits = (ulong)(generation & IoUringConstants.GenerationMask) << IoUringConstants.SlotIndexBits;
    ulong indexBits = (ulong)slotIndex & IoUringConstants.SlotIndexMask;
    return EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, generationBits | indexBits);
}
+
/// <summary>
/// Checks whether direct SQE submission is disabled.
/// Defaults to enabled; a DEBUG-only env var can disable it for deterministic tests.
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
private static bool IsIoUringDirectSqeDisabled()
{
#if DEBUG
    // Test-only override for deterministic stress scenarios.
    switch (Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_DIRECT_SQE"))
    {
        case "0":
            return true;
        case "1":
            return false;
    }
#endif

    // Default: direct SQE enabled.
    return false;
}
+
/// <summary>
/// Checks whether io_uring is enabled. The env var ("1"/"0") takes precedence;
/// otherwise the AppContext switch decides; otherwise disabled.
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
private static bool IsIoUringEnabled()
{
    switch (Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING"))
    {
        case "1":
            return true;
        case "0":
            return false;
    }

    return AppContext.TryGetSwitch("System.Net.Sockets.IoUring.Enable", out bool enabled) && enabled;
}
+
/// <summary>
/// Returns whether SEND_ZC should be enabled.
/// Defaults to enabled; a DEBUG-only env var can override either way.
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
private static bool IsZeroCopySendOptedIn()
{
#if DEBUG
    // Test-only override for deterministic coverage.
    switch (Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ZERO_COPY_SEND"))
    {
        case "1":
            return true;
        case "0":
            return false;
    }
#endif

    // Default: zero-copy send enabled.
    return true;
}
+
/// <summary>
/// Returns whether SQPOLL mode has been explicitly requested: the env var must be
/// exactly "1" AND the AppContext switch must be enabled. Any other combination
/// (env unset, "0", or switch off) leaves SQPOLL disabled.
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
private static bool IsSqPollRequested()
{
    string? value = Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_SQPOLL");
    if (!string.Equals(value, "1", StringComparison.Ordinal))
    {
        return false;
    }

    return AppContext.TryGetSwitch("System.Net.Sockets.IoUring.EnableSqPoll", out bool enabled) && enabled;
}
+
/// <summary>
/// Clears any forced test completion result carried by the slot.
/// No-op in release builds.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void ResetDebugTestForcedResult(ref IoUringCompletionSlot slot)
{
#if DEBUG
    slot.HasTestForcedResult = false;
    slot.TestForcedResult = 0;
#endif
}
+
/// <summary>
/// DEBUG only: if the slot carries a one-shot forced test result, overwrites
/// <paramref name="result"/> with it and clears the flag so it fires once.
/// No-op in release builds.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void ResolveDebugTestForcedResult(ref IoUringCompletionSlot slot, ref int result)
{
#if DEBUG
    if (slot.HasTestForcedResult)
    {
        result = slot.TestForcedResult;
        slot.HasTestForcedResult = false;
    }
#endif
}
+
/// <summary>
/// DEBUG only: stamps a forced completion result onto the slot when a test hook
/// mask targets this opcode. In release builds this compiles to a trivial body.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void ApplyDebugTestForcedResult(ref IoUringCompletionSlot slot, byte opcode)
{
#if DEBUG
    // Fast path: no test hooks armed at all.
    if ((_testForceEagainOnceMask | _testForceEcanceledOnceMask) == 0)
    {
        return;
    }

    if (TryConsumeTestForcedResult(opcode, out int forced))
    {
        slot.HasTestForcedResult = true;
        slot.TestForcedResult = forced;
    }
#else
    // Touch an instance field so the method shape stays identical across configs.
    // NOTE(review): presumably silences an unused-parameter/member warning — confirm.
    _ = _ioUringInitialized;
#endif
}
+
/// <summary>
/// DEBUG only: if a forced test result was stamped on the slot but the SQE could
/// not be acquired, re-arms the corresponding test-hook mask bit so the hook can
/// fire on the next attempt. No-op in release builds.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void RestoreDebugTestForcedResultIfNeeded(int slotIndex, byte opcode)
{
#if DEBUG
    Debug.Assert(_completionSlots is not null);
    ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
    if (slot.HasTestForcedResult)
    {
        RestoreTestForcedResult(slot.TestForcedResult, opcode);
    }
#else
    // Keeps the release-build body non-empty/field-referencing; see ApplyDebugTestForcedResult.
    _ = _ioUringInitialized;
#endif
}
+
/// <summary>
/// DEBUG only: arms the one-shot EAGAIN/ECANCELED forced-result masks from
/// environment variables. Mirrors the native pal_io_uring.c test hooks.
/// No-op in release builds.
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
private void InitializeDebugTestHooksFromEnvironment()
{
#if DEBUG
    // Mirrors native pal_io_uring.c test hooks.
    _testForceEagainOnceMask = ParseTestOpcodeMask(
        Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_EAGAIN_ONCE_MASK"));
    _testForceEcanceledOnceMask = ParseTestOpcodeMask(
        Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_ECANCELED_ONCE_MASK"));
#else
    // Keeps the release-build body non-empty/field-referencing; see ApplyDebugTestForcedResult.
    _ = _ioUringInitialized;
#endif
}
+
#if DEBUG
/// <summary>
/// Parses a comma-separated list of opcode names (e.g. "send,recv,accept") into a
/// bitmask of TestOpcodeMask* values; unknown names are ignored.
/// Mirrors GetIoUringTestOpcodeMaskFromOpcodeNameList in pal_io_uring.c.
/// </summary>
private static byte ParseTestOpcodeMask(string? opcodeNameList)
{
    if (string.IsNullOrEmpty(opcodeNameList))
        return IoUringConstants.TestOpcodeMaskNone;

    byte mask = IoUringConstants.TestOpcodeMaskNone;
    foreach (string token in opcodeNameList.Split(',', StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries))
    {
        mask |= token switch
        {
            _ when token.Equals("send", StringComparison.OrdinalIgnoreCase) => IoUringConstants.TestOpcodeMaskSend,
            _ when token.Equals("recv", StringComparison.OrdinalIgnoreCase) => IoUringConstants.TestOpcodeMaskRecv,
            _ when token.Equals("sendmsg", StringComparison.OrdinalIgnoreCase) => IoUringConstants.TestOpcodeMaskSendMsg,
            _ when token.Equals("recvmsg", StringComparison.OrdinalIgnoreCase) => IoUringConstants.TestOpcodeMaskRecvMsg,
            _ when token.Equals("accept", StringComparison.OrdinalIgnoreCase) => IoUringConstants.TestOpcodeMaskAccept,
            _ when token.Equals("connect", StringComparison.OrdinalIgnoreCase) => IoUringConstants.TestOpcodeMaskConnect,
            _ when token.Equals("sendzc", StringComparison.OrdinalIgnoreCase) || token.Equals("send_zc", StringComparison.OrdinalIgnoreCase) => IoUringConstants.TestOpcodeMaskSendZc,
            _ when token.Equals("sendmsgzc", StringComparison.OrdinalIgnoreCase) || token.Equals("sendmsg_zc", StringComparison.OrdinalIgnoreCase) => IoUringConstants.TestOpcodeMaskSendMsgZc,
            _ => IoUringConstants.TestOpcodeMaskNone,
        };
    }

    return mask;
}
+
/// <summary>
/// Maps an io_uring opcode to its corresponding test opcode mask bit; unknown
/// opcodes map to <c>TestOpcodeMaskNone</c>.
/// Mirrors GetIoUringTestOpcodeMaskFromOpcode in pal_io_uring.c.
/// </summary>
private static byte GetTestOpcodeMaskFromOpcode(byte opcode)
{
    return opcode switch
    {
        IoUringOpcodes.Send => IoUringConstants.TestOpcodeMaskSend,
        IoUringOpcodes.Recv => IoUringConstants.TestOpcodeMaskRecv,
        IoUringOpcodes.SendMsg => IoUringConstants.TestOpcodeMaskSendMsg,
        IoUringOpcodes.RecvMsg => IoUringConstants.TestOpcodeMaskRecvMsg,
        IoUringOpcodes.Accept => IoUringConstants.TestOpcodeMaskAccept,
        IoUringOpcodes.Connect => IoUringConstants.TestOpcodeMaskConnect,
        IoUringOpcodes.SendZc => IoUringConstants.TestOpcodeMaskSendZc,
        IoUringOpcodes.SendMsgZc => IoUringConstants.TestOpcodeMaskSendMsgZc,
        _ => IoUringConstants.TestOpcodeMaskNone,
    };
}
+
/// <summary>
/// Tries to consume a one-shot forced test result for the given opcode.
/// EAGAIN takes priority over ECANCELED when both are armed; the consumed
/// mask bit is cleared so the hook fires at most once per arming.
/// Mirrors TryConsumeIoUringForcedCompletionResultLocked in pal_io_uring.c.
/// </summary>
private bool TryConsumeTestForcedResult(byte opcode, out int forcedResult)
{
    forcedResult = 0;
    byte opcodeMask = GetTestOpcodeMaskFromOpcode(opcode);
    if (opcodeMask == IoUringConstants.TestOpcodeMaskNone)
        return false;

    // EAGAIN is checked first — deliberate priority over ECANCELED.
    if ((_testForceEagainOnceMask & opcodeMask) != 0)
    {
        _testForceEagainOnceMask &= (byte)~opcodeMask;
        // Negative errno, matching the kernel's CQE res convention.
        forcedResult = -Interop.Sys.ConvertErrorPalToPlatform(Interop.Error.EAGAIN);
        return true;
    }

    if ((_testForceEcanceledOnceMask & opcodeMask) != 0)
    {
        _testForceEcanceledOnceMask &= (byte)~opcodeMask;
        forcedResult = -ErrnoECANCELED;
        return true;
    }

    return false;
}
+
/// <summary>
/// Restores a previously consumed forced test result mask bit.
/// Called when SQE acquisition fails after the forced result was consumed,
/// so the test hook can fire on the next attempt.
/// Mirrors RestoreIoUringForcedCompletionResultLocked in pal_io_uring.c.
/// </summary>
private void RestoreTestForcedResult(int forcedResult, byte opcode)
{
    byte opcodeMask = GetTestOpcodeMaskFromOpcode(opcode);
    if (opcodeMask == IoUringConstants.TestOpcodeMaskNone)
        return;

    // Re-arm whichever mask produced the consumed value (negative errno form).
    if (forcedResult == -Interop.Sys.ConvertErrorPalToPlatform(Interop.Error.EAGAIN))
        _testForceEagainOnceMask |= opcodeMask;
    else if (forcedResult == -ErrnoECANCELED)
        _testForceEcanceledOnceMask |= opcodeMask;
}
#endif
+
/// <summary>
/// Probes the kernel for supported io_uring opcodes using IORING_REGISTER_PROBE and
/// populates the per-opcode _supportsOp* capability flags.
/// When the probe syscall is unavailable (older kernels), all flags remain at their
/// default value (<see langword="false"/>).
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
private unsafe void ProbeIoUringOpcodeSupport(int ringFd)
{
    // Probe buffer: 16-byte header + 256 * 8-byte ops = 2064 bytes
    // (struct io_uring_probe followed by io_uring_probe_op entries).
    const int maxOps = 256;
    const int probeSize = 16 + maxOps * 8;
    byte* probeBuffer = stackalloc byte[probeSize];
    // FIX: restore the stripped generic type argument — `new Span(...)` does not
    // compile. The kernel requires the probe buffer to be zeroed before the call.
    new Span<byte>(probeBuffer, probeSize).Clear();

    int result;
    Interop.Error err = Interop.Sys.IoUringShimRegister(
        ringFd, IoUringConstants.RegisterProbe, probeBuffer, (uint)maxOps, &result);

    if (err != Interop.Error.SUCCESS)
    {
        // Probe not supported (for example older kernels): per-opcode flags remain false.
        // Direct SQE prep does not gate on these flags; this mainly affects optional feature light-up.
        return;
    }

    // Parse: ops start at offset 16, each is 8 bytes.
    IoUringProbeOp* ops = (IoUringProbeOp*)(probeBuffer + 16);
    IoUringProbeHeader* header = (IoUringProbeHeader*)probeBuffer;
    int opsCount = Math.Min((int)header->OpsLen, maxOps);

    _supportsOpReadFixed = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.ReadFixed);
    _supportsOpSend = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Send);
    _supportsOpRecv = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Recv);
    _supportsOpSendMsg = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.SendMsg);
    _supportsOpRecvMsg = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.RecvMsg);
    _supportsOpAccept = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Accept);
    _supportsOpConnect = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Connect);
    _supportsOpSendZc = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.SendZc);
    _supportsOpSendMsgZc = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.SendMsgZc);
    // Zero-copy send additionally requires the opt-in check.
    _zeroCopySendEnabled = _supportsOpSendZc && IsZeroCopySendOptedIn();
    _supportsOpAsyncCancel = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.AsyncCancel);
    _supportsOpPollAdd = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.PollAdd);
    _supportsMultishotAccept = _supportsOpAccept;
    RefreshIoUringMultishotRecvSupport();
}
+
/// <summary>
/// Returns whether the probe result reports the given opcode as supported:
/// the opcode must be within the reported ops range and have the supported flag set.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe bool IsOpcodeSupported(IoUringProbeOp* ops, int opsCount, byte opcode) =>
    opcode < opsCount && (ops[opcode].Flags & IoUringConstants.ProbeOpFlagSupported) != 0;
+
/// <summary>
/// Converts <see cref="SocketFlags"/> to the kernel msg_flags representation for
/// io_uring. Only OutOfBand, Peek, and DontRoute are supported; any other bit
/// makes the conversion fail with <c>rwFlags</c> = 0.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool TryConvertIoUringPrepareSocketFlags(SocketFlags flags, out uint rwFlags)
{
    const SocketFlags SupportedIoUringFlags =
        SocketFlags.OutOfBand |
        SocketFlags.Peek |
        SocketFlags.DontRoute;

    // These three flags share their numeric values with the kernel MSG_* constants,
    // so a supported value passes through unchanged.
    bool supported = (flags & ~SupportedIoUringFlags) == 0;
    rwFlags = supported ? (uint)(int)flags : 0u;
    return supported;
}
+
/// <summary>Populates an SQE as an IORING_OP_SEND request over the given buffer.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void WriteSendSqe(
    IoUringSqe* sqe,
    int sqeFd,
    byte sqeFlags,
    ulong userData,
    byte* buffer,
    uint length,
    uint rwFlags)
{
    sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData);
    sqe->Opcode = IoUringOpcodes.Send;
    sqe->Flags = sqeFlags;
    sqe->Fd = sqeFd;
    sqe->Addr = (ulong)(nuint)buffer;
    sqe->Len = length;
    sqe->RwFlags = rwFlags;
}
+
/// <summary>Populates an SQE as an IORING_OP_SEND_ZC (zero-copy send) request.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void WriteSendZcSqe(
    IoUringSqe* sqe,
    int sqeFd,
    byte sqeFlags,
    ulong userData,
    byte* buffer,
    uint length,
    uint rwFlags)
{
    sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData);
    sqe->Opcode = IoUringOpcodes.SendZc;
    sqe->Flags = sqeFlags;
    sqe->Fd = sqeFd;
    sqe->Addr = (ulong)(nuint)buffer;
    sqe->Len = length;
    sqe->RwFlags = rwFlags;
}
+
/// <summary>Populates an SQE as an IORING_OP_RECV request into the given buffer.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void WriteRecvSqe(
    IoUringSqe* sqe,
    int sqeFd,
    byte sqeFlags,
    ulong userData,
    byte* buffer,
    uint length,
    uint rwFlags)
{
    sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData);
    sqe->Opcode = IoUringOpcodes.Recv;
    sqe->Flags = sqeFlags;
    sqe->Fd = sqeFd;
    sqe->Ioprio = 0;
    sqe->Addr = (ulong)(nuint)buffer;
    sqe->Len = length;
    sqe->RwFlags = rwFlags;
    sqe->BufIndex = 0;
}
+
/// <summary>
/// Populates an SQE as an IORING_OP_READ_FIXED request against a registered buffer
/// identified by <paramref name="bufferIndex"/>.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void WriteReadFixedSqe(
    IoUringSqe* sqe,
    int sqeFd,
    byte sqeFlags,
    ulong userData,
    byte* buffer,
    uint length,
    ushort bufferIndex)
{
    sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData);
    sqe->Opcode = IoUringOpcodes.ReadFixed;
    sqe->Flags = sqeFlags;
    sqe->Fd = sqeFd;
    sqe->Ioprio = 0;
    sqe->Addr = (ulong)(nuint)buffer;
    sqe->Len = length;
    // For non-seekable sockets, offset is ignored; -1 matches "current position" semantics.
    sqe->Off = ulong.MaxValue;
    sqe->RwFlags = 0;
    sqe->BufIndex = bufferIndex;
}
+
/// <summary>
/// Writes a one-shot recv SQE using provided-buffer selection (IOSQE_BUFFER_SELECT).
/// The kernel chooses a buffer from the specified buffer group; Addr is 0 because
/// no caller-supplied buffer is used.
/// </summary>
// FIX (consistency): added the `unsafe` modifier — this method dereferences
// IoUringSqe*, and every sibling Write*Sqe that declares one does so explicitly.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void WriteProvidedBufferRecvSqe(
    IoUringSqe* sqe,
    int sqeFd,
    byte sqeFlags,
    ulong userData,
    uint requestedLength,
    uint rwFlags,
    ushort bufferGroupId)
{
    sqe->Opcode = IoUringOpcodes.Recv;
    sqe->Fd = sqeFd;
    sqe->Flags = (byte)(sqeFlags | IoUringConstants.SqeBufferSelect);
    sqe->Ioprio = 0;
    sqe->Addr = 0;
    sqe->Len = requestedLength;
    sqe->RwFlags = rwFlags;
    // With BUFFER_SELECT, BufIndex carries the buffer *group* id.
    sqe->BufIndex = bufferGroupId;
    sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData);
}
+
/// <summary>
/// Writes a multishot recv SQE: the kernel selects buffers from a provided buffer
/// ring (IOSQE_BUFFER_SELECT) and posts repeated CQEs until the request terminates.
/// </summary>
// FIX (consistency): added the `unsafe` modifier — this method dereferences
// IoUringSqe*, and every sibling Write*Sqe that declares one does so explicitly.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void WriteMultishotRecvSqe(
    IoUringSqe* sqe,
    int sqeFd,
    byte sqeFlags,
    ulong userData,
    ushort bufferGroupId)
{
    sqe->Opcode = IoUringOpcodes.Recv;
    sqe->Fd = sqeFd;
    sqe->Flags = (byte)(sqeFlags | IoUringConstants.SqeBufferSelect);
    // Multishot mode is requested via the ioprio field.
    sqe->Ioprio = IoUringConstants.RecvMultishot;
    sqe->Addr = 0;
    sqe->Len = 0;
    sqe->RwFlags = 0;
    // With BUFFER_SELECT, BufIndex carries the buffer *group* id.
    sqe->BufIndex = bufferGroupId;
    sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData);
}
+
/// <summary>
/// Populates an SQE as an IORING_OP_ACCEPT request. The address-length cell is
/// passed via Off because the kernel accept prep aliases addr2 at sqe->off.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void WriteAcceptSqe(
    IoUringSqe* sqe,
    int sqeFd,
    byte sqeFlags,
    ulong userData,
    byte* socketAddress,
    IntPtr socketAddressLengthPtr)
{
    sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData);
    sqe->Opcode = IoUringOpcodes.Accept;
    sqe->Flags = sqeFlags;
    sqe->Fd = sqeFd;
    sqe->Addr = (ulong)(nuint)socketAddress;
    sqe->Off = (ulong)(nuint)socketAddressLengthPtr;
}
+
/// <summary>
/// Populates an SQE as a multishot IORING_OP_ACCEPT request (multishot mode is
/// requested via the ioprio field; addrlen pointer rides in Off, aliasing addr2).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static unsafe void WriteMultishotAcceptSqe(
    IoUringSqe* sqe,
    int sqeFd,
    byte sqeFlags,
    ulong userData,
    byte* socketAddress,
    IntPtr socketAddressLengthPtr)
{
    sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData);
    sqe->Opcode = IoUringOpcodes.Accept;
    sqe->Flags = sqeFlags;
    sqe->Fd = sqeFd;
    sqe->Ioprio = IoUringConstants.AcceptMultishot;
    sqe->Addr = (ulong)(nuint)socketAddress;
    sqe->Off = (ulong)(nuint)socketAddressLengthPtr;
}
+
/// <summary>
/// Populates an SQE as an IORING_OP_SENDMSG request; Addr points at a native
/// struct msghdr and Len is 1 (one msghdr), matching sendmsg semantics.
/// </summary>
// FIX (consistency): added the `unsafe` modifier — this method dereferences
// IoUringSqe*, and every sibling Write*Sqe that declares one does so explicitly.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void WriteSendMsgSqe(
    IoUringSqe* sqe,
    int sqeFd,
    byte sqeFlags,
    ulong userData,
    IntPtr messageHeader,
    uint rwFlags)
{
    sqe->Opcode = IoUringOpcodes.SendMsg;
    sqe->Fd = sqeFd;
    sqe->Flags = sqeFlags;
    sqe->Addr = (ulong)(nuint)messageHeader;
    sqe->Len = 1;
    sqe->RwFlags = rwFlags;
    sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData);
}
+
/// <summary>
/// Populates an SQE as an IORING_OP_SENDMSG_ZC (zero-copy sendmsg) request; Addr
/// points at a native struct msghdr and Len is 1 (one msghdr).
/// </summary>
// FIX (consistency): added the `unsafe` modifier — this method dereferences
// IoUringSqe*, and every sibling Write*Sqe that declares one does so explicitly.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void WriteSendMsgZcSqe(
    IoUringSqe* sqe,
    int sqeFd,
    byte sqeFlags,
    ulong userData,
    IntPtr messageHeader,
    uint rwFlags)
{
    sqe->Opcode = IoUringOpcodes.SendMsgZc;
    sqe->Fd = sqeFd;
    sqe->Flags = sqeFlags;
    sqe->Addr = (ulong)(nuint)messageHeader;
    sqe->Len = 1;
    sqe->RwFlags = rwFlags;
    sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData);
}
+
/// <summary>
/// Populates an SQE as an IORING_OP_RECVMSG request; Addr points at a native
/// struct msghdr the kernel will fill in, and Len is 1 (one msghdr).
/// </summary>
// FIX (consistency): added the `unsafe` modifier — this method dereferences
// IoUringSqe*, and every sibling Write*Sqe that declares one does so explicitly.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void WriteRecvMsgSqe(
    IoUringSqe* sqe,
    int sqeFd,
    byte sqeFlags,
    ulong userData,
    IntPtr messageHeader,
    uint rwFlags)
{
    sqe->Opcode = IoUringOpcodes.RecvMsg;
    sqe->Fd = sqeFd;
    sqe->Flags = sqeFlags;
    sqe->Addr = (ulong)(nuint)messageHeader;
    sqe->Len = 1;
    sqe->RwFlags = rwFlags;
    sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData);
}
+
/// <summary>
/// Populates an SQE as an IORING_OP_CONNECT request. The kernel connect prep
/// aliases addrlen at sqe->off and requires len=0.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void WriteConnectSqe(
    IoUringSqe* sqe,
    int sqeFd,
    byte sqeFlags,
    ulong userData,
    byte* socketAddress,
    int socketAddressLen)
{
    sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData);
    sqe->Opcode = IoUringOpcodes.Connect;
    sqe->Flags = sqeFlags;
    sqe->Fd = sqeFd;
    sqe->Addr = (ulong)(nuint)socketAddress;
    sqe->Off = (uint)socketAddressLen;
    sqe->Len = 0;
}
+
/// <summary>
/// Populates an SQE as an IORING_OP_ASYNC_CANCEL request targeting the SQE whose
/// user_data matches the given payload. The cancel SQE itself carries user_data 0,
/// so its own CQE is distinguishable from tagged completions.
/// </summary>
// FIX (consistency): added the `unsafe` modifier — this method dereferences
// IoUringSqe*, and every sibling Write*Sqe that declares one does so explicitly.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void WriteAsyncCancelSqe(IoUringSqe* sqe, ulong userData)
{
    sqe->Opcode = IoUringOpcodes.AsyncCancel;
    sqe->Fd = -1;
    // For ASYNC_CANCEL, Addr holds the target's user_data value.
    sqe->Addr = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, userData);
    sqe->UserData = 0;
}
+
/// <summary>
/// Publishes the managed SQ tail pointer to make queued SQEs visible to the kernel.
/// No-op when no tail value is cached or the ring tail pointer is unavailable.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe void PublishManagedSqeTail()
{
    if (!_ioUringManagedSqTailLoaded || _ioUringSqRingInfo.SqTailPtr == IntPtr.Zero)
    {
        return;
    }

    Debug.Assert(IsCurrentThreadEventLoopThread(),
        "PublishManagedSqeTail must only be called from the event loop thread (SINGLE_ISSUER contract).");
    // FIX: restore the stripped generic type argument — Unsafe.AsRef over a void*
    // cannot infer T and does not compile without <uint>.
    ref uint sqTailRef = ref Unsafe.AsRef<uint>((void*)_ioUringSqRingInfo.SqTailPtr);
    // Release-publish the new tail so the kernel observes fully-written SQEs.
    Volatile.Write(ref sqTailRef, _ioUringManagedSqTail);
    _ioUringManagedSqTailLoaded = false;
}
+
/// <summary>
/// Returns true when the SQPOLL kernel thread has gone idle and needs an explicit
/// wakeup. Conservatively reports true when the SQ flags pointer is unavailable.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe bool SqNeedWakeup()
{
    Debug.Assert(_sqPollEnabled, "SqNeedWakeup should only be checked in SQPOLL mode.");
    var flagsPtr = _managedSqFlagsPtr;
    return flagsPtr == null ||
        (Volatile.Read(ref *flagsPtr) & IoUringConstants.SqNeedWakeup) != 0;
}
+
/// <summary>
/// Allocates the next available SQE slot from the submission ring. Returns false
/// when direct SQE mode is off, the ring is not mapped, or the ring is full.
/// On success the returned SQE is zero-initialized and the cached tail advanced.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe bool TryGetNextManagedSqe(out IoUringSqe* sqe)
{
    sqe = null;
    if (!_ioUringDirectSqeEnabled)
    {
        return false;
    }

    Debug.Assert(IsCurrentThreadEventLoopThread(),
        "TryGetNextManagedSqe must only be called from the event loop thread (SINGLE_ISSUER contract).");
    ref Interop.Sys.IoUringSqRingInfo ringInfo = ref _ioUringSqRingInfo;
    if (ringInfo.SqeBase == IntPtr.Zero ||
        ringInfo.SqHeadPtr == IntPtr.Zero ||
        ringInfo.SqTailPtr == IntPtr.Zero ||
        ringInfo.SqEntries == 0)
    {
        return false;
    }

    // FIX: restore the stripped generic type arguments — Unsafe.AsRef over a void*
    // cannot infer T and does not compile without <uint> (two call sites below).
    ref uint sqHeadRef = ref Unsafe.AsRef<uint>((void*)ringInfo.SqHeadPtr);
    uint sqHead = Volatile.Read(ref sqHeadRef);
    // Lazily snapshot the kernel-visible tail once per batch; subsequent SQEs in the
    // batch advance the cached copy and PublishManagedSqeTail writes it back.
    if (!_ioUringManagedSqTailLoaded)
    {
        ref uint sqTailRef = ref Unsafe.AsRef<uint>((void*)ringInfo.SqTailPtr);
        _ioUringManagedSqTail = Volatile.Read(ref sqTailRef);
        _ioUringManagedSqTailLoaded = true;
    }

    uint sqTail = _ioUringManagedSqTail;
    if (sqTail - sqHead >= ringInfo.SqEntries)
    {
        // Ring full (unsigned wrap-safe distance check).
        return false;
    }

    uint index = sqTail & ringInfo.SqMask;
    nint sqeOffset = checked((nint)((nuint)index * ringInfo.SqeSize));
    sqe = (IoUringSqe*)((byte*)ringInfo.SqeBase + sqeOffset);
    // Hand out a clean SQE so writers only need to set the fields they use.
    Unsafe.InitBlockUnaligned(sqe, 0, ringInfo.SqeSize);
    _ioUringManagedSqTail = sqTail + 1;
    _ioUringManagedPendingSubmissions++;
    return true;
}
+
/// <summary>
/// Attempts to acquire an SQE, submitting pending operations to drain the ring and
/// retrying when it is full. Fails with the submit error, or EAGAIN when the ring
/// stays full after the maximum number of attempts.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe bool TryAcquireManagedSqeWithRetry(out IoUringSqe* sqe, out Interop.Error submitError)
{
    submitError = Interop.Error.SUCCESS;
    int remainingAttempts = MaxIoUringSqeAcquireSubmitAttempts;
    while (remainingAttempts-- > 0)
    {
        if (TryGetNextManagedSqe(out sqe))
        {
            return true;
        }

        // Ring full: flush what is queued, then retry.
        submitError = SubmitIoUringOperationsNormalized();
        if (submitError != Interop.Error.SUCCESS)
        {
            return false;
        }
    }

    sqe = null;
    submitError = Interop.Error.EAGAIN;
    return false;
}
+
+ /// <summary>
+ /// Common setup for direct SQE preparation: allocates a completion slot, encodes user data,
+ /// resolves the socket fd/flags, applies test hooks, and acquires an SQE. On failure,
+ /// restores test state and frees the slot.
+ /// </summary>
+ /// <returns>
+ /// Prepared if the SQE was acquired (caller must write the SQE and return Prepared),
+ /// or a terminal result (Unsupported/PrepareFailed) that the caller should return directly.
+ /// </returns>
+ private unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TrySetupDirectSqe(
+     SafeSocketHandle socket,
+     byte opcode,
+     out int slotIndex,
+     out ulong allocatedUserData,
+     out int sqeFd,
+     out byte sqeFlags,
+     out IoUringSqe* sqe,
+     out SocketError errorCode)
+ {
+     // Default every out parameter so all early returns leave benign values behind.
+     slotIndex = -1;
+     allocatedUserData = 0;
+     sqeFd = 0;
+     sqeFlags = 0;
+     sqe = null;
+     errorCode = SocketError.Success;
+
+     if (!_ioUringDirectSqeEnabled)
+     {
+         return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+     }
+
+     slotIndex = AllocateCompletionSlot();
+     if (slotIndex < 0)
+     {
+         RecordIoUringCompletionSlotExhaustion();
+
+         // All slots are in flight. Drain already-completed CQEs once to recycle slots;
+         // the flag guards against re-entering the drain from within the drain itself.
+         if (!_completionSlotDrainInProgress)
+         {
+             _completionSlotDrainInProgress = true;
+             try
+             {
+                 SocketEventHandler handler = new SocketEventHandler(this);
+                 if (DrainCqeRingBatch(handler))
+                 {
+                     slotIndex = AllocateCompletionSlot();
+                 }
+             }
+             finally
+             {
+                 _completionSlotDrainInProgress = false;
+             }
+         }
+
+         if (slotIndex < 0)
+         {
+             // Still exhausted: report Unsupported so the caller uses the non-direct fallback.
+             return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+         }
+
+         RecordIoUringCompletionSlotDrainRecovery();
+     }
+
+     ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
+     // user_data carries the slot index plus its generation so stale CQEs can be rejected.
+     allocatedUserData = EncodeCompletionSlotUserData(slotIndex, slot.Generation);
+     int socketFd = (int)(nint)socket.DangerousGetHandle();
+     ConfigureSocketSqeFdAndFlags(socketFd, out sqeFd, out sqeFlags);
+     ApplyDebugTestForcedResult(ref slot, opcode);
+
+     if (!TryAcquireManagedSqeWithRetry(out sqe, out Interop.Error submitError))
+     {
+         // Undo the test hook and return the slot before reporting failure.
+         RestoreDebugTestForcedResultIfNeeded(slotIndex, opcode);
+         FreeCompletionSlot(slotIndex);
+         slotIndex = -1;
+
+         // Transient ring-full conditions degrade to the fallback path
+         // rather than failing the operation outright.
+         if (submitError == Interop.Error.SUCCESS ||
+             submitError == Interop.Error.EAGAIN ||
+             submitError == Interop.Error.EWOULDBLOCK)
+         {
+             return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+         }
+
+         errorCode = SocketPal.GetSocketErrorForErrorCode(submitError);
+         return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.PrepareFailed;
+     }
+
+     return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ }
+
+ /// <summary>Prepares a send SQE via the managed direct path.</summary>
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSend(
+     SafeSocketHandle socket,
+     byte* buffer,
+     int bufferLen,
+     SocketFlags flags,
+     out ulong userData,
+     out SocketError errorCode)
+ {
+     userData = 0;
+     errorCode = SocketError.Success;
+
+     // Socket flags without an io_uring equivalent force the fallback path.
+     if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
+     {
+         return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+     }
+
+     SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult setupResult = TrySetupDirectSqe(
+         socket,
+         IoUringOpcodes.Send,
+         out _,
+         out ulong allocatedUserData,
+         out int sqeFd,
+         out byte sqeFlags,
+         out IoUringSqe* sqe,
+         out SocketError setupErrorCode);
+     errorCode = setupErrorCode;
+     if (setupResult == SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+     {
+         WriteSendSqe(sqe, sqeFd, sqeFlags, allocatedUserData, buffer, (uint)bufferLen, rwFlags);
+         userData = allocatedUserData;
+     }
+
+     return setupResult;
+ }
+
+ /// <summary>
+ /// Prepares a send SQE, preferring SEND_ZC when eligible and falling back to SEND when unavailable.
+ /// </summary>
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendWithZeroCopyFallback(
+     SafeSocketHandle socket,
+     byte* buffer,
+     int bufferLen,
+     SocketFlags flags,
+     out bool usedZeroCopy,
+     out ulong userData,
+     out SocketError errorCode)
+ {
+     usedZeroCopy = false;
+
+     if (ShouldTryIoUringDirectSendZeroCopy(bufferLen))
+     {
+         var zcOutcome = TryPrepareIoUringDirectSendZc(socket, buffer, bufferLen, flags, out userData, out errorCode);
+         switch (zcOutcome)
+         {
+             case SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared:
+                 usedZeroCopy = true;
+                 return zcOutcome;
+             case SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported:
+                 // SEND_ZC unavailable right now; fall through to the plain SEND path.
+                 break;
+             default:
+                 // PrepareFailed (or any other terminal result) propagates as-is.
+                 return zcOutcome;
+         }
+     }
+
+     return TryPrepareIoUringDirectSend(
+         socket,
+         buffer,
+         bufferLen,
+         flags,
+         out userData,
+         out errorCode);
+ }
+
+ /// <summary>Prepares a zero-copy send SQE via the managed direct path.</summary>
+ /// <remarks>
+ /// Returns Unsupported when the payload is not ZC-eligible or the socket flags cannot be
+ /// converted, so the caller can fall back to a plain SEND.
+ /// </remarks>
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendZc(
+     SafeSocketHandle socket,
+     byte* buffer,
+     int bufferLen,
+     SocketFlags flags,
+     out ulong userData,
+     out SocketError errorCode)
+ {
+     userData = 0;
+     errorCode = SocketError.Success;
+
+     if (!ShouldTryIoUringDirectSendZeroCopy(bufferLen))
+     {
+         return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+     }
+
+     if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
+     {
+         return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+     }
+
+     var result = TrySetupDirectSqe(
+         socket,
+         IoUringOpcodes.SendZc,
+         out int slotIndex,
+         out ulong allocatedUserData,
+         out int sqeFd,
+         out byte sqeFlags,
+         out IoUringSqe* sqe,
+         out errorCode);
+     if (result != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+     {
+         return result;
+     }
+
+     ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
+     // SEND_ZC completes in two steps: the first CQE is not the final managed completion;
+     // the operation finishes only after the NOTIF CQE confirms the kernel/NIC no longer
+     // references the payload buffer.
+     slot.IsZeroCopySend = true;
+     slot.ZeroCopyNotificationPending = false;
+
+     WriteSendZcSqe(sqe, sqeFd, sqeFlags, allocatedUserData, buffer, (uint)bufferLen, rwFlags);
+     userData = allocatedUserData;
+     return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ }
+
+ /// <summary>Prepares a recv SQE via the managed direct path.</summary>
+ /// <remarks>
+ /// Receive strategy preference order: fixed-buffer recv, multishot recv with a
+ /// provided-buffer group, single-shot recv with a provided-buffer group, then a
+ /// plain recv into the caller's buffer.
+ /// </remarks>
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectRecv(
+     SafeSocketHandle socket,
+     byte* buffer,
+     int bufferLen,
+     SocketFlags flags,
+     bool allowMultishotRecv,
+     out ulong userData,
+     out SocketError errorCode)
+ {
+     userData = 0;
+     errorCode = SocketError.Success;
+
+     // Socket flags without an io_uring equivalent force the fallback path.
+     if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
+     {
+         return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+     }
+
+     var result = TrySetupDirectSqe(socket, IoUringOpcodes.Recv, out int slotIndex, out ulong allocatedUserData, out int sqeFd, out byte sqeFlags, out IoUringSqe* sqe, out errorCode);
+     if (result != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+     {
+         return result;
+     }
+
+     // Preferred: registered fixed buffer (helper falls back under transient buffer pressure).
+     if (ShouldTryIoUringDirectFixedRecv(flags, allowMultishotRecv, bufferLen) &&
+         TryPrepareIoUringDirectRecvFixed(slotIndex, sqe, sqeFd, sqeFlags, allocatedUserData, bufferLen))
+     {
+         SocketsTelemetry.Log.IoUringFixedRecvSelected();
+         userData = allocatedUserData;
+         return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+     }
+
+     if (allowMultishotRecv &&
+         bufferLen > 0 &&
+         TryGetIoUringMultishotRecvBufferGroupId(out ushort multishotBufferGroupId))
+     {
+         WriteMultishotRecvSqe(sqe, sqeFd, sqeFlags, allocatedUserData, multishotBufferGroupId);
+     }
+     else if (bufferLen > 0 &&
+         TryGetIoUringProvidedBufferGroupId(out ushort providedBufferGroupId))
+     {
+         WriteProvidedBufferRecvSqe(
+             sqe,
+             sqeFd,
+             sqeFlags,
+             allocatedUserData,
+             (uint)bufferLen,
+             rwFlags,
+             providedBufferGroupId);
+     }
+     else
+     {
+         // Zero-length reads and no-buffer-group configurations use the caller's buffer directly.
+         WriteRecvSqe(sqe, sqeFd, sqeFlags, allocatedUserData, buffer, (uint)bufferLen, rwFlags);
+     }
+     userData = allocatedUserData;
+     return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ }
+
+ /// <summary>
+ /// Decides whether a READ_FIXED-based receive is eligible: the opcode must have probed
+ /// successfully, buffers must be registered, the caller must want a single-shot recv with
+ /// a non-empty buffer, and no socket flags may be requested (READ_FIXED has no
+ /// recvmsg/socket-flag semantics).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool ShouldTryIoUringDirectFixedRecv(SocketFlags flags, bool allowMultishotRecv, int bufferLen)
+ {
+     return _supportsOpReadFixed &&
+         _ioUringBuffersRegistered &&
+         !allowMultishotRecv &&
+         bufferLen > 0 &&
+         flags == SocketFlags.None;
+ }
+
+ /// <summary>
+ /// Writes a READ_FIXED SQE backed by a buffer from the provided-buffer ring. Returns false
+ /// (after recording telemetry) when no ring or no free buffer is available, so the caller
+ /// can fall back to a normal receive.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe bool TryPrepareIoUringDirectRecvFixed(
+     int slotIndex,
+     IoUringSqe* sqe,
+     int sqeFd,
+     byte sqeFlags,
+     ulong userData,
+     int requestedLength)
+ {
+     IoUringProvidedBufferRing? providedBufferRing = _ioUringProvidedBufferRing;
+     if (providedBufferRing is null)
+     {
+         SocketsTelemetry.Log.IoUringFixedRecvFallback();
+         return false;
+     }
+
+     if (!providedBufferRing.TryAcquireBufferForPreparedReceive(
+         out ushort bufferId,
+         out byte* fixedBuffer,
+         out int fixedBufferLength))
+     {
+         // Under transient provided-buffer pressure, fall back to normal receive preparation.
+         SocketsTelemetry.Log.IoUringFixedRecvFallback();
+         return false;
+     }
+
+     // Record the buffer on the slot so completion handling can locate/return it.
+     // NOTE(review): the release path for FixedRecvBufferId is not in view - confirm it
+     // runs on every completion, including error CQEs.
+     Debug.Assert(_completionSlots is not null);
+     ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
+     slot.UsesFixedRecvBuffer = true;
+     slot.FixedRecvBufferId = bufferId;
+
+     // Clamp to the fixed buffer's capacity; a short read relative to the request is fine.
+     int receiveLength = Math.Min(requestedLength, fixedBufferLength);
+     WriteReadFixedSqe(
+         sqe,
+         sqeFd,
+         sqeFlags,
+         userData,
+         fixedBuffer,
+         (uint)receiveLength,
+         bufferId);
+     return true;
+ }
+
+ /// <summary>Prepares an accept SQE via the managed direct path.</summary>
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectAccept(
+     SafeSocketHandle socket,
+     byte* socketAddress,
+     int socketAddressLen,
+     out ulong userData,
+     out SocketError errorCode)
+ {
+     userData = 0;
+     var result = TrySetupDirectSqe(socket, IoUringOpcodes.Accept, out int slotIndex, out ulong allocatedUserData, out int sqeFd, out byte sqeFlags, out IoUringSqe* sqe, out errorCode);
+     if (result != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+     {
+         return result;
+     }
+
+     ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
+     ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
+     slot.Kind = IoUringCompletionOperationKind.Accept;
+     // accept(2) needs a kernel-writable socklen_t; allocate it in native memory so it stays
+     // valid for the lifetime of the in-flight SQE, seeded with the caller's capacity.
+     // NOTE(review): assumes slot teardown frees this allocation - free site not in view.
+     slotStorage.NativeSocketAddressLengthPtr = (int*)NativeMemory.Alloc((nuint)sizeof(int));
+     *slotStorage.NativeSocketAddressLengthPtr = socketAddressLen;
+
+     WriteAcceptSqe(sqe, sqeFd, sqeFlags, allocatedUserData, socketAddress, (IntPtr)slotStorage.NativeSocketAddressLengthPtr);
+     userData = allocatedUserData;
+     return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ }
+
+ /// <summary>Prepares a multishot accept SQE via the managed direct path.</summary>
+ /// <remarks>Returns Unsupported when multishot accept was not negotiated at setup.</remarks>
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectMultishotAccept(
+     SafeSocketHandle socket,
+     byte* socketAddress,
+     int socketAddressLen,
+     out ulong userData,
+     out SocketError errorCode)
+ {
+     userData = 0;
+     errorCode = SocketError.Success;
+     if (!_supportsMultishotAccept)
+     {
+         return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+     }
+
+     var result = TrySetupDirectSqe(
+         socket,
+         IoUringOpcodes.Accept,
+         out int slotIndex,
+         out ulong allocatedUserData,
+         out int sqeFd,
+         out byte sqeFlags,
+         out IoUringSqe* sqe,
+         out errorCode);
+     if (result != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+     {
+         return result;
+     }
+
+     ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
+     ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
+     slot.Kind = IoUringCompletionOperationKind.Accept;
+     // Kernel-writable socklen_t for the sockaddr, valid for the (long-lived) multishot SQE.
+     // NOTE(review): assumes slot teardown frees this allocation - free site not in view.
+     slotStorage.NativeSocketAddressLengthPtr = (int*)NativeMemory.Alloc((nuint)sizeof(int));
+     *slotStorage.NativeSocketAddressLengthPtr = socketAddressLen;
+     // Preserve the original sockaddr capacity for future multishot accept re-arm/reset handling.
+     slotStorage.ReceiveSocketAddressCapacity = socketAddressLen;
+
+     WriteMultishotAcceptSqe(
+         sqe,
+         sqeFd,
+         sqeFlags,
+         allocatedUserData,
+         socketAddress,
+         (IntPtr)slotStorage.NativeSocketAddressLengthPtr);
+     userData = allocatedUserData;
+     return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ }
+
+ /// <summary>Prepares a connect SQE via the managed direct path.</summary>
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectConnect(
+     SafeSocketHandle socket,
+     byte* socketAddress,
+     int socketAddressLen,
+     out ulong userData,
+     out SocketError errorCode)
+ {
+     userData = 0;
+
+     SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult setupResult = TrySetupDirectSqe(
+         socket,
+         IoUringOpcodes.Connect,
+         out _,
+         out ulong allocatedUserData,
+         out int sqeFd,
+         out byte sqeFlags,
+         out IoUringSqe* sqe,
+         out errorCode);
+     if (setupResult == SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+     {
+         WriteConnectSqe(sqe, sqeFd, sqeFlags, allocatedUserData, socketAddress, socketAddressLen);
+         userData = allocatedUserData;
+     }
+
+     return setupResult;
+ }
+
+ /// <summary>Prepares a sendmsg SQE via the managed direct path.</summary>
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendMessage(
+     SafeSocketHandle socket,
+     Interop.Sys.MessageHeader* messageHeader,
+     SocketFlags flags,
+     out ulong userData,
+     out SocketError errorCode)
+ {
+     userData = 0;
+     errorCode = SocketError.Success;
+
+     // Socket flags without an io_uring equivalent force the fallback path.
+     if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
+     {
+         return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+     }
+
+     var result = TrySetupDirectSqe(socket, IoUringOpcodes.SendMsg, out int slotIndex, out ulong allocatedUserData, out int sqeFd, out byte sqeFlags, out IoUringSqe* sqe, out errorCode);
+     if (result != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+     {
+         return result;
+     }
+
+     ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
+     ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
+     slot.Kind = IoUringCompletionOperationKind.Message;
+     slotStorage.MessageIsReceive = false;
+     // Builds the native msghdr (NativeMsgHdrPtr) that must outlive the in-flight SQE;
+     // presumably copies the managed header into native memory - see AllocateMessageStorage.
+     AllocateMessageStorage(slotIndex, messageHeader, isReceive: false);
+
+     WriteSendMsgSqe(sqe, sqeFd, sqeFlags, allocatedUserData, slotStorage.NativeMsgHdrPtr, rwFlags);
+     userData = allocatedUserData;
+     return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ }
+
+ /// <summary>
+ /// Prepares a sendmsg SQE, preferring SENDMSG_ZC when eligible and falling back to SENDMSG otherwise.
+ /// </summary>
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendMessageWithZeroCopyFallback(
+     SafeSocketHandle socket,
+     Interop.Sys.MessageHeader* messageHeader,
+     int payloadLength,
+     SocketFlags flags,
+     out ulong userData,
+     out SocketError errorCode)
+ {
+     if (ShouldTryIoUringDirectSendMessageZeroCopy(payloadLength))
+     {
+         var zcOutcome = TryPrepareIoUringDirectSendMessageZc(socket, messageHeader, payloadLength, flags, out userData, out errorCode);
+         if (zcOutcome is not SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported)
+         {
+             // Prepared and PrepareFailed are both terminal; only Unsupported falls through.
+             return zcOutcome;
+         }
+     }
+
+     return TryPrepareIoUringDirectSendMessage(socket, messageHeader, flags, out userData, out errorCode);
+ }
+
+ /// <summary>Prepares a sendmsg_zc SQE via the managed direct path.</summary>
+ /// <remarks>
+ /// Returns Unsupported when the payload is not ZC-eligible or the socket flags cannot be
+ /// converted, so the caller can fall back to plain SENDMSG.
+ /// </remarks>
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendMessageZc(
+     SafeSocketHandle socket,
+     Interop.Sys.MessageHeader* messageHeader,
+     int payloadLength,
+     SocketFlags flags,
+     out ulong userData,
+     out SocketError errorCode)
+ {
+     userData = 0;
+     errorCode = SocketError.Success;
+
+     if (!ShouldTryIoUringDirectSendMessageZeroCopy(payloadLength))
+     {
+         return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+     }
+
+     if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
+     {
+         return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+     }
+
+     var result = TrySetupDirectSqe(
+         socket,
+         IoUringOpcodes.SendMsgZc,
+         out int slotIndex,
+         out ulong allocatedUserData,
+         out int sqeFd,
+         out byte sqeFlags,
+         out IoUringSqe* sqe,
+         out errorCode);
+     if (result != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+     {
+         return result;
+     }
+
+     ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
+     ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
+     slot.Kind = IoUringCompletionOperationKind.Message;
+     slotStorage.MessageIsReceive = false;
+     // Mirror SEND_ZC semantics: first CQE is not final managed completion; operation
+     // completes only after NOTIF CQE confirms kernel/NIC no longer references payload.
+     slot.IsZeroCopySend = true;
+     slot.ZeroCopyNotificationPending = false;
+     // Native msghdr must stay valid until the NOTIF CQE arrives.
+     AllocateMessageStorage(slotIndex, messageHeader, isReceive: false);
+
+     WriteSendMsgZcSqe(sqe, sqeFd, sqeFlags, allocatedUserData, slotStorage.NativeMsgHdrPtr, rwFlags);
+     userData = allocatedUserData;
+     return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ }
+
+ /// <summary>Prepares a recvmsg SQE via the managed direct path.</summary>
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectReceiveMessage(
+     SafeSocketHandle socket,
+     Interop.Sys.MessageHeader* messageHeader,
+     SocketFlags flags,
+     out ulong userData,
+     out SocketError errorCode)
+ {
+     userData = 0;
+     errorCode = SocketError.Success;
+
+     // Socket flags without an io_uring equivalent force the fallback path.
+     if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
+     {
+         return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+     }
+
+     var result = TrySetupDirectSqe(socket, IoUringOpcodes.RecvMsg, out int slotIndex, out ulong allocatedUserData, out int sqeFd, out byte sqeFlags, out IoUringSqe* sqe, out errorCode);
+     if (result != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+     {
+         return result;
+     }
+
+     ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
+     ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
+     slot.Kind = IoUringCompletionOperationKind.Message;
+     slotStorage.MessageIsReceive = true;
+     // Builds the kernel-writable native msghdr that must outlive the in-flight SQE.
+     AllocateMessageStorage(slotIndex, messageHeader, isReceive: true);
+
+     WriteRecvMsgSqe(sqe, sqeFd, sqeFlags, allocatedUserData, slotStorage.NativeMsgHdrPtr, rwFlags);
+     userData = allocatedUserData;
+     return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ }
+
+ /// <summary>Debug-only assertion that validates a state machine transition.</summary>
+ [Conditional("DEBUG")]
+ private static void AssertIoUringLifecycleTransition(
+     IoUringOperationLifecycleState from,
+     IoUringOperationLifecycleState to)
+ {
+     // Allowed edges: Queued -> Prepared; Prepared -> Submitted/Detached;
+     // Submitted -> Queued/Completed/Canceled/Detached.
+     bool isValid = (from, to) switch
+     {
+         (IoUringOperationLifecycleState.Queued, IoUringOperationLifecycleState.Prepared) => true,
+         (IoUringOperationLifecycleState.Prepared, IoUringOperationLifecycleState.Submitted) => true,
+         (IoUringOperationLifecycleState.Prepared, IoUringOperationLifecycleState.Detached) => true,
+         (IoUringOperationLifecycleState.Submitted, IoUringOperationLifecycleState.Queued) => true,
+         (IoUringOperationLifecycleState.Submitted, IoUringOperationLifecycleState.Completed) => true,
+         (IoUringOperationLifecycleState.Submitted, IoUringOperationLifecycleState.Canceled) => true,
+         (IoUringOperationLifecycleState.Submitted, IoUringOperationLifecycleState.Detached) => true,
+         _ => false,
+     };
+
+     Debug.Assert(isValid, $"Invalid io_uring lifecycle transition: {from} -> {to}");
+ }
+
+ /// <summary>Resets the native diagnostics poll countdown to its configured interval.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void InitializeLinuxIoUringDiagnosticsState()
+ {
+     _ioUringDiagnosticsPollCountdown = IoUringDiagnosticsPollInterval;
+ }
+
+ // Cold diagnostics helpers. Each is marked NoInlining so the interpolated-string
+ // formatting stays out of the hot submission/completion paths; callers are presumably
+ // expected to gate on NetEventSource.Log.IsEnabled() - confirm at call sites.
+
+ /// <summary>Logs a failed ASYNC_CANCEL SQE preparation.</summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringAsyncCancelPrepareFailure(SocketError cancelError, ulong userData, IoUringCancellationOrigin origin)
+ {
+     string originLabel = origin == IoUringCancellationOrigin.Teardown ? " during teardown" : string.Empty;
+     NetEventSource.Info(this, $"io_uring async-cancel prepare failed{originLabel}: error={cancelError}, user_data=0x{userData:x}");
+ }
+
+ /// <summary>Logs a failed ASYNC_CANCEL submission.</summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringAsyncCancelSubmitFailure(Interop.Error submitError, IoUringCancellationOrigin origin)
+ {
+     string originLabel = origin == IoUringCancellationOrigin.Teardown ? " during teardown" : string.Empty;
+     NetEventSource.Info(this, $"io_uring async-cancel submit failed{originLabel}: error={submitError}");
+ }
+
+ /// <summary>Logs a sampled counter value with its associated user_data.</summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringCounterSample(string message, long count, ulong userData)
+ {
+     NetEventSource.Info(this, $"{message}: count={count}, user_data=0x{userData:x}");
+ }
+
+ /// <summary>Logs a prepare queue overflow event.</summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringPrepareQueueOverflow(long count, int capacity)
+ {
+     NetEventSource.Info(this, $"io_uring prepare queue overflow: count={count}, capacity={capacity}");
+ }
+
+ /// <summary>Logs a cancellation queue overflow event.</summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringCancellationQueueOverflow(long count, int capacity)
+ {
+     NetEventSource.Info(this, $"io_uring cancellation queue overflow: count={count}, capacity={capacity}");
+ }
+
+ /// <summary>Logs a CQ overflow observation from the kernel CQ ring counter (error level).</summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringCqOverflow(uint totalOverflowCount, uint delta)
+ {
+     NetEventSource.Error(this, $"io_uring CQ overflow detected: total={totalOverflowCount}, delta={delta}");
+ }
+
+ /// <summary>Logs a failed eventfd wake signal.</summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringWakeFailure(Interop.Error error)
+ {
+     NetEventSource.Info(this, $"io_uring wake signal failed: error={error}");
+ }
+
+ /// <summary>Logs the final count of benign late completions at teardown.</summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringTeardownSummary(long lateCompletionCount)
+ {
+     NetEventSource.Info(this, $"io_uring benign late-completion total={lateCompletionCount}");
+ }
+
+ /// <summary>Logs an untrack operation mismatch.</summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringUntrackMismatch(ulong userData, long mismatchCount)
+ {
+     NetEventSource.Info(this, $"io_uring untrack mismatch: user_data=0x{userData:x}, count={mismatchCount}");
+ }
+
+ /// <summary>Logs the negotiated io_uring mode for this engine instance.</summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringModeSelection(LinuxIoUringCapabilities capabilities)
+ {
+     NetEventSource.Info(
+         this,
+         $"io_uring mode={capabilities.Mode}, is_io_uring_port={capabilities.IsIoUringPort}, supports_multishot_recv={capabilities.SupportsMultishotRecv}, supports_multishot_accept={capabilities.SupportsMultishotAccept}, zero_copy_send_enabled={capabilities.SupportsZeroCopySend}, supports_read_fixed={_supportsOpReadFixed}, supports_send_zc={_supportsOpSendZc}, supports_sendmsg_zc={_supportsOpSendMsgZc}, sqpoll_enabled={capabilities.SqPollEnabled}");
+ }
+
+ /// <summary>Logs active advanced io_uring features for this engine instance.</summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringAdvancedFeatureState()
+ {
+     // Buffer size is 0 when no provided-buffer ring was registered.
+     int providedBufferSize = _ioUringProvidedBufferRing?.BufferSize ?? 0;
+     NetEventSource.Info(
+         this,
+         $"io_uring features: multishot_recv={_ioUringCapabilities.SupportsMultishotRecv}, multishot_accept={_ioUringCapabilities.SupportsMultishotAccept}, zero_copy_send_enabled={_ioUringCapabilities.SupportsZeroCopySend}, supports_read_fixed={_supportsOpReadFixed}, fixed_recv_active={_supportsOpReadFixed && _ioUringBuffersRegistered}, supports_send_zc={_supportsOpSendZc}, supports_sendmsg_zc={_supportsOpSendMsgZc}, provided_buffers={_supportsProvidedBufferRings}, registered_buffers={_ioUringBuffersRegistered}, adaptive_buffer_sizing={_adaptiveBufferSizingEnabled}, sqpoll_enabled={_ioUringCapabilities.SqPollEnabled}, provided_buffer_size={providedBufferSize}");
+ }
+
+ /// <summary>Checks whether the kernel version meets the minimum for io_uring support.</summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static bool IsIoUringKernelVersionSupported() =>
+     OperatingSystem.IsOSPlatformVersionAtLeast(
+         "Linux",
+         IoUringConstants.MinKernelMajor,
+         IoUringConstants.MinKernelMinor);
+
+ /// <summary>
+ /// Recomputes whether multishot recv can be used by this engine instance.
+ /// Requires opcode support and active provided-buffer ring support.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool RefreshIoUringMultishotRecvSupport()
+ {
+     bool supported = _supportsOpRecv && _supportsProvidedBufferRings;
+     _supportsMultishotRecv = supported;
+     return supported;
+ }
+
+ /// <summary>
+ /// Returns the provided-buffer group id used for buffer-select receive submissions.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool TryGetIoUringProvidedBufferGroupId(out ushort bufferGroupId)
+ {
+     bufferGroupId = default;
+     if (!_supportsProvidedBufferRings || _ioUringProvidedBufferRing is null)
+     {
+         return false;
+     }
+
+     bufferGroupId = _ioUringProvidedBufferGroupId;
+     return true;
+ }
+
+ /// <summary>
+ /// Returns the provided-buffer group id used for multishot recv submissions.
+ /// Multishot recv remains disabled unless both the opcode probe and provided-ring
+ /// registration succeeded for this engine instance.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool TryGetIoUringMultishotRecvBufferGroupId(out ushort bufferGroupId)
+ {
+     if (!_supportsMultishotRecv)
+     {
+         bufferGroupId = default;
+         return false;
+     }
+
+     // Delegate to the single-shot helper; it reports the group id (or default/false).
+     return TryGetIoUringProvidedBufferGroupId(out bufferGroupId);
+ }
+
+ /// <summary>Whether multishot recv was negotiated in the engine capabilities.</summary>
+ internal bool SupportsMultishotRecv => _ioUringCapabilities.SupportsMultishotRecv;
+ /// <summary>Whether multishot accept was negotiated in the engine capabilities.</summary>
+ internal bool SupportsMultishotAccept => _ioUringCapabilities.SupportsMultishotAccept;
+
+ /// <summary>Calls io_uring_setup and negotiates feature flags.</summary>
+ /// <param name="sqPollRequested">Whether kernel-side SQ polling (IORING_SETUP_SQPOLL) should be attempted.</param>
+ /// <param name="setupResult">Receives the ring fd, kernel-populated params, and the negotiated flags.</param>
+ /// <returns>true when a ring was created; false when setup failed on both attempts.</returns>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static unsafe bool TrySetupIoUring(bool sqPollRequested, out IoUringSetupResult setupResult)
+ {
+     setupResult = default;
+
+     uint flags = IoUringConstants.SetupCqSize | IoUringConstants.SetupSubmitAll
+         | IoUringConstants.SetupSingleIssuer | IoUringConstants.SetupNoSqArray;
+
+     if (sqPollRequested)
+     {
+         // SQPOLL is incompatible with the task-work flags: io_uring_setup(2) rejects
+         // IORING_SETUP_COOP_TASKRUN (and DEFER_TASKRUN/TASKRUN_FLAG) combined with
+         // IORING_SETUP_SQPOLL with EINVAL, so neither is added on this path.
+         // SINGLE_ISSUER remains valid with SQPOLL (it refers to the submitting task).
+         flags |= IoUringConstants.SetupSqPoll;
+         if (NetEventSource.Log.IsEnabled())
+         {
+             NetEventSource.Info(null, "io_uring setup: SQPOLL requested and included in initial setup flags.");
+         }
+     }
+     else
+     {
+         // Without SQPOLL, prefer cooperative task running plus deferred task work.
+         flags |= IoUringConstants.SetupCoopTaskrun | IoUringConstants.SetupDeferTaskrun;
+     }
+
+     Interop.Sys.IoUringParams ioParams = default;
+     ioParams.Flags = flags;
+     // Over-provision the CQ so completion bursts (e.g. multishot, zero-copy NOTIF CQEs)
+     // are less likely to overflow.
+     ioParams.CqEntries = IoUringConstants.QueueEntries * IoUringConstants.CqEntriesFactor;
+
+     int ringFd;
+     Interop.Error err = Interop.Sys.IoUringShimSetup(IoUringConstants.QueueEntries, &ioParams, &ringFd);
+
+     // IORING_SETUP_NO_SQARRAY was introduced in Linux 6.6.
+     // For 6.1-6.5 kernels, keep setup simple but allow a single targeted retry without NO_SQARRAY.
+     if ((err == Interop.Error.EINVAL || err == Interop.Error.EPERM) &&
+         (flags & IoUringConstants.SetupNoSqArray) != 0)
+     {
+         flags &= ~IoUringConstants.SetupNoSqArray;
+         ioParams = default;
+         ioParams.Flags = flags;
+         ioParams.CqEntries = IoUringConstants.QueueEntries * IoUringConstants.CqEntriesFactor;
+
+         if (NetEventSource.Log.IsEnabled())
+         {
+             NetEventSource.Info(null, $"io_uring setup: peeled NO_SQARRAY after {err}.");
+         }
+
+         err = Interop.Sys.IoUringShimSetup(IoUringConstants.QueueEntries, &ioParams, &ringFd);
+     }
+
+     if (err != Interop.Error.SUCCESS)
+     {
+         return false;
+     }
+
+     setupResult.RingFd = ringFd;
+     setupResult.Params = ioParams;
+     setupResult.NegotiatedFlags = flags;
+     setupResult.UsesExtArg = (ioParams.Features & IoUringConstants.FeatureExtArg) != 0;
+     setupResult.SqPollNegotiated = (flags & IoUringConstants.SetupSqPoll) != 0;
+     if (setupResult.SqPollNegotiated && NetEventSource.Log.IsEnabled())
+     {
+         NetEventSource.Info(null, "io_uring setup: SQPOLL negotiated.");
+     }
+     return true;
+ }
+
+ /// <summary>
+ /// Maps the SQ ring, CQ ring, and SQE array into managed address space and derives
+ /// all ring pointers from the kernel-reported offsets. On failure, unmaps any
+ /// partially-mapped regions and closes the ring fd.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe bool TryMmapRings(ref IoUringSetupResult setup)
+ {
+     // True when [offset, offset + size) lies entirely inside the mapped region
+     // (written to avoid unsigned overflow in offset + size).
+     [MethodImpl(MethodImplOptions.AggressiveInlining)]
+     static bool IsOffsetInRange(ulong offset, ulong size, ulong mappedSize) =>
+         offset <= mappedSize && size <= mappedSize - offset;
+
+     ref Interop.Sys.IoUringParams p = ref setup.Params;
+     bool usesNoSqArray = (setup.NegotiatedFlags & IoUringConstants.SetupNoSqArray) != 0;
+
+     // Compute ring sizes. With NO_SQARRAY the SQ ring ends where the (absent) index
+     // array would begin; otherwise the array of SqEntries uint indices follows it.
+     ulong sqRingSize = p.SqOff.Array;
+     if (!usesNoSqArray)
+     {
+         sqRingSize += p.SqEntries * (uint)sizeof(uint);
+     }
+     ulong cqRingSize = p.CqOff.Cqes + p.CqEntries * (uint)sizeof(Interop.Sys.IoUringCqe);
+     // NOTE(review): 64-byte SQEs assume IORING_SETUP_SQE128 is never negotiated - confirm
+     // against the setup flags used by this engine.
+     ulong sqesSize = p.SqEntries * 64u; // sizeof(io_uring_sqe) = 64
+
+     // mmap SQ ring (and possibly CQ ring if SINGLE_MMAP).
+     bool usesSingleMmap = (p.Features & IoUringConstants.FeatureSingleMmap) != 0;
+
+     byte* sqRingPtr;
+     byte* cqRingPtr;
+
+     if (usesSingleMmap)
+     {
+         // IORING_FEAT_SINGLE_MMAP: one region, sized to the larger ring, backs both rings.
+         ulong ringSize = sqRingSize > cqRingSize ? sqRingSize : cqRingSize;
+         void* ptr;
+         Interop.Error err = Interop.Sys.IoUringShimMmap(setup.RingFd, ringSize, IoUringConstants.OffSqRing, &ptr);
+         if (err != Interop.Error.SUCCESS)
+         {
+             Interop.Sys.IoUringShimCloseFd(setup.RingFd);
+             return false;
+         }
+         sqRingPtr = (byte*)ptr;
+         cqRingPtr = (byte*)ptr;
+         sqRingSize = ringSize;
+         cqRingSize = ringSize;
+     }
+     else
+     {
+         // Legacy layout: two independent mappings at the SQ/CQ ring offsets.
+         void* sqPtr;
+         Interop.Error err = Interop.Sys.IoUringShimMmap(setup.RingFd, sqRingSize, IoUringConstants.OffSqRing, &sqPtr);
+         if (err != Interop.Error.SUCCESS)
+         {
+             Interop.Sys.IoUringShimCloseFd(setup.RingFd);
+             return false;
+         }
+         sqRingPtr = (byte*)sqPtr;
+
+         void* cqPtr;
+         err = Interop.Sys.IoUringShimMmap(setup.RingFd, cqRingSize, IoUringConstants.OffCqRing, &cqPtr);
+         if (err != Interop.Error.SUCCESS)
+         {
+             // Unwind the SQ mapping before failing.
+             Interop.Sys.IoUringShimMunmap(sqRingPtr, sqRingSize);
+             Interop.Sys.IoUringShimCloseFd(setup.RingFd);
+             return false;
+         }
+         cqRingPtr = (byte*)cqPtr;
+     }
+
+     // Debug-only sanity checks: every kernel-reported offset must fall inside its mapping.
+     Debug.Assert(IsOffsetInRange(p.SqOff.Head, sizeof(uint), sqRingSize));
+     Debug.Assert(IsOffsetInRange(p.SqOff.Tail, sizeof(uint), sqRingSize));
+     Debug.Assert(IsOffsetInRange(p.SqOff.RingMask, sizeof(uint), sqRingSize));
+     Debug.Assert(IsOffsetInRange(p.SqOff.RingEntries, sizeof(uint), sqRingSize));
+     Debug.Assert(IsOffsetInRange(p.SqOff.Flags, sizeof(uint), sqRingSize));
+     if (!usesNoSqArray)
+     {
+         Debug.Assert(IsOffsetInRange(p.SqOff.Array, p.SqEntries * (uint)sizeof(uint), sqRingSize));
+     }
+
+     Debug.Assert(IsOffsetInRange(p.CqOff.Head, sizeof(uint), cqRingSize));
+     Debug.Assert(IsOffsetInRange(p.CqOff.Tail, sizeof(uint), cqRingSize));
+     Debug.Assert(IsOffsetInRange(p.CqOff.RingMask, sizeof(uint), cqRingSize));
+     Debug.Assert(IsOffsetInRange(p.CqOff.RingEntries, sizeof(uint), cqRingSize));
+     Debug.Assert(IsOffsetInRange(p.CqOff.Overflow, sizeof(uint), cqRingSize));
+     Debug.Assert(IsOffsetInRange(p.CqOff.Cqes, p.CqEntries * (uint)sizeof(Interop.Sys.IoUringCqe), cqRingSize));
+
+     // mmap SQE array.
+     void* sqePtr;
+     {
+         Interop.Error err = Interop.Sys.IoUringShimMmap(setup.RingFd, sqesSize, IoUringConstants.OffSqes, &sqePtr);
+         if (err != Interop.Error.SUCCESS)
+         {
+             // With SINGLE_MMAP, sqRingPtr == cqRingPtr, so the CQ region must not be unmapped twice.
+             if (!usesSingleMmap)
+                 Interop.Sys.IoUringShimMunmap(cqRingPtr, cqRingSize);
+             Interop.Sys.IoUringShimMunmap(sqRingPtr, sqRingSize);
+             Interop.Sys.IoUringShimCloseFd(setup.RingFd);
+             return false;
+         }
+     }
+
+     // Derive SQ pointers and populate existing _ioUringSqRingInfo for compatibility.
+     _ioUringSqRingInfo.SqeBase = (IntPtr)sqePtr;
+     _ioUringSqRingInfo.SqTailPtr = (IntPtr)(sqRingPtr + p.SqOff.Tail);
+     _ioUringSqRingInfo.SqHeadPtr = (IntPtr)(sqRingPtr + p.SqOff.Head);
+     // Mask and entry count are constant after setup, so plain reads suffice here.
+     _ioUringSqRingInfo.SqMask = *(uint*)(sqRingPtr + p.SqOff.RingMask);
+     _ioUringSqRingInfo.SqEntries = *(uint*)(sqRingPtr + p.SqOff.RingEntries);
+     _ioUringSqRingInfo.SqeSize = 64;
+     _ioUringSqRingInfo.UsesNoSqArray = usesNoSqArray ? (byte)1 : (byte)0;
+     _ioUringSqRingInfo.RingFd = setup.RingFd;
+     _ioUringSqRingInfo.UsesEnterExtArg = setup.UsesExtArg ? (byte)1 : (byte)0;
+     _managedSqFlagsPtr = (uint*)(sqRingPtr + p.SqOff.Flags);
+
+     // Initialize SQ array identity mapping if NO_SQARRAY is not active
+     // (slot i of the index array always refers to SQE i).
+     if (!usesNoSqArray)
+     {
+         uint* sqArray = (uint*)(sqRingPtr + p.SqOff.Array);
+         for (uint i = 0; i < p.SqEntries; i++)
+         {
+             sqArray[i] = i;
+         }
+     }
+
+     // Derive CQ pointers.
+     _managedCqeBase = (Interop.Sys.IoUringCqe*)(cqRingPtr + p.CqOff.Cqes);
+     _managedCqTailPtr = (uint*)(cqRingPtr + p.CqOff.Tail);
+     _managedCqHeadPtr = (uint*)(cqRingPtr + p.CqOff.Head);
+     _managedCqMask = *(uint*)(cqRingPtr + p.CqOff.RingMask);
+     _managedCqEntries = *(uint*)(cqRingPtr + p.CqOff.RingEntries);
+     _managedCqOverflowPtr = (uint*)(cqRingPtr + p.CqOff.Overflow);
+     // Snapshot the kernel's overflow counter so later deltas can be detected.
+     _managedObservedCqOverflow = Volatile.Read(ref *_managedCqOverflowPtr);
+
+     // Store ring region info for teardown.
+     _managedSqRingPtr = sqRingPtr;
+     _managedCqRingPtr = cqRingPtr;
+     _managedSqRingSize = sqRingSize;
+     _managedCqRingSize = cqRingSize;
+     _managedSqesSize = sqesSize;
+     _managedUsesSingleMmap = usesSingleMmap;
+     _managedRingFd = setup.RingFd;
+     _managedUsesExtArg = setup.UsesExtArg;
+     _managedUsesNoSqArray = usesNoSqArray;
+     _managedNegotiatedFlags = setup.NegotiatedFlags;
+
+     return true;
+ }
+
+ /// <summary>
+ /// Arms a multishot POLL_ADD SQE on the wakeup eventfd so other threads can signal
+ /// the event loop by writing to the eventfd.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe bool QueueManagedWakeupPollAdd()
+ {
+ // Requires a live wakeup eventfd and an available SQE slot.
+ if (_managedWakeupEventFd >= 0 && TryGetNextManagedSqe(out IoUringSqe* sqe))
+ {
+ sqe->Opcode = IoUringOpcodes.PollAdd;
+ sqe->Fd = _managedWakeupEventFd;
+ sqe->Len = IoUringConstants.PollAddFlagMulti; // IORING_POLL_ADD_MULTI
+ sqe->RwFlags = 1; // POLLIN = 0x0001 in poll32_events (stored in RwFlags union at offset 28)
+ sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagWakeupSignal, 0);
+ return true;
+ }
+
+ return false;
+ }
+
+ /// <summary>
+ /// Attempts to register the ring fd (IORING_REGISTER_RING_FDS) so later enter calls
+ /// can pass a fixed ring-fd index instead of a real file descriptor.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe bool TryRegisterRingFd(int ringFd, out int registeredRingFd)
+ {
+ registeredRingFd = -1;
+
+ // struct io_uring_rsrc_update { __u32 offset; __u32 resv; __u64 data; } (16 bytes)
+ uint* rsrcUpdate = stackalloc uint[4];
+ rsrcUpdate[0] = IoUringConstants.RegisterOffsetAuto; // let the kernel pick the index
+ rsrcUpdate[1] = 0; // reserved
+ *(ulong*)(rsrcUpdate + 2) = (ulong)ringFd; // fd being registered
+
+ int result;
+ Interop.Error err = Interop.Sys.IoUringShimRegister(
+ ringFd, IoUringConstants.RegisterRingFds, rsrcUpdate, 1u, &result);
+ if (err != Interop.Error.SUCCESS || result <= 0)
+ {
+ return false;
+ }
+
+ // On success the kernel writes the assigned index back into the offset field.
+ registeredRingFd = (int)rsrcUpdate[0];
+ return true;
+ }
+
+ /// <summary>
+ /// Initializes the registered-file table with the kernel: allocates the slot array,
+ /// fills every entry with -1 (empty), issues IORING_REGISTER_FILES, and on success
+ /// builds the free-slot stack so slots can later be handed out to sockets.
+ /// Ports the native TryInitializeRegisteredFilesTable from pal_io_uring.c.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe bool TryInitializeRegisteredFileTable(int ringFd)
+ {
+ uint slotCount = IoUringConstants.QueueEntries * IoUringConstants.RegisteredFileSlotCountFactor;
+ int[] table = new int[slotCount];
+ Array.Fill(table, -1);
+ _registeredFiles = table;
+
+ // Hand the (all-empty) table to the kernel.
+ fixed (int* tablePtr = table)
+ {
+ int result;
+ Interop.Error err = Interop.Sys.IoUringShimRegister(
+ ringFd, IoUringConstants.RegisterFiles, tablePtr, slotCount, &result);
+ if (err != Interop.Error.SUCCESS)
+ {
+ _registeredFiles = null;
+ return false;
+ }
+ }
+
+ // Build the free-slot stack (all slots initially free). Filling with descending
+ // indices means popping from the end yields the lowest index first, matching
+ // native behavior.
+ uint[] freeSlots = new uint[slotCount];
+ uint nextSlot = slotCount;
+ for (uint i = 0; i < slotCount; i++)
+ {
+ freeSlots[i] = --nextSlot;
+ }
+
+ _registeredFileFreeSlots = freeSlots;
+ _registeredFileFreeSlotCount = slotCount;
+ _registeredFileHotSocket = -1;
+ _registeredFileHotIndex = -1;
+ _usesRegisteredFiles = true;
+ return true;
+ }
+
+ /// <summary>
+ /// Points a single registered-file slot at the given fd (or -1 to clear it) via
+ /// IORING_REGISTER_FILES_UPDATE. Ports the native UpdateRegisteredFileSlotLocked
+ /// from pal_io_uring.c.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe Interop.Error UpdateRegisteredFileSlot(uint slot, int fd)
+ {
+ // struct io_uring_rsrc_update { __u32 offset; __u32 resv; __u64 data; } (16 bytes).
+ // For FILES_UPDATE, data points at an array of fds — here a single stack value.
+ int fdValue = fd;
+
+ uint* rsrcUpdate = stackalloc uint[4];
+ rsrcUpdate[0] = slot;
+ rsrcUpdate[1] = 0;
+ *(ulong*)(rsrcUpdate + 2) = (ulong)(nuint)(&fdValue);
+
+ // The shim maps any syscall result >= 0 to SUCCESS, so the error code alone is
+ // a sufficient outcome; the raw result is not inspected.
+ int result;
+ Interop.Error err = Interop.Sys.IoUringShimRegister(
+ _managedRingFd, IoUringConstants.RegisterFilesUpdate, rsrcUpdate, 1u, &result);
+ return err;
+ }
+
+ /// <summary>
+ /// Ensures the registration's socket occupies a registered-file slot and, when it does,
+ /// emits the slot index plus IOSQE_FIXED_FILE so the kernel skips the fd lookup.
+ /// Outputs the raw socket fd with no flags when fixed files are unavailable.
+ /// Ports the native TryAssignRegisteredFileForRegistrationLocked from pal_io_uring.c.
+ /// </summary>
+ private void TryAssignRegisteredFileForRegistration(
+ SocketEventRegistration registration, out int sqeFd, out byte sqeFlags)
+ {
+ // Default: plain fd submission.
+ sqeFd = registration.Socket;
+ sqeFlags = 0;
+
+ if (!_usesRegisteredFiles || _managedRingFd < 0)
+ {
+ return;
+ }
+
+ if (registration.RegisteredFileIndex < 0)
+ {
+ // Needs a slot: pop one from the free stack, if any remain.
+ if (_registeredFileFreeSlotCount == 0)
+ {
+ return;
+ }
+
+ uint slot = _registeredFileFreeSlots![--_registeredFileFreeSlotCount];
+ if (UpdateRegisteredFileSlot(slot, registration.Socket) != Interop.Error.SUCCESS)
+ {
+ // Kernel update failed; return the slot to the free stack.
+ _registeredFileFreeSlots[_registeredFileFreeSlotCount++] = slot;
+ return;
+ }
+
+ _registeredFiles![slot] = registration.Socket;
+ registration.RegisteredFileIndex = (int)slot;
+ }
+
+ // Re-validate the slot still maps to this socket before emitting a fixed-file SQE.
+ uint assignedSlot = (uint)registration.RegisteredFileIndex;
+ if (assignedSlot < (uint)_registeredFiles!.Length &&
+ _registeredFiles[assignedSlot] == registration.Socket)
+ {
+ _registeredFileHotSocket = registration.Socket;
+ _registeredFileHotIndex = registration.RegisteredFileIndex;
+ sqeFd = registration.RegisteredFileIndex;
+ sqeFlags = IoUringConstants.SqeFixedFile;
+ }
+ }
+
+ /// <summary>
+ /// Releases a registration's registered-file slot: clears the kernel slot (sets it to -1),
+ /// returns the slot to the free stack, and drops the hot cache if it pointed at this
+ /// socket. During teardown only managed bookkeeping is cleared; the kernel table is
+ /// bulk-unregistered later. Ports the native
+ /// TryUnregisterRegisteredFileForRegistrationLocked from pal_io_uring.c.
+ /// </summary>
+ private void TryUnregisterRegisteredFileForRegistration(SocketEventRegistration registration)
+ {
+ if (!_usesRegisteredFiles || registration.RegisteredFileIndex < 0)
+ {
+ return;
+ }
+
+ if (Volatile.Read(ref _ioUringTeardownInitiated) != 0)
+ {
+ // Teardown path: skip the kernel update (the whole table is unregistered in
+ // bulk during teardown) and just forget the assignment.
+ if (_registeredFileHotSocket == registration.Socket)
+ {
+ _registeredFileHotSocket = -1;
+ _registeredFileHotIndex = -1;
+ }
+ registration.RegisteredFileIndex = -1;
+ return;
+ }
+
+ uint slot = (uint)registration.RegisteredFileIndex;
+ if (slot >= (uint)_registeredFiles!.Length)
+ {
+ registration.RegisteredFileIndex = -1;
+ return;
+ }
+
+ // Attempt the kernel-side clear. A failure is only logged: the managed bookkeeping
+ // is reclaimed identically on both paths (same as the original's duplicated branches).
+ Interop.Error err = UpdateRegisteredFileSlot(slot, -1);
+ if (err != Interop.Error.SUCCESS && NetEventSource.Log.IsEnabled())
+ {
+ NetEventSource.Error(this, $"io_uring registered-file slot unregister failed: slot={slot}, error={err}");
+ }
+
+ _registeredFiles[slot] = -1;
+ if (_registeredFileFreeSlotCount < (uint)_registeredFileFreeSlots!.Length)
+ {
+ _registeredFileFreeSlots[_registeredFileFreeSlotCount++] = slot;
+ }
+
+ if (_registeredFileHotSocket == registration.Socket)
+ {
+ _registeredFileHotSocket = -1;
+ _registeredFileHotIndex = -1;
+ }
+
+ registration.RegisteredFileIndex = -1;
+ }
+
+ /// <summary>
+ /// Chooses the SQE fd/flags for a socket operation. Consults the single-entry hot
+ /// cache (last socket submitted with a fixed-file slot) first; on a miss, falls back
+ /// to registration lookup and slot assignment. Ports the native
+ /// ConfigureSocketSqeFdAndFlagsLocked from pal_io_uring.c.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void ConfigureSocketSqeFdAndFlags(int socketFd, out int sqeFd, out byte sqeFlags)
+ {
+ sqeFd = socketFd;
+ sqeFlags = 0;
+
+ if (!_usesRegisteredFiles)
+ {
+ return;
+ }
+
+ // Fast path: hot-cache hit, re-validated against the slot table.
+ int hotIndex = _registeredFileHotIndex;
+ bool hotHit = _registeredFileHotSocket == socketFd &&
+ hotIndex >= 0 &&
+ (uint)hotIndex < (uint)_registeredFiles!.Length &&
+ _registeredFiles[hotIndex] == socketFd;
+ if (hotHit)
+ {
+ sqeFd = hotIndex;
+ sqeFlags = IoUringConstants.SqeFixedFile;
+ return;
+ }
+
+ // Slow path: find the socket's registration and try to assign a slot.
+ SocketEventRegistration? registration = FindRegistrationBySocket(socketFd);
+ if (registration is not null)
+ {
+ TryAssignRegisteredFileForRegistration(registration, out sqeFd, out sqeFlags);
+ }
+ }
+
+ /// <summary>
+ /// Unmaps the SQE array and the SQ/CQ rings and closes the ring fd, handling both the
+ /// single-mmap and split-mmap ring layouts. Safe against partially-initialized state:
+ /// null pointers and a negative ring fd are skipped.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe void CleanupManagedRings()
+ {
+ // Drop the cached SQ-flags pointer first so nothing dereferences soon-to-be-unmapped memory.
+ _managedSqFlagsPtr = null;
+ if (_managedSqRingPtr != null)
+ {
+ // Unmap SQEs first.
+ // NOTE(review): the SQE base is read from _ioUringSqRingInfo.SqeBase rather than a
+ // dedicated teardown field — assumes it was populated whenever _managedSqesSize > 0; confirm.
+ if (_managedSqesSize > 0)
+ {
+ Interop.Sys.IoUringShimMunmap(_ioUringSqRingInfo.SqeBase.ToPointer(), _managedSqesSize);
+ }
+ // Unmap CQ ring (only if separate from SQ ring)
+ if (!_managedUsesSingleMmap && _managedCqRingPtr != null && _managedCqRingPtr != _managedSqRingPtr)
+ {
+ Interop.Sys.IoUringShimMunmap(_managedCqRingPtr, _managedCqRingSize);
+ }
+ // Unmap SQ ring
+ Interop.Sys.IoUringShimMunmap(_managedSqRingPtr, _managedSqRingSize);
+ _managedSqRingPtr = null;
+ _managedCqRingPtr = null;
+ }
+ if (_managedRingFd >= 0)
+ {
+ Interop.Sys.IoUringShimCloseFd(_managedRingFd);
+ _managedRingFd = -1;
+ }
+ }
+
+ /// <summary>
+ /// Orchestrates complete managed io_uring initialization: kernel version check,
+ /// ring setup with flag negotiation, mmap, opcode probe, eventfd creation,
+ /// ring fd registration, and the initial wakeup POLL_ADD.
+ /// </summary>
+ /// <returns>true when the managed backend is fully initialized; false on any failure
+ /// (resources acquired up to that point are released before returning).</returns>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe bool TryInitializeManagedIoUring()
+ {
+ if (!IsIoUringKernelVersionSupported())
+ return false;
+
+ bool sqPollRequested = IsSqPollRequested();
+ if (!TrySetupIoUring(sqPollRequested, out IoUringSetupResult setupResult))
+ return false;
+
+ // TryMmapRings maps SQ/CQ rings + SQEs; its visible failure paths unmap and close
+ // the ring fd themselves, so no cleanup is needed here on false.
+ if (!TryMmapRings(ref setupResult))
+ return false;
+
+ _sqPollEnabled = setupResult.SqPollNegotiated;
+ if (NetEventSource.Log.IsEnabled())
+ {
+ if (sqPollRequested && !_sqPollEnabled)
+ {
+ NetEventSource.Info(
+ this,
+ "SQPOLL requested but not negotiated (kernel support/capabilities may be unavailable).");
+ }
+ else if (_sqPollEnabled)
+ {
+ NetEventSource.Info(this, "SQPOLL negotiated and enabled.");
+ }
+ }
+
+ // Probe opcode support.
+ ProbeIoUringOpcodeSupport(setupResult.RingFd);
+
+ // Try to initialize registered file table (optional optimization);
+ // failure is non-fatal — submission falls back to plain fds.
+ TryInitializeRegisteredFileTable(setupResult.RingFd);
+
+ // Create wakeup eventfd.
+ int eventFd;
+ Interop.Error err = Interop.Sys.IoUringShimCreateEventFd(&eventFd);
+ if (err != Interop.Error.SUCCESS)
+ {
+ // Cleanup: unmap and close.
+ // NOTE(review): _registeredFiles/_usesRegisteredFiles may remain set on this path;
+ // the kernel-side table is released when CleanupManagedRings closes the ring fd,
+ // but the managed fields are only reset in LinuxFreeIoUringResources — confirm
+ // nothing observes them after a failed initialization.
+ CleanupManagedRings();
+ return false;
+ }
+ _managedWakeupEventFd = eventFd;
+
+ // Try to register the ring fd for faster enter syscalls.
+ if (TryRegisterRingFd(setupResult.RingFd, out int registeredRingFd))
+ {
+ _ioUringSqRingInfo.RegisteredRingFd = registeredRingFd;
+ }
+
+ // Queue the initial wakeup POLL_ADD.
+ // Direct SQE must be enabled for QueueManagedWakeupPollAdd to work.
+ _ioUringDirectSqeEnabled = true;
+ if (!QueueManagedWakeupPollAdd())
+ {
+ _ioUringDirectSqeEnabled = false;
+ Interop.Sys.IoUringShimCloseFd(eventFd);
+ _managedWakeupEventFd = -1;
+ CleanupManagedRings();
+ return false;
+ }
+
+ // Respect process-level direct SQE toggle after the required wakeup POLL_ADD is armed.
+ if (IsIoUringDirectSqeDisabled())
+ {
+ _ioUringDirectSqeEnabled = false;
+ }
+
+ InitializeIoUringProvidedBufferRingIfSupported(setupResult.RingFd);
+ RefreshIoUringMultishotRecvSupport();
+ _ioUringInitialized = true;
+
+ InitializeDebugTestHooksFromEnvironment();
+
+ return true;
+ }
+
+ /// <summary>
+ /// Validates the managed NativeMsghdr layout contract for direct io_uring message
+ /// operations: a 64-bit process with a 56-byte NativeMsghdr. Any other layout is
+ /// logged and rejected so io_uring stays disabled.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private bool IsNativeMsghdrLayoutSupportedForIoUring()
+ {
+ bool supported = IntPtr.Size == 8 && sizeof(NativeMsghdr) == 56;
+ if (!supported && NetEventSource.Log.IsEnabled())
+ {
+ NetEventSource.Info(
+ this,
+ $"io_uring disabled: unsupported NativeMsghdr layout (pointerSize={IntPtr.Size}, sizeof(NativeMsghdr)={sizeof(NativeMsghdr)})");
+ }
+
+ return supported;
+ }
+
+ /// <summary>
+ /// Detects io_uring support and initializes the managed submission/completion paths.
+ /// When any gate fails, falls back to the readiness backend and only sets
+ /// capabilities/telemetry/logging state.
+ /// </summary>
+ partial void LinuxDetectAndInitializeIoUring()
+ {
+ if (!IsIoUringEnabled() || !IsNativeMsghdrLayoutSupportedForIoUring() || !TryInitializeManagedIoUring())
+ {
+ _ioUringCapabilities = ResolveLinuxIoUringCapabilities(isIoUringPort: false);
+ SocketsTelemetry.Log.ReportSocketEngineBackendSelected(
+ isIoUringPort: false,
+ isCompletionMode: false,
+ sqPollEnabled: false);
+
+ if (NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringModeSelection(_ioUringCapabilities);
+ }
+
+ return;
+ }
+
+ // Managed init succeeded — set capabilities and initialize managed-side state.
+ _ioUringCapabilities = new LinuxIoUringCapabilities(
+ isIoUringPort: true,
+ mode: LinuxIoUringMode.CompletionMode,
+ supportsMultishotRecv: _supportsMultishotRecv,
+ supportsMultishotAccept: _supportsMultishotAccept,
+ supportsZeroCopySend: _zeroCopySendEnabled,
+ sqPollEnabled: _sqPollEnabled);
+
+ SocketsTelemetry.Log.ReportSocketEngineBackendSelected(
+ isIoUringPort: true,
+ isCompletionMode: true,
+ sqPollEnabled: _sqPollEnabled);
+
+ if (NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringModeSelection(_ioUringCapabilities);
+ }
+
+ InitializeLinuxIoUringDiagnosticsState();
+
+ _ioUringSlotCapacity = (int)Math.Max(_managedCqEntries, IoUringConstants.QueueEntries);
+ // Slot pool capacity is 2x slot capacity (currently 8192 with default cq sizing).
+ // Multishot operations retain slots for their full lifetime, so this bounds
+ // concurrent long-lived multishot receives before backpressure/exhaustion.
+ // Target-typed new() is used for the queue/dictionary fields so the generic type
+ // arguments come from the field declarations (the bare constructor calls here
+ // previously omitted them and did not compile).
+ _ioUringPrepareQueue = new();
+ _ioUringCancelQueue = new();
+ _ioUringOperationRegistry = new IoUringOperationRegistry(_ioUringSlotCapacity);
+ InitializeCompletionSlotPool(_ioUringSlotCapacity * IoUringConstants.CompletionOperationPoolCapacityFactor);
+
+ if (RequiresPollReadiness())
+ {
+ _registrationsBySocket = new();
+ _registrationsByRequestId = new();
+ _registrationChangeQueue = new();
+ }
+
+ _managedCqDrainEnabled = true;
+ }
+
+ /// <summary>
+ /// Tears down io_uring state before native resource cleanup: publishes the teardown
+ /// flag, drains queued operations, and closes the socket event port so the native
+ /// layer does not close it a second time.
+ /// </summary>
+ /// <param name="closeSocketEventPort">Set to false when this method has already closed the port.</param>
+ partial void LinuxBeforeFreeNativeResources(ref bool closeSocketEventPort)
+ {
+ // Only applies to the io_uring backend with a live port.
+ if (!_ioUringCapabilities.IsIoUringPort || _port == (IntPtr)(-1))
+ {
+ return;
+ }
+
+ // Publish teardown before draining so producers stop enqueueing new work.
+ Volatile.Write(ref _ioUringTeardownInitiated, 1);
+ DrainQueuedIoUringOperationsForTeardown();
+
+ Interop.Error closeError = Interop.Sys.CloseSocketEventPort(_port);
+ if (closeError == Interop.Error.SUCCESS)
+ {
+ // Record the closure for later teardown steps (port-dependent work is skipped).
+ closeSocketEventPort = false;
+ Volatile.Write(ref _ioUringPortClosedForTeardown, 1);
+ }
+ }
+
+ /// <summary>
+ /// Applies pending registration changes and submits pending SQEs before the event
+ /// loop blocks; a submission failure terminates the loop via an internal exception.
+ /// </summary>
+ partial void LinuxEventLoopBeforeWait()
+ {
+ ProcessPendingRegistrationChanges();
+
+ Interop.Error submitError = SubmitIoUringBatch();
+ if (submitError == Interop.Error.SUCCESS)
+ {
+ return;
+ }
+
+ ThrowInternalException(submitError);
+ }
+
+ /// <summary>
+ /// Attempts a managed completion wait: first drains already-available CQEs without
+ /// blocking; otherwise submits pending SQEs and waits for at least one CQE (bounded
+ /// by an EXT_ARG timeout when supported), then drains again. Sets waitHandled so the
+ /// shared event loop skips the native wait path.
+ /// </summary>
+ partial void LinuxEventLoopTryCompletionWait(SocketEventHandler handler, ref int numEvents, ref int numCompletions, ref Interop.Error err, ref bool waitHandled)
+ {
+ if (!_ioUringCapabilities.IsCompletionMode)
+ {
+ return;
+ }
+
+ // Managed CQE drain path: read CQEs directly from mmap'd ring.
+ // First, try a non-blocking drain of any already-available CQEs.
+ bool hadCqes = DrainCqeRingBatch(handler);
+ if (hadCqes)
+ {
+ numCompletions = 1;
+ waitHandled = true;
+ err = Interop.Error.SUCCESS;
+ return;
+ }
+
+ // No CQEs available — submit pending SQEs and wait for at least 1 CQE.
+ // Prefer the registered ring-fd index (ENTER_REGISTERED_RING) when available.
+ uint enterFlags = IoUringConstants.EnterGetevents;
+ int ringFd = _managedRingFd;
+ if (_ioUringSqRingInfo.RegisteredRingFd >= 0)
+ {
+ enterFlags |= IoUringConstants.EnterRegisteredRing;
+ ringFd = _ioUringSqRingInfo.RegisteredRingFd;
+ }
+
+ // With SQPOLL the kernel thread consumes the SQ itself: submit 0 and only set
+ // SQ_WAKEUP when the poller thread needs waking.
+ uint submitCount = _sqPollEnabled ? 0u : _ioUringManagedPendingSubmissions;
+ if (_sqPollEnabled &&
+ _ioUringManagedPendingSubmissions != 0 &&
+ SqNeedWakeup())
+ {
+ enterFlags |= IoUringConstants.EnterSqWakeup;
+ }
+
+ if (_managedUsesExtArg)
+ {
+ // Bounded wait with 50ms timeout via EXT_ARG. The timespec lives on the stack
+ // and is only referenced for the duration of the enter syscall.
+ enterFlags |= IoUringConstants.EnterExtArg;
+ Interop.Sys.IoUringKernelTimespec timeout = default;
+ timeout.TvNsec = IoUringConstants.BoundedWaitTimeoutNanos;
+ Interop.Sys.IoUringGeteventsArg extArg = default;
+ extArg.Ts = (ulong)(nuint)(&timeout);
+
+ int result; // raw syscall result from the shim; intentionally unused here
+ err = Interop.Sys.IoUringShimEnterExt(
+ ringFd, submitCount, 1, enterFlags, &extArg, &result);
+ if (err == Interop.Error.SUCCESS)
+ {
+ _ioUringManagedPendingSubmissions = 0;
+ }
+ }
+ else
+ {
+ int result; // raw syscall result from the shim; intentionally unused here
+ err = Interop.Sys.IoUringShimEnter(
+ ringFd, submitCount, 1, enterFlags, &result);
+ if (err == Interop.Error.SUCCESS)
+ {
+ _ioUringManagedPendingSubmissions = 0;
+ }
+ }
+
+ // Drain after waking.
+ // NOTE(review): any error from the enter syscall (e.g. a timed-out/interrupted wait)
+ // is overwritten with SUCCESS below; completions are picked up on a later iteration —
+ // confirm no enter error needs to propagate to the caller.
+ hadCqes = DrainCqeRingBatch(handler);
+ numCompletions = hadCqes ? 1 : 0;
+ numEvents = 0;
+ waitHandled = true;
+ err = Interop.Error.SUCCESS;
+ }
+
+ /// <summary>Polls io_uring diagnostics (non-forced) after each event loop iteration.</summary>
+ partial void LinuxEventLoopAfterIteration()
+ {
+ PollIoUringDiagnosticsIfNeeded(force: false);
+ }
+
+ /// <summary>
+ /// Frees all managed io_uring resources in dependency order: buffer ring, queued
+ /// registration changes, registration maps, the kernel registered-file table, the
+ /// wakeup eventfd, the mmap'd rings and ring fd, then tracked operations and slot
+ /// storage. Steps that issue kernel unregister calls run while the ring fd is open.
+ /// </summary>
+ partial void LinuxFreeIoUringResources()
+ {
+ // Managed io_uring teardown: release resources allocated during TryInitializeManagedIoUring.
+ // This must run BEFORE the common slot/buffer cleanup below because kernel
+ // unregister operations need the ring fd to still be open.
+ if (_ioUringInitialized)
+ {
+ // 0. Unregister/dispose provided buffer ring while the main ring fd is still open.
+ FreeIoUringProvidedBufferRing();
+
+ // 1. Drain pending registration changes — complete them with ECANCELED
+ // so callers waiting on CompletionEvent are unblocked.
+ if (_registrationChangeQueue is not null)
+ {
+ while (_registrationChangeQueue.TryDequeue(out RegistrationChangeRequest? request))
+ {
+ request.Error = Interop.Error.ECANCELED;
+ request.Completed = true;
+ request.CompletionEvent.Set();
+ }
+ _registrationChangeQueue = null;
+ }
+
+ // 2. Clear registration tracking dictionaries.
+ // Individual per-fd unregistration is skipped because _ioUringTeardownInitiated
+ // is already set, so TryUnregisterRegisteredFileForRegistration would early-return.
+ // The entire registered-file table is bulk-unregistered in step 3 instead.
+ if (_registrationsBySocket is not null)
+ {
+ _registrationsBySocket.Clear();
+ _registrationsBySocket = null;
+ }
+ if (_registrationsByRequestId is not null)
+ {
+ _registrationsByRequestId.Clear();
+ _registrationsByRequestId = null;
+ }
+
+ // 3. Unregister the entire registered-file table from the kernel (bulk).
+ // The result of the unregister call is intentionally ignored during teardown.
+ if (_usesRegisteredFiles && _managedRingFd >= 0)
+ {
+ int result;
+ Interop.Sys.IoUringShimRegister(
+ _managedRingFd, IoUringConstants.UnregisterFiles, null, 0u, &result);
+ _usesRegisteredFiles = false;
+ }
+ _registeredFiles = null;
+ _registeredFileFreeSlots = null;
+ _registeredFileFreeSlotCount = 0;
+ _registeredFileHotSocket = -1;
+ _registeredFileHotIndex = -1;
+
+ // 4. The registered ring fd is implicitly released when the ring fd is closed.
+ // Just mark it as inactive so no subsequent code attempts to use it.
+ _ioUringSqRingInfo.RegisteredRingFd = -1;
+
+ // 5. Close the wakeup eventfd.
+ if (_managedWakeupEventFd >= 0)
+ {
+ Interop.Sys.IoUringShimCloseFd(_managedWakeupEventFd);
+ _managedWakeupEventFd = -1;
+ }
+
+ // 6. Unmap SQ/CQ rings, SQEs and close the ring fd.
+ // Closing the ring fd also terminates any kernel SQPOLL thread for this ring.
+ CleanupManagedRings();
+
+ // 7. Disable managed flags to prevent any late operations.
+ _ioUringInitialized = false;
+ _managedCqDrainEnabled = false;
+ }
+
+ bool portClosedForTeardown = Volatile.Read(ref _ioUringPortClosedForTeardown) != 0;
+ if (!portClosedForTeardown)
+ {
+ PollIoUringDiagnosticsIfNeeded(force: true);
+ }
+
+ DrainQueuedIoUringOperationsForTeardown();
+
+ if (_ioUringOperationRegistry is not null)
+ {
+ DrainTrackedIoUringOperationsForTeardown(portClosedForTeardown);
+ Debug.Assert(_ioUringOperationRegistry.IsEmpty, $"Leaked tracked io_uring operations: {_ioUringOperationRegistry.Count}");
+ _ioUringOperationRegistry = null;
+
+ // Free any native memory still held by completion slots
+ if (_completionSlots is not null)
+ {
+ for (int i = 0; i < _completionSlots.Length; i++)
+ {
+ ReleaseZeroCopyPinHold(i);
+ ref IoUringCompletionSlot slot = ref _completionSlots[i];
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![i];
+ if (slot.Kind == IoUringCompletionOperationKind.Message)
+ {
+ FreeMessageStorage(i);
+ }
+ else if (slot.Kind == IoUringCompletionOperationKind.Accept && slotStorage.NativeSocketAddressLengthPtr != null)
+ {
+ NativeMemory.Free(slotStorage.NativeSocketAddressLengthPtr);
+ slotStorage.NativeSocketAddressLengthPtr = null;
+ }
+ }
+ _completionSlots = null;
+ _completionSlotStorage = null;
+ _zeroCopyPinHolds = null;
+ _completionSlotFreeListHead = -1;
+ _completionSlotsInUse = 0;
+ }
+
+ _ioUringSlotCapacity = 0;
+ _ioUringManagedPendingSubmissions = 0;
+ _ioUringManagedSqTail = 0;
+ _ioUringManagedSqTailLoaded = false;
+ _ioUringSqRingInfo = default;
+ _ioUringDirectSqeEnabled = false;
+ _sqPollEnabled = false;
+
+ LogLinuxIoUringTeardownSummaryIfNeeded();
+ }
+
+ ResetIoUringPrepareQueueDepthTelemetry();
+
+ // Final flush of managed io_uring deltas in case teardown modified counters
+ // after the forced diagnostics poll and no further event-loop iteration runs.
+ PublishIoUringManagedDiagnosticsDelta();
+ }
+
+ /// <summary>
+ /// Atomically zeroes the published prepare-queue depth and, when it was nonzero,
+ /// publishes the negating delta so external telemetry returns to zero.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void ResetIoUringPrepareQueueDepthTelemetry()
+ {
+ long previousDepth = Interlocked.Exchange(ref _ioUringPublishedPrepareQueueLength, 0);
+ if (previousDepth == 0)
+ {
+ return;
+ }
+
+ SocketsTelemetry.Log.IoUringPrepareQueueDepthDelta(-previousDepth);
+ }
+
+ /// <summary>
+ /// Queued work item pairing an operation with its prepare sequence number for
+ /// deferred SQE preparation on the event loop thread.
+ /// </summary>
+ private readonly struct IoUringPrepareWorkItem
+ {
+ /// <summary>The operation to prepare.</summary>
+ public readonly SocketAsyncContext.AsyncOperation Operation;
+ /// <summary>The sequence number that must still match at prepare time for the
+ /// preparation to proceed (checked by the event-loop drain).</summary>
+ public readonly long PrepareSequence;
+
+ /// <summary>Creates a work item pairing an operation with its prepare sequence number.</summary>
+ public IoUringPrepareWorkItem(SocketAsyncContext.AsyncOperation operation, long prepareSequence)
+ {
+ Operation = operation;
+ PrepareSequence = prepareSequence;
+ }
+ }
+
+ /// <summary>
+ /// Enqueues an operation for deferred SQE preparation on the event loop thread.
+ /// Returns false when io_uring is not in completion mode, teardown has begun, or the
+ /// prepare queue is at capacity (overflow is counted, with sampled logging).
+ /// </summary>
+ internal bool TryEnqueueIoUringPreparation(SocketAsyncContext.AsyncOperation operation, long prepareSequence)
+ {
+ if (!_ioUringCapabilities.IsCompletionMode || Volatile.Read(ref _ioUringTeardownInitiated) != 0)
+ {
+ return false;
+ }
+
+ MpscQueue? prepareQueue = _ioUringPrepareQueue;
+ if (prepareQueue is null)
+ {
+ return false;
+ }
+
+ // Reserve a length slot first; undo the reservation when it exceeds capacity.
+ long reservedLength = Interlocked.Increment(ref _ioUringPrepareQueueLength);
+ if (reservedLength > s_ioUringPrepareQueueCapacity)
+ {
+ Interlocked.Decrement(ref _ioUringPrepareQueueLength);
+ long overflowCount = Interlocked.Increment(ref _ioUringPrepareQueueOverflowCount);
+ // Sampled logging keyed off the low bits of the overflow counter.
+ if ((overflowCount & DiagnosticSampleMask) == 1 && NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringPrepareQueueOverflow(overflowCount, s_ioUringPrepareQueueCapacity);
+ }
+
+ return false;
+ }
+
+ prepareQueue.Enqueue(new IoUringPrepareWorkItem(operation, prepareSequence));
+ WakeEventLoop();
+ return true;
+ }
+
+ /// <summary>
+ /// Removes a tracked operation by user_data. Returns true when there is no registry
+ /// or the removal did not mismatch; returns false (after counting and sampled
+ /// logging) only when the slot is owned by a different operation.
+ /// </summary>
+ internal bool TryUntrackIoUringOperation(ulong userData, SocketAsyncContext.AsyncOperation? expectedOperation = null)
+ {
+ IoUringOperationRegistry? registry = _ioUringOperationRegistry;
+ if (registry is null)
+ {
+ return true;
+ }
+
+ if (registry.TryUntrack(userData, expectedOperation, out _) != IoUringOperationRegistry.RemoveResult.Mismatch)
+ {
+ return true;
+ }
+
+ Debug.Fail("io_uring tracked operation mismatch while untracking user_data.");
+ long mismatchCount = Interlocked.Increment(ref _ioUringUntrackMismatchCount);
+ if ((mismatchCount & DiagnosticSampleMask) == 1 && NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringUntrackMismatch(userData, mismatchCount);
+ }
+
+ return false;
+ }
+
+ /// <summary>Replaces the tracked operation for an existing user_data slot; false when tracking is inactive.</summary>
+ internal bool TryReplaceIoUringTrackedOperation(ulong userData, SocketAsyncContext.AsyncOperation newOperation)
+ {
+ IoUringOperationRegistry? registry = _ioUringOperationRegistry;
+ return registry is not null && registry.TryReplace(userData, newOperation);
+ }
+
+ /// <summary>
+ /// Enqueues a user_data for ASYNC_CANCEL on the event loop thread. When the queue is
+ /// full it wakes the loop, spins briefly, and retries exactly once before recording
+ /// the overflow and giving up.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool TryEnqueueIoUringCancellation(ulong userData)
+ {
+ if (!_ioUringCapabilities.IsCompletionMode || userData == 0 || Volatile.Read(ref _ioUringTeardownInitiated) != 0)
+ {
+ return false;
+ }
+
+ MpscQueue? cancelQueue = _ioUringCancelQueue;
+ if (cancelQueue is null)
+ {
+ return false;
+ }
+
+ bool retried = false;
+ while (true)
+ {
+ long reservedLength = Interlocked.Increment(ref _ioUringCancelQueueLength);
+ if (reservedLength <= s_ioUringCancellationQueueCapacity)
+ {
+ cancelQueue.Enqueue(userData);
+ return true;
+ }
+
+ Interlocked.Decrement(ref _ioUringCancelQueueLength);
+ if (!retried)
+ {
+ // Queue can be transiently full under burst cancellation.
+ // Nudge the event loop to drain, then retry once before recording overflow.
+ retried = true;
+ WakeEventLoop();
+ Thread.SpinWait(64);
+ continue;
+ }
+
+ long overflowCount = Interlocked.Increment(ref _ioUringCancelQueueOverflowCount);
+ if ((overflowCount & DiagnosticSampleMask) == 1 && NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringCancellationQueueOverflow(overflowCount, s_ioUringCancellationQueueCapacity);
+ }
+
+ return false;
+ }
+ }
+
+ /// <summary>
+ /// Writes an ASYNC_CANCEL SQE for the given user_data. Acquires a managed SQE
+ /// directly, so it must run on the event loop thread. Returns false when io_uring is
+ /// inactive, the token is empty, or no SQE slot could be acquired.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe bool TryQueueIoUringAsyncCancel(ulong userData)
+ {
+ // 'unsafe' added for the IoUringSqe* local, matching the sibling pointer-using
+ // methods (e.g. QueueManagedWakeupPollAdd); harmless if the enclosing type is
+ // already declared unsafe.
+ if (!_ioUringCapabilities.IsIoUringPort || userData == 0)
+ {
+ return false;
+ }
+
+ if (!TryAcquireManagedSqeWithRetry(out IoUringSqe* sqe, out _))
+ {
+ return false;
+ }
+
+ WriteAsyncCancelSqe(sqe, userData);
+ return true;
+ }
+
+ /// <summary>Writes to the wakeup eventfd to break the event loop out of a blocking wait.</summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private Interop.Error ManagedWakeEventLoop() =>
+ Interop.Sys.IoUringShimWriteEventFd(_managedWakeupEventFd);
+
+ /// <summary>
+ /// Sends a coalesced wake signal to the event loop thread: at most one eventfd write
+ /// is outstanding at a time (guarded by _ioUringWakeupRequested); a failed write
+ /// resets the flag so a later producer can retry.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void WakeEventLoop()
+ {
+ if (!_ioUringCapabilities.IsCompletionMode || Volatile.Read(ref _ioUringTeardownInitiated) != 0)
+ {
+ return;
+ }
+
+ // Coalesce: only the thread that flips the flag 0 -> 1 performs the eventfd write.
+ if (Interlocked.Exchange(ref _ioUringWakeupRequested, 1) != 0)
+ {
+ return;
+ }
+
+ Interop.Error error = ManagedWakeEventLoop();
+ if (error == Interop.Error.SUCCESS)
+ {
+ return;
+ }
+
+ // Reset flag so the next producer can retry the eventfd write.
+ // Worst case under sustained wake failure: work is picked up on the next bounded wait cycle.
+ Volatile.Write(ref _ioUringWakeupRequested, 0);
+ if (NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringWakeFailure(error);
+ }
+ }
+
+ /// <summary>Enqueues a cancellation request and, when it was accepted, wakes the event loop.</summary>
+ internal void TryRequestIoUringCancellation(ulong userData)
+ {
+ if (TryEnqueueIoUringCancellation(userData))
+ {
+ WakeEventLoop();
+ }
+ }
+
+ /// <summary>
+ /// Enqueues a readiness fallback event when io_uring submission is congested,
+ /// optionally recording it as a prepare-queue-overflow fallback, and schedules a
+ /// worker to process it. No-ops for SocketEvents.None.
+ /// </summary>
+ internal void EnqueueReadinessFallbackEvent(
+ SocketAsyncContext context,
+ Interop.Sys.SocketEvents events,
+ bool countAsPrepareQueueOverflowFallback = false)
+ {
+ if (events != Interop.Sys.SocketEvents.None)
+ {
+ _eventQueue.Enqueue(new SocketIOEvent(context, events));
+ if (countAsPrepareQueueOverflowFallback)
+ {
+ RecordIoUringPrepareQueueOverflowFallback();
+ }
+ EnsureWorkerScheduled();
+ }
+ }
+
+ /// <summary>
+ /// Drains queued cancellation requests (bounded per submit) into ASYNC_CANCEL SQEs,
+ /// skipping user_data values that are no longer tracked. Returns whether any SQE was
+ /// written.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool DrainIoUringCancellationQueue()
+ {
+ MpscQueue? cancelQueue = _ioUringCancelQueue;
+ if (cancelQueue is null)
+ {
+ return false;
+ }
+
+ bool preparedSqe = false;
+ int drained = 0;
+ while (drained < MaxIoUringCancelQueueDrainPerSubmit && cancelQueue.TryDequeue(out ulong userData))
+ {
+ drained++;
+ long remainingLength = Interlocked.Decrement(ref _ioUringCancelQueueLength);
+ Debug.Assert(remainingLength >= 0);
+
+ // Cancellation requests can race with terminal completion/untracking.
+ // Skip stale requests to avoid issuing known -ENOENT async-cancel SQEs.
+ if (IsTrackedIoUringOperation(userData))
+ {
+ preparedSqe |= TryQueueIoUringAsyncCancel(userData);
+ }
+ }
+
+ return preparedSqe;
+ }
+
+ /// <summary>
+ /// Drains both the cancel and prepare queues into SQEs, then submits all pending
+ /// SQEs. Must run on the event loop thread (SINGLE_ISSUER contract). Returns SUCCESS
+ /// on the readiness backend (nothing to do) and EINVAL in completion mode when the
+ /// prepare queue is missing or a prepared operation could not be tracked.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private Interop.Error SubmitIoUringBatch()
+ {
+ if (!_ioUringCapabilities.IsIoUringPort)
+ {
+ return Interop.Error.SUCCESS;
+ }
+
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "SubmitIoUringBatch must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ bool preparedSqe = false;
+ if (_ioUringCapabilities.IsCompletionMode)
+ {
+ // Clear the coalesced wake flag before draining queues so producers that enqueue
+ // during this drain window can publish a new wake signal without being suppressed.
+ Volatile.Write(ref _ioUringWakeupRequested, 0);
+
+ preparedSqe |= DrainIoUringCancellationQueue();
+
+ MpscQueue? prepareQueue = _ioUringPrepareQueue;
+ if (prepareQueue is null)
+ {
+ return Interop.Error.EINVAL;
+ }
+
+ // Bounded drain of the prepare queue; each dequeue is mirrored by a length decrement.
+ for (int drained = 0; drained < MaxIoUringPrepareQueueDrainPerSubmit &&
+ prepareQueue.TryDequeue(out IoUringPrepareWorkItem workItem); drained++)
+ {
+ long remainingLength = Interlocked.Decrement(ref _ioUringPrepareQueueLength);
+ Debug.Assert(remainingLength >= 0);
+ Interop.Error prepareError = TryPrepareAndTrackIoUringOperation(
+ workItem.Operation,
+ workItem.PrepareSequence,
+ out bool preparedOperation);
+ if (prepareError != Interop.Error.SUCCESS)
+ {
+ return prepareError;
+ }
+
+ preparedSqe |= preparedOperation;
+ if (!preparedOperation && workItem.Operation.IsInWaitingState())
+ {
+ // Not prepared but still waiting: if this looks like transient completion-slot
+ // exhaustion, requeue a bounded number of times; otherwise fall back to readiness.
+ if (IsPotentialCompletionSlotExhaustion())
+ {
+ int retryCount = workItem.Operation.IncrementIoUringSlotExhaustionRetryCount();
+ if (retryCount < MaxSlotExhaustionRetries &&
+ workItem.Operation.TryQueueIoUringPreparation())
+ {
+ continue;
+ }
+ }
+
+ workItem.Operation.ResetIoUringSlotExhaustionRetryCount();
+ EmitReadinessFallbackForUnpreparedOperation(workItem.Operation);
+ }
+ }
+
+ }
+
+ if (!preparedSqe)
+ {
+ // Inline re-prepare paths can write SQEs outside queue drains; ensure they are submitted.
+ if (_ioUringManagedPendingSubmissions != 0)
+ {
+ return SubmitIoUringOperationsNormalized();
+ }
+
+ // Work remained after the bounded drains: request another loop iteration.
+ if ((_ioUringCancelQueue?.IsEmpty == false) || (_ioUringPrepareQueue?.IsEmpty == false))
+ {
+ WakeEventLoop();
+ }
+
+ return Interop.Error.SUCCESS;
+ }
+
+ return SubmitIoUringOperationsNormalized();
+ }
+
+ /// <summary>
+ /// Prepares an operation for io_uring submission and tracks it in the completion registry.
+ /// On non-prepared paths, clears operation user_data and releases preparation resources.
+ /// </summary>
+ /// <param name="operation">The managed operation to prepare.</param>
+ /// <param name="prepareSequence">Sequence token validating that this queued prepare is still current.</param>
+ /// <param name="preparedSqe">Set to true only when an SQE was written and the operation is tracked.</param>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private Interop.Error TryPrepareAndTrackIoUringOperation(
+     SocketAsyncContext.AsyncOperation operation,
+     long prepareSequence,
+     out bool preparedSqe)
+ {
+     preparedSqe = false;
+
+     bool prepared = operation.TryPrepareIoUring(operation.AssociatedContext, prepareSequence);
+     if (prepared)
+     {
+         AssertIoUringLifecycleTransition(
+             IoUringOperationLifecycleState.Queued,
+             IoUringOperationLifecycleState.Prepared);
+     }
+
+     // Success path: SQE written and the operation reported no error during prepare.
+     if (prepared && operation.ErrorCode == SocketError.Success)
+     {
+         preparedSqe = true;
+         if (!TryTrackPreparedIoUringOperation(operation))
+         {
+             // Invariant violation: tracking collision after prepare.
+             // A prepared SQE may now complete without a managed owner; do not attempt best-effort recovery.
+             // Surface InternalException to terminate the engine path deterministically.
+             Debug.Fail("io_uring prepared operation could not be tracked by user_data.");
+             operation.ClearIoUringUserData();
+             return Interop.Error.EINVAL;
+         }
+
+         return Interop.Error.SUCCESS;
+     }
+
+     // Prepared but carrying an error: the SQE surfaces through the detached path
+     // rather than a tracked managed owner.
+     if (prepared)
+     {
+         AssertIoUringLifecycleTransition(
+             IoUringOperationLifecycleState.Prepared,
+             IoUringOperationLifecycleState.Detached);
+     }
+
+     if (!TryUntrackIoUringOperation(operation.IoUringUserData, operation))
+     {
+         // Mismatch indicates token ownership confusion; avoid releasing
+         // resources that may still be associated with another tracked op.
+         return Interop.Error.EINVAL;
+     }
+
+     operation.ClearIoUringUserData();
+     return Interop.Error.SUCCESS;
+ }
+
+ /// <summary>
+ /// Routes an operation that could not be prepared for io_uring back to the
+ /// readiness-notification path, when it has readiness events to fall back to.
+ /// </summary>
+ private void EmitReadinessFallbackForUnpreparedOperation(SocketAsyncContext.AsyncOperation operation)
+ {
+     operation.ClearIoUringUserData();
+
+     Interop.Sys.SocketEvents fallbackEvents = operation.GetIoUringFallbackSocketEvents();
+     if (fallbackEvents != Interop.Sys.SocketEvents.None)
+     {
+         if (NetEventSource.Log.IsEnabled())
+         {
+             LogFallback(fallbackEvents);
+         }
+
+         EnqueueReadinessFallbackEvent(operation.AssociatedContext, fallbackEvents);
+     }
+
+     // Kept out-of-line so the common (logging disabled) path stays small.
+     [MethodImpl(MethodImplOptions.NoInlining)]
+     void LogFallback(Interop.Sys.SocketEvents events)
+     {
+         NetEventSource.Error(
+             this,
+             $"io_uring prepare fallback to readiness notification: events={events}");
+     }
+ }
+
+ /// <summary>Registers a prepared operation in the completion registry.</summary>
+ /// <returns>True when the operation is tracked (newly or already) under its user_data.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool TryTrackPreparedIoUringOperation(SocketAsyncContext.AsyncOperation operation)
+ {
+     if (_ioUringOperationRegistry is not IoUringOperationRegistry registry)
+     {
+         return false;
+     }
+
+     if (registry.TryTrack(operation))
+     {
+         return true;
+     }
+
+     // Persistent multishot receive can rebind an existing tracked user_data to a new
+     // managed operation before this call; treat "already tracked as this operation" as success.
+     ulong userData = operation.IoUringUserData;
+     if (userData == 0)
+     {
+         return false;
+     }
+
+     return registry.TryGet(userData, out SocketAsyncContext.AsyncOperation? tracked) &&
+         ReferenceEquals(tracked, operation);
+ }
+
+ /// <summary>Returns whether the given user_data is currently tracked.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool IsTrackedIoUringOperation(ulong userData) =>
+     _ioUringOperationRegistry?.Contains(userData) == true;
+
+ /// <summary>
+ /// Returns whether current completion-slot usage indicates likely slot exhaustion pressure.
+ /// The heuristic reports pressure once usage comes within 16 slots of capacity.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool IsPotentialCompletionSlotExhaustion()
+ {
+     IoUringCompletionSlot[]? completionSlots = _completionSlots;
+     if (completionSlots is null || completionSlots.Length == 0)
+     {
+         return false;
+     }
+
+     // Clamp the threshold to at least 1: with Math.Max(0, ...) a ring of 16 or fewer
+     // slots produced threshold 0, making "_completionSlotsInUse >= threshold" trivially
+     // true even with zero slots in use, which forced pointless slot-exhaustion retries
+     // (and delayed readiness fallback) on small or idle rings.
+     int threshold = Math.Max(1, completionSlots.Length - 16);
+     return _completionSlotsInUse >= threshold;
+ }
+
+ /// <summary>Returns whether the calling thread is the event loop thread.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool IsCurrentThreadEventLoopThread()
+ {
+     int eventLoopThreadId = Volatile.Read(ref _eventLoopManagedThreadId);
+     return eventLoopThreadId == Environment.CurrentManagedThreadId;
+ }
+
+ /// <summary>Returns whether a submit error indicates an unsupported operation rather than a real failure.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsIgnoredIoUringSubmitError(Interop.Error error) =>
+     error is Interop.Error.ENOSYS or Interop.Error.ENOTSUP or Interop.Error.EOPNOTSUPP;
+
+ /// <summary>Submits the specified number of pending SQEs via io_uring_enter.</summary>
+ /// <param name="toSubmit">Number of SQEs already written to the SQ ring awaiting submission.</param>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe Interop.Error ManagedSubmitPendingEntries(uint toSubmit)
+ {
+     if (toSubmit == 0)
+         return Interop.Error.SUCCESS;
+
+     Debug.Assert(IsCurrentThreadEventLoopThread(),
+         "ManagedSubmitPendingEntries must only be called from the event loop thread (SINGLE_ISSUER contract).");
+     if (_sqPollEnabled)
+     {
+         // With SQPOLL the kernel poller thread consumes the SQ ring on its own; an
+         // io_uring_enter syscall is needed only when the poller has gone idle
+         // (IORING_SQ_NEED_WAKEUP is set).
+         if (!SqNeedWakeup())
+         {
+             SocketsTelemetry.Log.IoUringSqPollSubmissionSkipped(toSubmit);
+             return Interop.Error.SUCCESS;
+         }
+
+         uint wakeupFlags = IoUringConstants.EnterSqWakeup;
+         int wakeupRingFd = _managedRingFd;
+         // Prefer the registered ring fd when available; it avoids a kernel fd-table lookup.
+         if (_ioUringSqRingInfo.RegisteredRingFd >= 0)
+         {
+             wakeupFlags |= IoUringConstants.EnterRegisteredRing;
+             wakeupRingFd = _ioUringSqRingInfo.RegisteredRingFd;
+         }
+
+         if (NetEventSource.Log.IsEnabled())
+         {
+             LogSqPollWakeup(this, toSubmit);
+         }
+
+         SocketsTelemetry.Log.IoUringSqPollWakeup();
+         int wakeupResult;
+         // toSubmit/minComplete are 0: this enter call exists solely to wake the SQPOLL thread.
+         return Interop.Sys.IoUringShimEnter(wakeupRingFd, 0, 0, wakeupFlags, &wakeupResult);
+     }
+
+     uint enterFlags = 0;
+     int ringFd = _managedRingFd;
+     if (_ioUringSqRingInfo.RegisteredRingFd >= 0)
+     {
+         enterFlags |= IoUringConstants.EnterRegisteredRing;
+         ringFd = _ioUringSqRingInfo.RegisteredRingFd;
+     }
+
+     // io_uring_enter(2) may consume fewer SQEs than requested; loop until all are submitted.
+     while (toSubmit > 0)
+     {
+         int result;
+         Interop.Error err = Interop.Sys.IoUringShimEnter(ringFd, toSubmit, 0, enterFlags, &result);
+         if (err != Interop.Error.SUCCESS)
+             return err;
+
+         // Zero progress with work remaining would spin forever; surface as EAGAIN so the
+         // caller restores the pending count and retries on a later submit.
+         if (result <= 0)
+         {
+             return Interop.Error.EAGAIN;
+         }
+
+         toSubmit -= (uint)result;
+     }
+     return Interop.Error.SUCCESS;
+ }
+
+ // Out-of-line so the logging-disabled fast path stays small.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void LogSqPollWakeup(SocketAsyncEngine engine, uint pendingSubmissionCount)
+ {
+     NetEventSource.Info(engine, $"io_uring SQPOLL wakeup requested for pending SQEs: {pendingSubmissionCount}");
+ }
+
+ /// <summary>Computes pending submissions and calls ManagedSubmitPendingEntries.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private Interop.Error SubmitIoUringOperationsNormalized()
+ {
+     Debug.Assert(IsCurrentThreadEventLoopThread(),
+         "SubmitIoUringOperationsNormalized must only be called from the event loop thread (SINGLE_ISSUER contract).");
+     // Publish the SQ tail first so the kernel can observe every SQE written so far.
+     PublishManagedSqeTail();
+     uint managedPending = _ioUringManagedPendingSubmissions;
+     _ioUringManagedPendingSubmissions = 0;
+
+     Interop.Error error = ManagedSubmitPendingEntries(managedPending);
+
+     // On failure, restore the pending count so a later submit retries these SQEs.
+     if (error != Interop.Error.SUCCESS && managedPending != 0)
+     {
+         _ioUringManagedPendingSubmissions += managedPending;
+     }
+
+     // Unsupported-operation errors are masked as success; real failures propagate.
+     return IsIgnoredIoUringSubmitError(error) ? Interop.Error.SUCCESS : error;
+ }
+
+ /// <summary>Cancels all queued-but-not-submitted operations during teardown.</summary>
+ private void DrainQueuedIoUringOperationsForTeardown()
+ {
+     MpscQueue? pendingPrepares = _ioUringPrepareQueue;
+     if (pendingPrepares is not null)
+     {
+         while (pendingPrepares.TryDequeue(out IoUringPrepareWorkItem pendingItem))
+         {
+             long queueLength = Interlocked.Decrement(ref _ioUringPrepareQueueLength);
+             Debug.Assert(queueLength >= 0);
+
+             SocketAsyncContext.AsyncOperation op = pendingItem.Operation;
+             // Invalidate the queued prepare first so a racing prepare cannot run,
+             // then cancel the managed operation and drop its user_data token.
+             op.CancelPendingIoUringPreparation(pendingItem.PrepareSequence);
+             op.TryCancelForTeardown();
+             op.ClearIoUringUserData();
+         }
+     }
+
+     MpscQueue? pendingCancels = _ioUringCancelQueue;
+     if (pendingCancels is not null)
+     {
+         while (pendingCancels.TryDequeue(out _))
+         {
+             long queueLength = Interlocked.Decrement(ref _ioUringCancelQueueLength);
+             Debug.Assert(queueLength >= 0);
+         }
+     }
+
+     Volatile.Write(ref _ioUringWakeupRequested, 0);
+ }
+
+ /// <summary>
+ /// Cancels all tracked in-flight operations during teardown.
+ /// This includes any future long-lived operations (for example multishot recv).
+ /// </summary>
+ /// <param name="portClosedForTeardown">True when the port is already closed and native ownership is detached.</param>
+ private void DrainTrackedIoUringOperationsForTeardown(bool portClosedForTeardown)
+ {
+     IoUringOperationRegistry? registry = _ioUringOperationRegistry;
+     if (registry is null || registry.IsEmpty)
+     {
+         return;
+     }
+
+     bool queuedAsyncCancel = false;
+     // Async-cancel SQEs may only be written while the port is open and from the
+     // event-loop thread (SINGLE_ISSUER); otherwise cancellation is managed-only.
+     bool canPrepareTeardownCancels = !portClosedForTeardown && IsCurrentThreadEventLoopThread();
+     foreach (SocketAsyncContext.AsyncOperation operation in registry.DrainAllTrackedOperations())
+     {
+         ulong userData = operation.IoUringUserData;
+         if (canPrepareTeardownCancels &&
+             TryQueueIoUringAsyncCancel(userData))
+         {
+             queuedAsyncCancel = true;
+         }
+
+         // Teardown policy: if the port was already closed, native ownership has been
+         // detached and it is now safe to release operation-owned resources eagerly.
+         // Otherwise, queue best-effort async cancel before releasing resources.
+         operation.TryCancelForTeardown();
+         operation.ClearIoUringUserData();
+     }
+
+     if (canPrepareTeardownCancels && queuedAsyncCancel)
+     {
+         // Flush the queued cancel SQEs; a failure here is logged but not fatal during teardown.
+         Interop.Error submitError = SubmitIoUringOperationsNormalized();
+         if (submitError != Interop.Error.SUCCESS)
+         {
+             if (NetEventSource.Log.IsEnabled()) LogIoUringAsyncCancelSubmitFailure(submitError, IoUringCancellationOrigin.Teardown);
+         }
+     }
+ }
+
+ /// <summary>Increments the late-completion counter and samples to the log.</summary>
+ private void RecordBenignLateIoUringCompletion(ulong userData) =>
+     RecordIoUringCounterAndMaybeLog(ref _ioUringBenignLateCompletionCount, userData, "io_uring completion arrived after managed untrack");
+
+ /// <summary>Increments the poll-readiness CQE diagnostic counter.</summary>
+ private static void RecordIoUringPollReadinessCqe() =>
+     Interlocked.Increment(ref s_ioUringPollReadinessCqeCount);
+
+ /// <summary>Increments the diagnostic counter tracking pending completion retries that queued prepare work.</summary>
+ private static void RecordIoUringPendingRetryQueuedToPrepareQueue() =>
+     Interlocked.Increment(ref s_ioUringPendingRetryQueuedToPrepareQueueCount);
+
+ /// <summary>Increments the completion-slot exhaustion counter.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void RecordIoUringCompletionSlotExhaustion() =>
+     Interlocked.Increment(ref _ioUringCompletionSlotExhaustionCount);
+
+ /// <summary>Increments the completion-slot drain-recovery counter.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void RecordIoUringCompletionSlotDrainRecovery() =>
+     Interlocked.Increment(ref _ioUringCompletionSlotDrainRecoveryCount);
+
+ /// <summary>Increments the prepare-queue overflow fallback counter.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void RecordIoUringPrepareQueueOverflowFallback() =>
+     Interlocked.Increment(ref _ioUringPrepareQueueOverflowFallbackCount);
+
+ /// <summary>Increments the requeue-failure counter and samples to the log.</summary>
+ private void RecordIoUringCompletionRequeueFailure(ulong userData) =>
+     RecordIoUringCounterAndMaybeLog(ref _ioUringCompletionRequeueFailureCount, userData, "io_uring completion requeue failed; queued readiness fallback");
+
+ /// <summary>
+ /// Increments a diagnostic counter and, to keep logging cheap, emits a sampled log
+ /// entry only when the new count lands on the sampling mask (periodic per
+ /// DiagnosticSampleMask).
+ /// </summary>
+ private void RecordIoUringCounterAndMaybeLog(ref long counter, ulong userData, string message)
+ {
+     long newCount = Interlocked.Increment(ref counter);
+     bool shouldSample = (newCount & DiagnosticSampleMask) == 1;
+     if (shouldSample && NetEventSource.Log.IsEnabled())
+     {
+         LogIoUringCounterSample(message, newCount, userData);
+     }
+ }
+
+ /// <summary>Logs the teardown summary if any late completions were recorded.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void LogLinuxIoUringTeardownSummaryIfNeeded()
+ {
+     long lateCompletions = Interlocked.Read(ref _ioUringBenignLateCompletionCount);
+     if (lateCompletions <= 0 || !NetEventSource.Log.IsEnabled())
+     {
+         return;
+     }
+
+     LogIoUringTeardownSummary(lateCompletions);
+ }
+
+ /// <summary>Periodically polls native counters and publishes deltas to telemetry.</summary>
+ /// <param name="force">True to publish immediately, bypassing the countdown.</param>
+ private void PollIoUringDiagnosticsIfNeeded(bool force)
+ {
+     if (!_ioUringCapabilities.IsIoUringPort)
+     {
+         return;
+     }
+
+     // Unless forced, only publish once every IoUringDiagnosticsPollInterval calls.
+     if (!force && --_ioUringDiagnosticsPollCountdown > 0)
+     {
+         return;
+     }
+
+     _ioUringDiagnosticsPollCountdown = IoUringDiagnosticsPollInterval;
+     PublishIoUringManagedDiagnosticsDelta();
+
+     // One-time advanced-feature log, emitted lazily once logging is observed enabled.
+     if (!_ioUringAdvancedFeatureStateLogged && NetEventSource.Log.IsEnabled())
+     {
+         _ioUringAdvancedFeatureStateLogged = true;
+         LogIoUringAdvancedFeatureState();
+     }
+
+     if (!force)
+     {
+         EvaluateProvidedBufferRingResize();
+     }
+ }
+
+ /// <summary>
+ /// Returns the non-negative delta between two counter snapshots. When the counter went
+ /// backwards (reset), the current value is treated as the delta since the reset.
+ /// </summary>
+ private static long ComputeManagedCounterDelta(long previous, long current)
+ {
+     if (current >= previous)
+     {
+         return current - previous;
+     }
+
+     return current;
+ }
+
+ /// <summary>Computes and publishes the global non-pinnable fallback counter delta.</summary>
+ /// <returns>The delta since the last publication, or 0 when another engine is currently publishing.</returns>
+ private static long GetIoUringNonPinnablePrepareFallbackDelta()
+ {
+     // This counter is process-wide and shared across all engines. Serialize publication so
+     // concurrent engine loops do not double-publish or observe torn baseline updates.
+     if (Interlocked.CompareExchange(ref s_ioUringPublishingNonPinnablePrepareFallback, 1, 0) != 0)
+     {
+         // Another engine holds the publication lock; report no delta rather than block.
+         return 0;
+     }
+
+     try
+     {
+         long current = SocketAsyncContext.GetIoUringNonPinnablePrepareFallbackCount();
+         // Atomically advance the published baseline while reading the previous one.
+         long previous = Interlocked.Exchange(ref s_ioUringPublishedNonPinnablePrepareFallbackCount, current);
+         return ComputeManagedCounterDelta(previous, current);
+     }
+     finally
+     {
+         // Release the publication lock; the write-release pairs with the CAS acquire above.
+         Volatile.Write(ref s_ioUringPublishingNonPinnablePrepareFallback, 0);
+     }
+ }
+
+ /// <summary>Publishes all managed diagnostic counter deltas to telemetry.</summary>
+ private void PublishIoUringManagedDiagnosticsDelta()
+ {
+     // Snapshot each monotonic counter, advance its published baseline, and keep the
+     // delta for publication below. Order matches the original publication sequence.
+     long requeueFailureDelta = SnapshotDelta(ref _ioUringCompletionRequeueFailureCount, ref _ioUringPublishedCompletionRequeueFailureCount);
+     long nonPinnableFallbackDelta = GetIoUringNonPinnablePrepareFallbackDelta();
+     long prepareQueueOverflowDelta = SnapshotDelta(ref _ioUringPrepareQueueOverflowCount, ref _ioUringPublishedPrepareQueueOverflowCount);
+     long prepareQueueOverflowFallbackDelta = SnapshotDelta(ref _ioUringPrepareQueueOverflowFallbackCount, ref _ioUringPublishedPrepareQueueOverflowFallbackCount);
+
+     // Queue depth is a gauge, not a monotonic counter: its delta may be negative.
+     long prepareQueueLengthCurrent = Interlocked.Read(ref _ioUringPrepareQueueLength);
+     long prepareQueueDepthDelta = prepareQueueLengthCurrent - Volatile.Read(ref _ioUringPublishedPrepareQueueLength);
+     Volatile.Write(ref _ioUringPublishedPrepareQueueLength, prepareQueueLengthCurrent);
+
+     long completionSlotExhaustionDelta = SnapshotDelta(ref _ioUringCompletionSlotExhaustionCount, ref _ioUringPublishedCompletionSlotExhaustionCount);
+     long completionSlotDrainRecoveryDelta = SnapshotDelta(ref _ioUringCompletionSlotDrainRecoveryCount, ref _ioUringPublishedCompletionSlotDrainRecoveryCount);
+
+     if (requeueFailureDelta != 0)
+     {
+         SocketsTelemetry.Log.IoUringCompletionRequeueFailure(requeueFailureDelta);
+     }
+
+     if (nonPinnableFallbackDelta != 0)
+     {
+         SocketsTelemetry.Log.IoUringPrepareNonPinnableFallback(nonPinnableFallbackDelta);
+     }
+
+     if (prepareQueueOverflowDelta != 0)
+     {
+         SocketsTelemetry.Log.IoUringPrepareQueueOverflow(prepareQueueOverflowDelta);
+     }
+
+     if (prepareQueueOverflowFallbackDelta != 0)
+     {
+         SocketsTelemetry.Log.IoUringPrepareQueueOverflowFallback(prepareQueueOverflowFallbackDelta);
+     }
+
+     if (prepareQueueDepthDelta != 0)
+     {
+         SocketsTelemetry.Log.IoUringPrepareQueueDepthDelta(prepareQueueDepthDelta);
+     }
+
+     if (completionSlotExhaustionDelta != 0)
+     {
+         SocketsTelemetry.Log.IoUringCompletionSlotExhaustion(completionSlotExhaustionDelta);
+     }
+
+     if (completionSlotDrainRecoveryDelta != 0)
+     {
+         SocketsTelemetry.Log.IoUringCompletionSlotDrainRecovery(completionSlotDrainRecoveryDelta);
+     }
+
+     // Shared snapshot/advance step for the monotonic counters above.
+     static long SnapshotDelta(ref long counter, ref long publishedBaseline)
+     {
+         long current = Interlocked.Read(ref counter);
+         long previous = Volatile.Read(ref publishedBaseline);
+         Volatile.Write(ref publishedBaseline, current);
+         return ComputeManagedCounterDelta(previous, current);
+     }
+ }
+
+ private readonly partial struct SocketEventHandler
+ {
+ /// <summary>Delivers a completed operation to its owning socket context.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void DispatchCompletedIoUringOperation(SocketAsyncContext.AsyncOperation operation, ulong userData)
+ {
+     bool delivered = operation.AssociatedContext.TryCompleteIoUringOperation(operation);
+     if (!delivered)
+     {
+         // The context no longer owns the operation (e.g. already canceled/aborted);
+         // count it as a benign late completion.
+         _engine.RecordBenignLateIoUringCompletion(userData);
+     }
+ }
+
+ /// <summary>Completes a deferred SEND_ZC operation when its NOTIF CQE arrives.</summary>
+ /// <param name="payload">Slot payload carried in the NOTIF CQE's user_data.</param>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public void DispatchZeroCopyIoUringNotification(ulong payload)
+ {
+     IoUringOperationRegistry? registry = _engine._ioUringOperationRegistry;
+     if (registry is null)
+     {
+         return;
+     }
+
+     // Rebuild the user_data under which the deferred SEND_ZC operation was re-attached.
+     ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload);
+     if (!registry.TryTake(userData, out SocketAsyncContext.AsyncOperation? operation) || operation is null)
+     {
+         // Benign: the operation may have been canceled or torn down before the NOTIF arrived.
+         return;
+     }
+
+     Debug.Assert(
+         !_engine.IsZeroCopyNotificationPending(userData),
+         "NOTIF CQE dispatch must occur only after clearing SEND_ZC pending slot state.");
+     Debug.Assert(
+         operation.IoUringUserData == userData,
+         "Deferred SEND_ZC operation must still be tracked with its original user_data at NOTIF dispatch.");
+     AssertIoUringLifecycleTransition(
+         IoUringOperationLifecycleState.Submitted,
+         IoUringOperationLifecycleState.Completed);
+     operation.ClearIoUringUserData();
+     DispatchCompletedIoUringOperation(operation, userData);
+ }
+
+ /// <summary>Processes a single completion and dispatches it to its owning operation.</summary>
+ /// <param name="userData">CQE user_data; 0 means the CQE has no managed owner.</param>
+ /// <param name="result">Raw CQE result (bytes transferred or negative errno).</param>
+ /// <param name="flags">Raw CQE flags (buffer selection, F_MORE, etc.).</param>
+ /// <param name="enqueuedFallbackEvent">Set when dispatch queues a readiness fallback event.</param>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public void DispatchSingleIoUringCompletion(
+     ulong userData,
+     int result,
+     uint flags,
+     int socketAddressLen,
+     int controlBufferLen,
+     uint auxiliaryData,
+     bool hasFixedRecvBuffer,
+     ushort fixedRecvBufferId,
+     ref bool enqueuedFallbackEvent)
+ {
+     Debug.Assert(_engine.IsCurrentThreadEventLoopThread(),
+         "DispatchSingleIoUringCompletion must only run on the event-loop thread.");
+     // Untracked CQEs still may hold provided buffers that must be recycled.
+     if (userData == 0)
+     {
+         RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId);
+         return;
+     }
+
+     IoUringOperationRegistry? registry = _engine._ioUringOperationRegistry;
+     if (registry is null)
+     {
+         RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId);
+         return;
+     }
+
+     // Benign race: cancellation/abort paths may have already removed this tracked entry.
+     if (!registry.TryTake(userData, out SocketAsyncContext.AsyncOperation? operation))
+     {
+         RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId);
+         _engine.RecordBenignLateIoUringCompletion(userData);
+         return;
+     }
+
+     if (operation is null)
+     {
+         RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId);
+         return;
+     }
+
+     SocketAsyncContext receiveContext = operation.AssociatedContext;
+     if (receiveContext.IsPersistentMultishotRecvArmed() &&
+         receiveContext.PersistentMultishotRecvUserData == userData)
+     {
+         // Terminal CQE for persistent multishot recv (normal completion, cancel,
+         // ENOBUFS, EOF, or other error): clear armed-state so the next receive can re-arm.
+         SocketsTelemetry.Log.IoUringPersistentMultishotRecvTermination();
+         receiveContext.ClearPersistentMultishotRecvArmed();
+     }
+
+     // Terminal CQE for a multishot accept similarly disarms the context.
+     if (operation is SocketAsyncContext.AcceptOperation acceptOperation &&
+         acceptOperation.AssociatedContext.MultishotAcceptUserData == userData)
+     {
+         acceptOperation.AssociatedContext.DisarmMultishotAccept();
+     }
+
+     uint completionAuxiliaryData = auxiliaryData;
+     int completionResultCode = result;
+     // Copy provided-buffer payload into the operation's target; on failure, report
+     // the completion as ENOBUFS (the payload could not be materialized).
+     if (!TryMaterializeIoUringReceiveCompletion(
+         operation!,
+         completionResultCode,
+         flags,
+         hasFixedRecvBuffer,
+         fixedRecvBufferId,
+         ref completionAuxiliaryData))
+     {
+         completionResultCode = -Interop.Sys.ConvertErrorPalToPlatform(Interop.Error.ENOBUFS);
+         completionAuxiliaryData = 0;
+     }
+
+     // Process completion metadata before processing result to allow message post-processing.
+     operation!.SetIoUringCompletionMessageMetadata(socketAddressLen, controlBufferLen);
+     SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionDispatchResult =
+         operation.ProcessIoUringCompletionResult(completionResultCode, flags, completionAuxiliaryData);
+
+     if (completionDispatchResult == SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed &&
+         _engine.IsZeroCopyNotificationPending(userData))
+     {
+         // SEND_ZC API contract: complete managed operation only once NOTIF confirms
+         // the kernel/NIC no longer references the caller buffer.
+         _engine.AssertZeroCopyDeferredCompletionState(userData, operation);
+         if (!registry.TryReattach(userData, operation))
+         {
+             // Losing the re-attach means the NOTIF could complete with no managed owner.
+             ThrowInternalException(Interop.Error.EINVAL);
+             return;
+         }
+
+         return;
+     }
+
+     DispatchIoUringCompletionResult(
+         operation,
+         completionDispatchResult,
+         userData,
+         ref enqueuedFallbackEvent);
+ }
+
+ /// <summary>
+ /// Processes a multishot completion by completing the current operation and
+ /// requesting async cancel for non-terminal shots until full item-9 dispatch lands.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ public void DispatchMultishotIoUringCompletion(
+     ulong userData,
+     int result,
+     uint flags,
+     int socketAddressLen,
+     int controlBufferLen,
+     uint auxiliaryData,
+     bool hasFixedRecvBuffer,
+     ushort fixedRecvBufferId,
+     ref bool enqueuedFallbackEvent)
+ {
+     Debug.Assert(_engine.IsCurrentThreadEventLoopThread(),
+         "DispatchMultishotIoUringCompletion must only run on the event-loop thread.");
+     _ = enqueuedFallbackEvent; // Transitional path never requeues via readiness fallback.
+     _ = hasFixedRecvBuffer;    // Multishot recv never uses fixed receive buffers here.
+     _ = fixedRecvBufferId;
+     Debug.Assert((flags & IoUringConstants.CqeFMore) != 0,
+         "Multishot dispatch must only be used for non-terminal CQEs (IORING_CQE_F_MORE).");
+
+     // Untracked shots still carry provided buffers that must be recycled.
+     if (userData == 0)
+     {
+         RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer: false, fixedRecvBufferId: 0);
+         return;
+     }
+
+     IoUringOperationRegistry? registry = _engine._ioUringOperationRegistry;
+     if (registry is null)
+     {
+         RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer: false, fixedRecvBufferId: 0);
+         return;
+     }
+
+     // TryGet (not TryTake): the entry stays tracked — the terminal CQE owns cleanup.
+     if (!registry.TryGet(userData, out SocketAsyncContext.AsyncOperation? operation) || operation is null)
+     {
+         RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer: false, fixedRecvBufferId: 0);
+         _engine.RecordBenignLateIoUringCompletion(userData);
+         return;
+     }
+
+     if (operation is SocketAsyncContext.AcceptOperation acceptOperation)
+     {
+         DispatchMultishotAcceptIoUringCompletion(
+             acceptOperation,
+             userData,
+             result,
+             flags,
+             socketAddressLen,
+             auxiliaryData);
+         return;
+     }
+
+     // No waiting managed receive: buffer the payload for persistent multishot recv,
+     // otherwise cancel — this shot has no consumer.
+     if (!operation.IsInWaitingState())
+     {
+         if (!TryBufferEarlyPersistentMultishotRecvCompletion(operation.AssociatedContext, result, flags))
+         {
+             _engine.TryRequestIoUringCancellation(userData);
+         }
+
+         return;
+     }
+
+     uint completionAuxiliaryData = auxiliaryData;
+     int completionResultCode = result;
+     // Copy provided-buffer payload; failure is surfaced to the operation as ENOBUFS.
+     if (!TryMaterializeIoUringReceiveCompletion(
+         operation,
+         completionResultCode,
+         flags,
+         hasFixedRecvBuffer: false,
+         fixedRecvBufferId: 0,
+         ref completionAuxiliaryData))
+     {
+         completionResultCode = -Interop.Sys.ConvertErrorPalToPlatform(Interop.Error.ENOBUFS);
+         completionAuxiliaryData = 0;
+     }
+
+     operation.SetIoUringCompletionMessageMetadata(socketAddressLen, controlBufferLen);
+     SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionDispatchResult =
+         operation.ProcessIoUringCompletionResult(completionResultCode, flags, completionAuxiliaryData);
+
+     SocketAsyncContext context = operation.AssociatedContext;
+     bool isPersistentMultishotRecv =
+         context.IsPersistentMultishotRecvArmed() &&
+         context.PersistentMultishotRecvUserData == userData;
+
+     // Transitional multishot model cancels after the first shot.
+     // Persistent multishot receive remains armed and rebinds future operations via TryReplace.
+     if (!isPersistentMultishotRecv)
+     {
+         _engine.TryRequestIoUringCancellation(userData);
+     }
+
+     switch (completionDispatchResult)
+     {
+         case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed:
+             DispatchCompletedIoUringOperation(operation, userData);
+             break;
+
+         case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Pending:
+             // Transitional multishot mode does not requeue intermediate shots.
+             // Cancellation is already requested above; terminal CQE cleanup path
+             // remains responsible for tracked-state/resource release.
+             break;
+
+         case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Canceled:
+         case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Ignored:
+             break;
+
+         default:
+             Debug.Fail($"Unexpected io_uring multishot completion result: {completionDispatchResult}");
+             break;
+     }
+ }
+
+ /// <summary>
+ /// Handles transitional multishot-accept CQEs by completing one waiting operation and
+ /// canceling the multishot request. Extra successful shots are queued for dequeue on
+ /// the accept operation queue when possible.
+ /// </summary>
+ /// <param name="result">CQE result; a non-negative value is the accepted fd.</param>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void DispatchMultishotAcceptIoUringCompletion(
+     SocketAsyncContext.AcceptOperation operation,
+     ulong userData,
+     int result,
+     uint flags,
+     int socketAddressLen,
+     uint auxiliaryData)
+ {
+     Debug.Assert(_engine.IsCurrentThreadEventLoopThread(),
+         "DispatchMultishotAcceptIoUringCompletion must only run on the event-loop thread.");
+     // Accept has no control buffer; only the peer address length applies.
+     operation.SetIoUringCompletionMessageMetadata(socketAddressLen, 0);
+     SocketAsyncContext context = operation.AssociatedContext;
+     SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionDispatchResult =
+         operation.ProcessIoUringCompletionResult(result, flags, auxiliaryData);
+
+     // Transitional multishot-accept model: complete one managed accept and then
+     // issue async-cancel so terminal cleanup runs through single-shot dispatch.
+     _engine.TryRequestIoUringCancellation(userData);
+
+     switch (completionDispatchResult)
+     {
+         case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed:
+             DispatchCompletedIoUringOperation(operation, userData);
+             break;
+
+         case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Pending:
+             break;
+
+         case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Canceled:
+         case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Ignored:
+             // No waiting managed accept consumed this shot, but the kernel already
+             // accepted a connection: stash it for a later accept, or close it so
+             // the fd does not leak.
+             if (result >= 0)
+             {
+                 // Clamp the reported address length to the operation's address buffer.
+                 int addressLength = auxiliaryData > (uint)operation.SocketAddress.Length ?
+                     operation.SocketAddress.Length :
+                     (int)auxiliaryData;
+                 if (context.TryEnqueuePreAcceptedConnection((IntPtr)result, operation.SocketAddress.Span, addressLength))
+                 {
+                     _engine.EnqueueReadinessFallbackEvent(context, Interop.Sys.SocketEvents.Read);
+                 }
+                 else
+                 {
+                     Interop.Sys.Close((IntPtr)result);
+                 }
+             }
+             break;
+
+         default:
+             Debug.Fail($"Unexpected io_uring multishot accept completion result: {completionDispatchResult}");
+             break;
+     }
+ }
+
+ /// <summary>
+ /// For receive completions that used provided buffers (buffer-select or fixed receive),
+ /// materializes payload bytes into the operation target and recycles checked-out buffers.
+ /// </summary>
+ /// <returns>True when the payload was handled and the buffer recycled; false maps to ENOBUFS at the caller.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe bool TryMaterializeIoUringReceiveCompletion(
+     SocketAsyncContext.AsyncOperation operation,
+     int result,
+     uint flags,
+     bool hasFixedRecvBuffer,
+     ushort fixedRecvBufferId,
+     ref uint auxiliaryData)
+ {
+     bool hasSelectedBuffer = (flags & IoUringConstants.CqeFBuffer) != 0;
+     // Completions that used the caller's own buffer need no materialization.
+     if (!hasFixedRecvBuffer && !hasSelectedBuffer)
+     {
+         return true;
+     }
+
+     IoUringProvidedBufferRing? providedBufferRing = _engine._ioUringProvidedBufferRing;
+     if (providedBufferRing is null)
+     {
+         return false;
+     }
+
+     ushort bufferId;
+     bool reportRecycleFailureAsDepletion;
+     byte* providedBuffer = null;
+     int providedBufferLength = 0;
+     if (hasFixedRecvBuffer)
+     {
+         // Fixed receive: the buffer was checked out at prepare time; only look it up
+         // when there is payload to copy.
+         bufferId = fixedRecvBufferId;
+         reportRecycleFailureAsDepletion = true;
+
+         if (result > 0 &&
+             !providedBufferRing.TryGetCheckedOutBuffer(
+                 bufferId,
+                 out providedBuffer,
+                 out providedBufferLength))
+         {
+             SocketsTelemetry.Log.IoUringProvidedBufferDepletion();
+             return false;
+         }
+     }
+     else
+     {
+         // Buffer-select: the kernel picked the buffer; its id rides in the CQE flags.
+         bufferId = (ushort)(flags >> IoUringConstants.CqeBufferShift);
+         reportRecycleFailureAsDepletion = false;
+         if (!providedBufferRing.TryAcquireBufferForCompletion(
+             bufferId,
+             out providedBuffer,
+             out providedBufferLength))
+         {
+             SocketsTelemetry.Log.IoUringProvidedBufferDepletion();
+             return false;
+         }
+     }
+
+     // Error/EOF completions (result <= 0) carry no payload and count as handled.
+     bool handled = result <= 0;
+     try
+     {
+         if (result > 0)
+         {
+             handled =
+                 operation.TryProcessIoUringProvidedBufferCompletion(
+                     providedBuffer,
+                     providedBufferLength,
+                     result,
+                     ref auxiliaryData);
+         }
+
+         RecordProvidedBufferUtilizationIfEnabled(providedBufferRing, result);
+     }
+     finally
+     {
+         // Always return the buffer to the ring; a recycle failure downgrades the result.
+         handled &= TryRecycleProvidedBufferFromCheckedOutState(
+             providedBufferRing,
+             bufferId,
+             reportFailureAsDepletion: reportRecycleFailureAsDepletion);
+     }
+
+     return handled;
+ }
+
+ /// <summary>
+ /// For persistent multishot recv, buffers payload bytes that arrive while no
+ /// managed receive operation is in the Waiting state.
+ /// </summary>
+ /// <returns>True when the shot was fully handled (buffered and its provided buffer recycled).</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe bool TryBufferEarlyPersistentMultishotRecvCompletion(
+     SocketAsyncContext context,
+     int result,
+     uint flags)
+ {
+     // Error/EOF shots (result <= 0) carry no payload to buffer; nothing to do.
+     if (result <= 0)
+     {
+         return true;
+     }
+
+     // Payload without a provided-buffer selection cannot be materialized here.
+     if ((flags & IoUringConstants.CqeFBuffer) == 0)
+     {
+         return false;
+     }
+
+     IoUringProvidedBufferRing? providedBufferRing = _engine._ioUringProvidedBufferRing;
+     if (providedBufferRing is null)
+     {
+         return false;
+     }
+
+     ushort bufferId = (ushort)(flags >> IoUringConstants.CqeBufferShift);
+     if (!providedBufferRing.TryAcquireBufferForCompletion(
+         bufferId,
+         out byte* providedBuffer,
+         out int providedBufferLength))
+     {
+         SocketsTelemetry.Log.IoUringProvidedBufferDepletion();
+         return false;
+     }
+
+     bool buffered = false;
+     try
+     {
+         // Guard against a malformed CQE claiming more bytes than the buffer holds.
+         if ((uint)result <= (uint)providedBufferLength)
+         {
+             // Fix: ReadOnlySpan requires its element type argument; the source is byte*,
+             // so this must construct a ReadOnlySpan<byte>.
+             buffered = context.TryBufferEarlyPersistentMultishotRecvData(
+                 new ReadOnlySpan<byte>(providedBuffer, result));
+             if (buffered)
+             {
+                 RecordProvidedBufferUtilizationIfEnabled(providedBufferRing, result);
+                 SocketsTelemetry.Log.IoUringPersistentMultishotRecvEarlyData();
+             }
+         }
+     }
+     finally
+     {
+         // Always return the buffer to the ring; a recycle failure downgrades the result.
+         buffered &= TryRecycleProvidedBufferFromCheckedOutState(
+             providedBufferRing,
+             bufferId,
+             reportFailureAsDepletion: false);
+     }
+
+     return buffered;
+ }
+
+ /// <summary>
+ /// Recycles a provided-buffer selection for completions that can no longer be
+ /// dispatched to a tracked operation (e.g., late multishot CQEs after cancel).
+ /// </summary>
+ /// <param name="flags">Raw CQE flags; the provided-buffer id is encoded in the upper bits.</param>
+ /// <param name="hasFixedRecvBuffer">Whether a separately checked-out fixed recv buffer must also be recycled.</param>
+ /// <param name="fixedRecvBufferId">Buffer id of the fixed recv buffer when <paramref name="hasFixedRecvBuffer"/> is true.</param>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe void RecycleUntrackedReceiveCompletionBuffers(
+ uint flags,
+ bool hasFixedRecvBuffer,
+ ushort fixedRecvBufferId)
+ {
+ IoUringProvidedBufferRing? providedBufferRing = _engine._ioUringProvidedBufferRing;
+ if (providedBufferRing is null)
+ {
+ return;
+ }
+
+ // No CQE-selected buffer: only the pre-checked-out fixed recv buffer (if any) needs recycling.
+ if ((flags & IoUringConstants.CqeFBuffer) == 0)
+ {
+ if (hasFixedRecvBuffer)
+ {
+ _ = TryRecycleProvidedBufferFromCheckedOutState(
+ providedBufferRing,
+ fixedRecvBufferId,
+ reportFailureAsDepletion: true);
+ }
+
+ return;
+ }
+
+ // Acquire the CQE-selected buffer first so it transitions to the checked-out
+ // state TryRecycleProvidedBufferFromCheckedOutState expects.
+ ushort bufferId = (ushort)(flags >> IoUringConstants.CqeBufferShift);
+ if (!providedBufferRing.TryAcquireBufferForCompletion(
+ bufferId,
+ out _,
+ out _))
+ {
+ SocketsTelemetry.Log.IoUringProvidedBufferDepletion();
+ }
+ else
+ {
+ _ = TryRecycleProvidedBufferFromCheckedOutState(
+ providedBufferRing,
+ bufferId,
+ reportFailureAsDepletion: false);
+ }
+
+ // The fixed recv buffer is tracked independently of the CQE-selected one.
+ if (hasFixedRecvBuffer)
+ {
+ _ = TryRecycleProvidedBufferFromCheckedOutState(
+ providedBufferRing,
+ fixedRecvBufferId,
+ reportFailureAsDepletion: true);
+ }
+ }
+
+ /// <summary>
+ /// Feeds a successful transfer's byte count into the adaptive provided-buffer
+ /// sizing heuristic when the engine has adaptive sizing enabled.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void RecordProvidedBufferUtilizationIfEnabled(
+ IoUringProvidedBufferRing providedBufferRing,
+ int bytesTransferred)
+ {
+ if (bytesTransferred > 0 && _engine._adaptiveBufferSizingEnabled)
+ {
+ Debug.Assert(_engine.IsCurrentThreadEventLoopThread(),
+ "Adaptive provided-buffer utilization tracking must run on the event-loop thread.");
+ providedBufferRing.RecordCompletionUtilization(bytesTransferred);
+ }
+ }
+
+ /// <summary>
+ /// Returns a checked-out provided buffer to the ring, emitting the recycle
+ /// counter on success or (optionally) the depletion counter on failure.
+ /// </summary>
+ /// <returns>true when the buffer was recycled back into the ring.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool TryRecycleProvidedBufferFromCheckedOutState(
+ IoUringProvidedBufferRing providedBufferRing,
+ ushort bufferId,
+ bool reportFailureAsDepletion)
+ {
+ if (providedBufferRing.TryRecycleBufferFromCompletion(bufferId))
+ {
+ SocketsTelemetry.Log.IoUringProvidedBufferRecycle();
+ return true;
+ }
+
+ if (reportFailureAsDepletion)
+ {
+ SocketsTelemetry.Log.IoUringProvidedBufferDepletion();
+ }
+
+ return false;
+ }
+
+ /// <summary>
+ /// Requeues a pending operation (inline re-prepare first, then the prepare
+ /// queue) or falls back to a readiness notification.
+ /// </summary>
+ /// <returns>true when a fallback readiness event was enqueued to the event queue.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool DispatchPendingIoUringOperation(SocketAsyncContext.AsyncOperation operation, ulong userData)
+ {
+ PendingIoUringReprepareResult inlineReprepareResult = TryDispatchPendingIoUringOperationInline(operation);
+ if (inlineReprepareResult == PendingIoUringReprepareResult.Prepared)
+ {
+ return false;
+ }
+
+ // Inline re-prepare was skipped: try the cross-thread prepare queue instead.
+ if (inlineReprepareResult == PendingIoUringReprepareResult.NotAttempted &&
+ operation.TryQueueIoUringPreparation())
+ {
+ SocketAsyncEngine.RecordIoUringPendingRetryQueuedToPrepareQueue();
+ return false;
+ }
+
+ Debug.Assert(
+ inlineReprepareResult == PendingIoUringReprepareResult.Failed ||
+ !_engine._ioUringCapabilities.IsCompletionMode,
+ "Requeue should not fail in pure io_uring completion mode when inline re-prepare was not attempted.");
+
+ // From here on the operation can no longer be tracked by io_uring user_data;
+ // detach it and fall back to epoll-style readiness notification if possible.
+ _engine.RecordIoUringCompletionRequeueFailure(userData);
+ operation.ClearIoUringUserData();
+ Interop.Sys.SocketEvents fallbackEvents = operation.GetIoUringFallbackSocketEvents();
+ if (fallbackEvents == Interop.Sys.SocketEvents.None)
+ {
+ return false;
+ }
+
+ if (NetEventSource.Log.IsEnabled())
+ {
+ LogUnexpectedCompletionFallback(_engine, fallbackEvents, userData);
+ }
+ _eventQueue.Enqueue(new SocketIOEvent(operation.AssociatedContext, fallbackEvents));
+ return true;
+
+ // Cold logging path kept out of the dispatch fast path.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static void LogUnexpectedCompletionFallback(SocketAsyncEngine engine, Interop.Sys.SocketEvents events, ulong completionUserData)
+ {
+ NetEventSource.Error(
+ engine,
+ $"io_uring completion fallback to readiness notification in unexpected path: events={events}, user_data=0x{completionUserData:x}");
+ }
+ }
+
+ /// <summary>
+ /// Outcome of an attempt to re-prepare a pending operation inline on the
+ /// event-loop thread (see TryDispatchPendingIoUringOperationInline).
+ /// </summary>
+ private enum PendingIoUringReprepareResult : byte
+ {
+ NotAttempted = 0,
+ Prepared = 1,
+ Failed = 2
+ }
+
+ /// <summary>
+ /// Attempts to re-prepare a pending operation inline on the event-loop thread,
+ /// avoiding an extra prepare-queue round-trip for completion-mode retries.
+ /// Returns whether inline re-prepare was prepared, skipped, or failed without producing an SQE.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private PendingIoUringReprepareResult TryDispatchPendingIoUringOperationInline(SocketAsyncContext.AsyncOperation operation)
+ {
+ // Inline re-prepare is only valid in completion mode and on the event-loop thread.
+ if (!_engine._ioUringCapabilities.IsCompletionMode || !_engine.IsCurrentThreadEventLoopThread())
+ {
+ return PendingIoUringReprepareResult.NotAttempted;
+ }
+
+ long prepareSequence = operation.MarkReadyForIoUringPreparation();
+ Interop.Error prepareError = _engine.TryPrepareAndTrackIoUringOperation(
+ operation,
+ prepareSequence,
+ out bool preparedSqe);
+ if (prepareError != Interop.Error.SUCCESS)
+ {
+ ThrowInternalException(prepareError);
+ // Unreachable: ThrowInternalException is [DoesNotReturn]; this return only
+ // satisfies the compiler's definite-return analysis (CS0161).
+ return PendingIoUringReprepareResult.Failed;
+ }
+
+ return preparedSqe ? PendingIoUringReprepareResult.Prepared : PendingIoUringReprepareResult.Failed;
+ }
+
+ /// <summary>
+ /// Routes a CQE completion result to the appropriate dispatch behavior:
+ /// completed dispatch, pending requeue, or benign late-completion accounting.
+ /// </summary>
+ /// <param name="enqueuedFallbackEvent">Set to true when a pending requeue fell back to a readiness event.</param>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void DispatchIoUringCompletionResult(
+ SocketAsyncContext.AsyncOperation operation,
+ SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionResult,
+ ulong userData,
+ ref bool enqueuedFallbackEvent)
+ {
+ switch (completionResult)
+ {
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed:
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Completed);
+ operation.ClearIoUringUserData();
+ DispatchCompletedIoUringOperation(operation, userData);
+ break;
+
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Pending:
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Queued);
+ // When preparation resources can be reused, keep the user_data mapping in
+ // the requeue-ready state instead of tearing it down and rebuilding it.
+ if (operation.ShouldReuseIoUringPreparationResourcesOnPending)
+ {
+ operation.MarkIoUringPreparationReusable();
+ operation.ResetIoUringUserDataForRequeue();
+ }
+ else
+ {
+ operation.ClearIoUringUserData();
+ }
+
+ enqueuedFallbackEvent |= DispatchPendingIoUringOperation(operation, userData);
+ break;
+
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Canceled:
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Ignored:
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Canceled);
+ operation.ClearIoUringUserData();
+ _engine.RecordBenignLateIoUringCompletion(userData);
+ break;
+
+ default:
+ // Unknown results are treated like benign late completions after failing in DEBUG.
+ Debug.Fail($"Unexpected io_uring completion result: {completionResult}");
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Detached);
+ operation.ClearIoUringUserData();
+ _engine.RecordBenignLateIoUringCompletion(userData);
+ break;
+ }
+ }
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs
index ae9b6c9095e43f..0a59231138c169 100644
--- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs
@@ -4,20 +4,22 @@
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading;
namespace System.Net.Sockets
{
- internal sealed unsafe class SocketAsyncEngine : IThreadPoolWorkItem
+ internal sealed unsafe partial class SocketAsyncEngine : IThreadPoolWorkItem
{
- private const int EventBufferCount =
+ private const int DefaultEventBufferCount =
#if DEBUG
32;
#else
1024;
#endif
+ private static readonly int s_eventBufferCount = GetEventBufferCount();
// Socket continuations are dispatched to the ThreadPool from the event thread.
// This avoids continuations blocking the event handling.
@@ -25,9 +27,31 @@ internal sealed unsafe class SocketAsyncEngine : IThreadPoolWorkItem
// PreferInlineCompletions defaults to false and can be set to true using the DOTNET_SYSTEM_NET_SOCKETS_INLINE_COMPLETIONS envvar.
internal static readonly bool InlineSocketCompletionsEnabled = Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_INLINE_COMPLETIONS") == "1";
+ /// <summary>
+ /// Returns the socket event buffer size: DefaultEventBufferCount, optionally
+ /// overridden (DEBUG-only, Linux-only) by a test environment variable.
+ /// </summary>
+ private static int GetEventBufferCount()
+ {
+#if DEBUG
+ // Test-only knob to make wait-buffer saturation deterministic for io_uring diagnostics coverage.
+ // Only available in DEBUG builds so production code never reads test env vars.
+ if (OperatingSystem.IsLinux())
+ {
+ string? configuredValue = Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_EVENT_BUFFER_COUNT");
+ // Clamp to [1, DefaultEventBufferCount]; out-of-range or unparsable values are ignored.
+ if (configuredValue is not null &&
+ int.TryParse(configuredValue, out int parsedValue) &&
+ parsedValue >= 1 &&
+ parsedValue <= DefaultEventBufferCount)
+ {
+ return parsedValue;
+ }
+ }
+#endif
+
+ return DefaultEventBufferCount;
+ }
+
private static int GetEngineCount()
{
// The responsibility of SocketAsyncEngine is to get notifications from epoll|kqueue
+ // (or io_uring on Linux when enabled in the native shim)
// and schedule corresponding work items to ThreadPool (socket reads and writes).
//
// Using TechEmpower benchmarks that generate a LOT of SMALL socket reads and writes under a VERY HIGH load
@@ -85,6 +109,7 @@ private static SocketAsyncEngine[] CreateEngines()
private readonly IntPtr _port;
private readonly Interop.Sys.SocketEvent* _buffer;
+ private int _eventLoopManagedThreadId;
//
// Queue of events generated by EventLoop() that would be processed by the thread pool
@@ -143,8 +168,20 @@ private bool TryRegisterCore(IntPtr socketHandle, SocketAsyncContext context, ou
context.GlobalContextIndex = index;
}
- error = Interop.Sys.TryChangeSocketEventRegistration(_port, socketHandle, Interop.Sys.SocketEvents.None,
- Interop.Sys.SocketEvents.Read | Interop.Sys.SocketEvents.Write, context.GlobalContextIndex);
+ Interop.Error managedError = default;
+ bool managedHandled = false;
+ LinuxTryChangeSocketEventRegistration(socketHandle, Interop.Sys.SocketEvents.None,
+ Interop.Sys.SocketEvents.Read | Interop.Sys.SocketEvents.Write,
+ context.GlobalContextIndex, ref managedError, ref managedHandled);
+ if (managedHandled)
+ {
+ error = managedError;
+ }
+ else
+ {
+ error = Interop.Sys.TryChangeSocketEventRegistration(_port, socketHandle, Interop.Sys.SocketEvents.None,
+ Interop.Sys.SocketEvents.Read | Interop.Sys.SocketEvents.Write, context.GlobalContextIndex);
+ }
if (error == Interop.Error.SUCCESS)
{
return true;
@@ -182,19 +219,21 @@ private SocketAsyncEngine()
err = Interop.Sys.CreateSocketEventPort(portPtr);
if (err != Interop.Error.SUCCESS)
{
- throw new InternalException(err);
+ ThrowInternalException(err);
}
}
fixed (Interop.Sys.SocketEvent** bufferPtr = &_buffer)
{
- err = Interop.Sys.CreateSocketEventBuffer(EventBufferCount, bufferPtr);
+ err = Interop.Sys.CreateSocketEventBuffer(s_eventBufferCount, bufferPtr);
if (err != Interop.Error.SUCCESS)
{
- throw new InternalException(err);
+ ThrowInternalException(err);
}
}
+ LinuxDetectAndInitializeIoUring();
+
var thread = new Thread(static s => ((SocketAsyncEngine)s!).EventLoop())
{
IsBackground = true,
@@ -209,32 +248,78 @@ private SocketAsyncEngine()
}
}
+ // Linux-only hook points into the io_uring backend. These are partial methods:
+ // on platforms where no implementation is provided, the declarations and all
+ // call sites compile away entirely.
+ partial void LinuxDetectAndInitializeIoUring();
+ partial void LinuxEventLoopBeforeWait();
+ partial void LinuxEventLoopTryCompletionWait(SocketEventHandler handler, ref int numEvents, ref int numCompletions, ref Interop.Error err, ref bool waitHandled);
+ partial void LinuxEventLoopAfterIteration();
+ partial void LinuxBeforeFreeNativeResources(ref bool closeSocketEventPort);
+ partial void LinuxFreeIoUringResources();
+ partial void LinuxTryChangeSocketEventRegistration(IntPtr socketHandle, Interop.Sys.SocketEvents currentEvents, Interop.Sys.SocketEvents newEvents, int data, ref Interop.Error error, ref bool handled);
+
+ /// <summary>Throw helper so call sites contain no throw statement of their own.</summary>
+ [DoesNotReturn]
+ [StackTraceHidden]
+ private static void ThrowInternalException(Interop.Error error) =>
+ throw new InternalException(error);
+
+ /// <summary>Cold fail-fast path for event-loop exceptions; NoInlining keeps it out of the caller.</summary>
+ [DoesNotReturn]
+ [StackTraceHidden]
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void FailFastEventLoop(Exception exception) =>
+ Environment.FailFast($"Exception thrown from SocketAsyncEngine event loop: {exception}", exception);
+
+ /// <summary>
+ /// Records the managed thread id of the event loop on its first iteration
+ /// (CAS from 0); DEBUG builds assert the loop never migrates to another thread.
+ /// </summary>
+ private void RecordAndAssertEventLoopThreadIdentity()
+ {
+ int currentThreadId = Environment.CurrentManagedThreadId;
+#if DEBUG
+ int previousThreadId = Interlocked.CompareExchange(ref _eventLoopManagedThreadId, currentThreadId, 0);
+ Debug.Assert(
+ previousThreadId == 0 || previousThreadId == currentThreadId,
+ $"SocketAsyncEngine event loop thread changed: previous={previousThreadId}, current={currentThreadId}");
+#else
+ Interlocked.CompareExchange(ref _eventLoopManagedThreadId, currentThreadId, 0);
+#endif
+ }
+
private void EventLoop()
{
try
{
+ RecordAndAssertEventLoopThreadIdentity();
SocketEventHandler handler = new SocketEventHandler(this);
while (true)
{
- int numEvents = EventBufferCount;
- Interop.Error err = Interop.Sys.WaitForSocketEvents(_port, handler.Buffer, &numEvents);
+ LinuxEventLoopBeforeWait();
+
+ int numEvents = s_eventBufferCount;
+ int numCompletions = 0;
+ Interop.Error err = default;
+ bool waitHandled = false;
+ LinuxEventLoopTryCompletionWait(handler, ref numEvents, ref numCompletions, ref err, ref waitHandled);
+ if (!waitHandled)
+ {
+ err = Interop.Sys.WaitForSocketEvents(_port, handler.Buffer, &numEvents);
+ }
+
if (err != Interop.Error.SUCCESS)
{
- throw new InternalException(err);
+ ThrowInternalException(err);
}
- // The native shim is responsible for ensuring this condition.
- Debug.Assert(numEvents > 0, $"Unexpected numEvents: {numEvents}");
+ // io_uring completion-mode wait can return with zero surfaced events/completions
+ // when woken only to flush managed prepare/cancel queues.
+ Debug.Assert(waitHandled || numEvents > 0 || numCompletions > 0, $"Unexpected wait result: events={numEvents}, completions={numCompletions}");
- if (handler.HandleSocketEvents(numEvents))
+ if (numEvents > 0 && handler.HandleSocketEvents(numEvents))
{
EnsureWorkerScheduled();
}
+
+ LinuxEventLoopAfterIteration();
}
}
catch (Exception e)
{
- Environment.FailFast("Exception thrown from SocketAsyncEngine event loop: " + e.ToString(), e);
+ FailFastEventLoop(e);
}
}
@@ -295,11 +380,19 @@ void IThreadPoolWorkItem.Execute()
private void FreeNativeResources()
{
+ bool closeSocketEventPort = true;
+ // Linux io_uring teardown may need to close the port first to ensure native
+ // ownership is detached before managed operation resources are released.
+ LinuxBeforeFreeNativeResources(ref closeSocketEventPort);
+
+ LinuxFreeIoUringResources();
+
if (_buffer != null)
{
Interop.Sys.FreeSocketEventBuffer(_buffer);
}
- if (_port != (IntPtr)(-1))
+
+ if (closeSocketEventPort && _port != (IntPtr)(-1))
{
Interop.Sys.CloseSocketEventPort(_port);
}
@@ -310,14 +403,16 @@ private void FreeNativeResources()
// To avoid this, the event handling logic is delegated to a non-inlined processing method.
// See discussion: https://github.com/dotnet/runtime/issues/37064
// SocketEventHandler holds an on-stack cache of SocketAsyncEngine members needed by the handler method.
- private readonly struct SocketEventHandler
+ private readonly partial struct SocketEventHandler
{
public Interop.Sys.SocketEvent* Buffer { get; }
 private readonly ConcurrentQueue<SocketIOEvent> _eventQueue;
+ private readonly SocketAsyncEngine _engine;
public SocketEventHandler(SocketAsyncEngine engine)
{
+ _engine = engine;
Buffer = engine._buffer;
_eventQueue = engine._eventQueue;
}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.IoUring.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.IoUring.Linux.cs
new file mode 100644
index 00000000000000..38d7ef78334b34
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.IoUring.Linux.cs
@@ -0,0 +1,12 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+namespace System.Net.Sockets
+{
+ internal static partial class SocketPal
+ {
+ /// <summary>
+ /// Extracts <see cref="IPPacketInformation"/> from a completed io_uring recvmsg message header.
+ /// Thin forwarder to the shared <c>GetIPPacketInformation</c> helper.
+ /// </summary>
+ internal static unsafe IPPacketInformation GetIoUringIPPacketInformation(Interop.Sys.MessageHeader* messageHeader, bool isIPv4, bool isIPv6) =>
+ GetIPPacketInformation(messageHeader, isIPv4, isIPv6);
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs
index 1171961a204351..81c19d9b082918 100644
--- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs
@@ -14,6 +14,46 @@ internal sealed partial class SocketsTelemetry : EventSource
private const string ConnectActivityName = ActivitySourceName + ".Connect";
private static readonly ActivitySource s_connectActivitySource = new ActivitySource(ActivitySourceName);
+ /// <summary>EventSource keyword bits understood by SocketsTelemetry.</summary>
+ internal static class Keywords
+ {
+ // Stable operational counters are always published when the source is enabled on Linux.
+ // Diagnostic counters are opt-in and can evolve without name stability guarantees.
+ internal const EventKeywords IoUringDiagnostics = (EventKeywords)0x1;
+ }
+
+ /// <summary>Stable names for the always-on io_uring operational counters.</summary>
+ internal static class IoUringCounterNames
+ {
+ internal const string PrepareNonPinnableFallbacks = "io-uring-prepare-nonpinnable-fallbacks";
+ internal const string SocketEventBufferFull = "io-uring-socket-event-buffer-full";
+ internal const string CqOverflow = "io-uring-cq-overflow";
+ internal const string PrepareQueueOverflows = "io-uring-prepare-queue-overflows";
+ internal const string PrepareQueueOverflowFallbacks = "io-uring-prepare-queue-overflow-fallbacks";
+ internal const string CompletionSlotExhaustions = "io-uring-completion-slot-exhaustions";
+ internal const string SqPollWakeups = "io-uring-sqpoll-wakeups";
+ internal const string SqPollSubmissionsSkipped = "io-uring-sqpoll-submissions-skipped";
+ }
+
+ /// <summary>Names for opt-in io_uring diagnostic counters (no name-stability guarantee; see Keywords).</summary>
+ internal static class IoUringDiagnosticCounterNames
+ {
+ internal const string AsyncCancelRequestCqes = "io-uring-async-cancel-request-cqes";
+ internal const string CompletionRequeueFailures = "io-uring-completion-requeue-failures";
+ internal const string PrepareQueueDepth = "io-uring-prepare-queue-depth";
+ internal const string CompletionSlotDrainRecoveries = "io-uring-completion-slot-drain-recoveries";
+ internal const string ProvidedBufferDepletions = "io-uring-provided-buffer-depletions";
+ internal const string ProvidedBufferCurrentSize = "io-uring-provided-buffer-current-size";
+ internal const string ProvidedBufferRecycles = "io-uring-provided-buffer-recycles";
+ internal const string ProvidedBufferResizes = "io-uring-provided-buffer-resizes";
+ internal const string RegisteredBuffersInitialSuccess = "io-uring-registered-buffers-initial-success";
+ internal const string RegisteredBuffersInitialFailure = "io-uring-registered-buffers-initial-failure";
+ internal const string RegisteredBuffersReregistrationSuccess = "io-uring-registered-buffers-reregistration-success";
+ internal const string RegisteredBuffersReregistrationFailure = "io-uring-registered-buffers-reregistration-failure";
+ internal const string FixedRecvSelected = "io-uring-fixed-recv-selected";
+ internal const string FixedRecvFallbacks = "io-uring-fixed-recv-fallbacks";
+ internal const string PersistentMultishotRecvReuse = "io-uring-persistent-multishot-recv-reuse";
+ internal const string PersistentMultishotRecvTermination = "io-uring-persistent-multishot-recv-termination";
+ internal const string PersistentMultishotRecvEarlyData = "io-uring-persistent-multishot-recv-early-data";
+ }
+
public static readonly SocketsTelemetry Log = new SocketsTelemetry();
private PollingCounter? _currentOutgoingConnectAttemptsCounter;
@@ -23,6 +63,33 @@ internal sealed partial class SocketsTelemetry : EventSource
private PollingCounter? _bytesSentCounter;
private PollingCounter? _datagramsReceivedCounter;
private PollingCounter? _datagramsSentCounter;
+ // Keep io_uring counter backing fields always present so EventCounter name contracts remain stable
+ // across platforms; OnEventCommand only registers these counters on Linux.
+ private PollingCounter? _ioUringPrepareNonPinnableFallbacksCounter;
+ private PollingCounter? _ioUringAsyncCancelRequestCqesCounter;
+ private PollingCounter? _ioUringSocketEventBufferFullCounter;
+ private PollingCounter? _ioUringCqOverflowCounter;
+ private PollingCounter? _ioUringCompletionRequeueFailuresCounter;
+ private PollingCounter? _ioUringPrepareQueueOverflowsCounter;
+ private PollingCounter? _ioUringPrepareQueueOverflowFallbacksCounter;
+ private PollingCounter? _ioUringPrepareQueueDepthCounter;
+ private PollingCounter? _ioUringCompletionSlotExhaustionsCounter;
+ private PollingCounter? _ioUringCompletionSlotDrainRecoveriesCounter;
+ private PollingCounter? _ioUringProvidedBufferDepletionsCounter;
+ private PollingCounter? _ioUringProvidedBufferCurrentSizeCounter;
+ private PollingCounter? _ioUringProvidedBufferRecyclesCounter;
+ private PollingCounter? _ioUringProvidedBufferResizesCounter;
+ private PollingCounter? _ioUringRegisteredBuffersInitialSuccessCounter;
+ private PollingCounter? _ioUringRegisteredBuffersInitialFailureCounter;
+ private PollingCounter? _ioUringRegisteredBuffersReregistrationSuccessCounter;
+ private PollingCounter? _ioUringRegisteredBuffersReregistrationFailureCounter;
+ private PollingCounter? _ioUringFixedRecvSelectedCounter;
+ private PollingCounter? _ioUringFixedRecvFallbacksCounter;
+ private PollingCounter? _ioUringSqPollWakeupsCounter;
+ private PollingCounter? _ioUringSqPollSubmissionsSkippedCounter;
+ private PollingCounter? _ioUringPersistentMultishotRecvReuseCounter;
+ private PollingCounter? _ioUringPersistentMultishotRecvTerminationCounter;
+ private PollingCounter? _ioUringPersistentMultishotRecvEarlyDataCounter;
private long _currentOutgoingConnectAttempts;
private long _outgoingConnectionsEstablished;
@@ -31,6 +98,32 @@ internal sealed partial class SocketsTelemetry : EventSource
private long _bytesSent;
private long _datagramsReceived;
private long _datagramsSent;
+ // Backing fields stay cross-platform for contract stability; they are only surfaced as counters on Linux.
+ private long _ioUringPrepareNonPinnableFallbacks;
+ private long _ioUringAsyncCancelRequestCqes;
+ private long _ioUringSocketEventBufferFull;
+ private long _ioUringCqOverflow;
+ private long _ioUringCompletionRequeueFailures;
+ private long _ioUringPrepareQueueOverflows;
+ private long _ioUringPrepareQueueOverflowFallbacks;
+ private long _ioUringPrepareQueueDepth;
+ private long _ioUringCompletionSlotExhaustions;
+ private long _ioUringCompletionSlotDrainRecoveries;
+ private long _ioUringProvidedBufferDepletions;
+ private long _ioUringProvidedBufferCurrentSize;
+ private long _ioUringProvidedBufferRecycles;
+ private long _ioUringProvidedBufferResizes;
+ private long _ioUringRegisteredBuffersInitialSuccess;
+ private long _ioUringRegisteredBuffersInitialFailure;
+ private long _ioUringRegisteredBuffersReregistrationSuccess;
+ private long _ioUringRegisteredBuffersReregistrationFailure;
+ private long _ioUringFixedRecvSelected;
+ private long _ioUringFixedRecvFallbacks;
+ private long _ioUringSqPollWakeups;
+ private long _ioUringSqPollSubmissionsSkipped;
+ private long _ioUringPersistentMultishotRecvReuse;
+ private long _ioUringPersistentMultishotRecvTermination;
+ private long _ioUringPersistentMultishotRecvEarlyData;
[Event(1, Level = EventLevel.Informational)]
private void ConnectStart(string? address)
@@ -80,6 +173,15 @@ private void AcceptFailed(SocketError error, string? exceptionMessage)
}
}
+ /// <summary>Emits event 7 describing the selected socket engine backend (see ReportSocketEngineBackendSelected).</summary>
+ [Event(7, Level = EventLevel.Informational)]
+ private void SocketEngineBackendSelected(string backend, int isIoUringPort, int sqPollEnabled)
+ {
+ if (IsEnabled(EventLevel.Informational, EventKeywords.All))
+ {
+ WriteEvent(eventId: 7, backend, isIoUringPort, sqPollEnabled);
+ }
+ }
+
[NonEvent]
public Activity? ConnectStart(SocketAddress address, ProtocolType protocolType, EndPoint endPoint, bool keepActivityCurrent)
{
@@ -189,6 +291,20 @@ public void AcceptStart(EndPoint address)
}
}
+ /// <summary>
+ /// Reports which backend the socket engine selected: "io_uring_completion" when
+ /// in completion mode, otherwise "epoll". No-op when the source is not enabled.
+ /// </summary>
+ [NonEvent]
+ internal void ReportSocketEngineBackendSelected(bool isIoUringPort, bool isCompletionMode, bool sqPollEnabled)
+ {
+ if (!IsEnabled(EventLevel.Informational, EventKeywords.All))
+ {
+ return;
+ }
+
+ SocketEngineBackendSelected(
+ isCompletionMode ? "io_uring_completion" : "epoll",
+ isIoUringPort ? 1 : 0,
+ sqPollEnabled ? 1 : 0);
+ }
+
[NonEvent]
public void AfterAccept(SocketError error, string? exceptionMessage = null)
{
@@ -231,6 +347,182 @@ public void DatagramSent()
Interlocked.Increment(ref _datagramsSent);
}
+ /// <summary>Thread-safely adds <paramref name="count"/> to the prepare non-pinnable fallback counter.</summary>
+ [NonEvent]
+ public void IoUringPrepareNonPinnableFallback(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringPrepareNonPinnableFallbacks, count);
+ }
+
+ /// <summary>Thread-safely adds <paramref name="count"/> to the async-cancel-request CQE counter.</summary>
+ [NonEvent]
+ public void IoUringAsyncCancelRequestCqes(long count)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringAsyncCancelRequestCqes, count);
+ }
+
+ /// <summary>Thread-safely adds <paramref name="count"/> to the socket-event-buffer-full counter.</summary>
+ [NonEvent]
+ public void IoUringSocketEventBufferFull(long count)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringSocketEventBufferFull, count);
+ }
+
+ /// <summary>Thread-safely adds <paramref name="count"/> to the completion-queue-overflow counter.</summary>
+ [NonEvent]
+ public void IoUringCqOverflow(long count)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringCqOverflow, count);
+ }
+
+ /// <summary>Thread-safely adds <paramref name="count"/> to the completion-requeue-failure counter.</summary>
+ [NonEvent]
+ public void IoUringCompletionRequeueFailure(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringCompletionRequeueFailures, count);
+ }
+
+ /// <summary>Thread-safely adds <paramref name="count"/> to the prepare-queue-overflow counter.</summary>
+ [NonEvent]
+ public void IoUringPrepareQueueOverflow(long count)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringPrepareQueueOverflows, count);
+ }
+
+ /// <summary>Thread-safely adds <paramref name="count"/> to the prepare-queue-overflow-fallback counter.</summary>
+ [NonEvent]
+ public void IoUringPrepareQueueOverflowFallback(long count)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringPrepareQueueOverflowFallbacks, count);
+ }
+
+ /// <summary>Applies a signed delta to the prepare-queue depth gauge; asserts the depth never goes negative.</summary>
+ [NonEvent]
+ public void IoUringPrepareQueueDepthDelta(long delta)
+ {
+ long value = Interlocked.Add(ref _ioUringPrepareQueueDepth, delta);
+ Debug.Assert(value >= 0, $"io_uring prepare queue depth cannot be negative: {value}");
+ }
+
+ /// <summary>Thread-safely adds <paramref name="count"/> to the completion-slot-exhaustion counter.</summary>
+ [NonEvent]
+ public void IoUringCompletionSlotExhaustion(long count)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringCompletionSlotExhaustions, count);
+ }
+
+ /// <summary>Thread-safely adds <paramref name="count"/> to the completion-slot-drain-recovery counter.</summary>
+ [NonEvent]
+ public void IoUringCompletionSlotDrainRecovery(long count)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringCompletionSlotDrainRecoveries, count);
+ }
+
+ /// <summary>Thread-safely adds <paramref name="count"/> to the provided-buffer-depletion counter.</summary>
+ [NonEvent]
+ public void IoUringProvidedBufferDepletion(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringProvidedBufferDepletions, count);
+ }
+
+ /// <summary>Publishes the current provided-buffer size gauge via a volatile write (last writer wins).</summary>
+ [NonEvent]
+ public void IoUringProvidedBufferCurrentSize(int size)
+ {
+ Debug.Assert(size >= 0);
+ Volatile.Write(ref _ioUringProvidedBufferCurrentSize, size);
+ }
+
+ /// <summary>Thread-safely adds <paramref name="count"/> to the provided-buffer-recycle counter.</summary>
+ [NonEvent]
+ public void IoUringProvidedBufferRecycle(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringProvidedBufferRecycles, count);
+ }
+
+ /// <summary>Thread-safely adds <paramref name="count"/> to the provided-buffer-resize counter.</summary>
+ [NonEvent]
+ public void IoUringProvidedBufferResize(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringProvidedBufferResizes, count);
+ }
+
+ /// <summary>Records the outcome of the initial registered-buffers registration.</summary>
+ /// <remarks>
+ /// NOTE(review): bufferCount/bufferSize are validated but not recorded anywhere —
+ /// presumably reserved for a future event payload; confirm.
+ /// </remarks>
+ [NonEvent]
+ public void IoUringRegisteredBuffersResult(bool success, int bufferCount, int bufferSize)
+ {
+ Debug.Assert(bufferCount >= 0);
+ Debug.Assert(bufferSize >= 0);
+
+ if (success)
+ {
+ Interlocked.Increment(ref _ioUringRegisteredBuffersInitialSuccess);
+ }
+ else
+ {
+ Interlocked.Increment(ref _ioUringRegisteredBuffersInitialFailure);
+ }
+ }
+
+ /// <summary>Increments the registered-buffers re-registration success or failure counter.</summary>
+ [NonEvent]
+ public void IoUringRegisteredBuffersReregistration(bool success)
+ {
+ if (success)
+ {
+ Interlocked.Increment(ref _ioUringRegisteredBuffersReregistrationSuccess);
+ }
+ else
+ {
+ Interlocked.Increment(ref _ioUringRegisteredBuffersReregistrationFailure);
+ }
+ }
+
+ /// <summary>Thread-safely adds <paramref name="count"/> to the fixed-recv-selected counter.</summary>
+ [NonEvent]
+ public void IoUringFixedRecvSelected(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringFixedRecvSelected, count);
+ }
+
+ /// <summary>Thread-safely adds <paramref name="count"/> to the fixed-recv-fallback counter.</summary>
+ [NonEvent]
+ public void IoUringFixedRecvFallback(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringFixedRecvFallbacks, count);
+ }
+
+ /// <summary>Thread-safely adds <paramref name="count"/> to the SQPOLL wakeup counter.</summary>
+ [NonEvent]
+ public void IoUringSqPollWakeup(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringSqPollWakeups, count);
+ }
+
+ /// <summary>Thread-safely adds <paramref name="count"/> to the SQPOLL submissions-skipped counter.</summary>
+ [NonEvent]
+ public void IoUringSqPollSubmissionSkipped(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringSqPollSubmissionsSkipped, count);
+ }
+
+ /// <summary>Thread-safely adds <paramref name="count"/> to the persistent-multishot-recv reuse counter.</summary>
+ [NonEvent]
+ public void IoUringPersistentMultishotRecvReuse(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringPersistentMultishotRecvReuse, count);
+ }
+
+ /// <summary>Thread-safely adds <paramref name="count"/> to the persistent-multishot-recv termination counter.</summary>
+ [NonEvent]
+ public void IoUringPersistentMultishotRecvTermination(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringPersistentMultishotRecvTermination, count);
+ }
+
+ /// <summary>Thread-safely adds <paramref name="count"/> to the persistent-multishot-recv early-data counter.</summary>
+ [NonEvent]
+ public void IoUringPersistentMultishotRecvEarlyData(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringPersistentMultishotRecvEarlyData, count);
+ }
+
private static string GetErrorType(SocketError socketError) => socketError switch
{
// Common connect() errors expected to be seen:
@@ -291,6 +583,118 @@ protected override void OnEventCommand(EventCommandEventArgs command)
{
DisplayName = "Datagrams Sent",
};
+
+ if (!OperatingSystem.IsLinux())
+ {
+ return;
+ }
+
+ _ioUringPrepareNonPinnableFallbacksCounter ??= new PollingCounter(IoUringCounterNames.PrepareNonPinnableFallbacks, this, () => Interlocked.Read(ref _ioUringPrepareNonPinnableFallbacks))
+ {
+ DisplayName = "io_uring Prepare Non-Pinnable Fallbacks",
+ };
+ _ioUringSocketEventBufferFullCounter ??= new PollingCounter(IoUringCounterNames.SocketEventBufferFull, this, () => Interlocked.Read(ref _ioUringSocketEventBufferFull))
+ {
+ DisplayName = "io_uring Socket Event Buffer Full",
+ };
+ _ioUringCqOverflowCounter ??= new PollingCounter(IoUringCounterNames.CqOverflow, this, () => Interlocked.Read(ref _ioUringCqOverflow))
+ {
+ DisplayName = "io_uring Completion Queue Overflow",
+ };
+ _ioUringPrepareQueueOverflowsCounter ??= new PollingCounter(IoUringCounterNames.PrepareQueueOverflows, this, () => Interlocked.Read(ref _ioUringPrepareQueueOverflows))
+ {
+ DisplayName = "io_uring Prepare Queue Overflows",
+ };
+ _ioUringPrepareQueueOverflowFallbacksCounter ??= new PollingCounter(IoUringCounterNames.PrepareQueueOverflowFallbacks, this, () => Interlocked.Read(ref _ioUringPrepareQueueOverflowFallbacks))
+ {
+ DisplayName = "io_uring Prepare Queue Overflow Fallbacks",
+ };
+ _ioUringCompletionSlotExhaustionsCounter ??= new PollingCounter(IoUringCounterNames.CompletionSlotExhaustions, this, () => Interlocked.Read(ref _ioUringCompletionSlotExhaustions))
+ {
+ DisplayName = "io_uring Completion Slot Exhaustions",
+ };
+ _ioUringSqPollWakeupsCounter ??= new PollingCounter(IoUringCounterNames.SqPollWakeups, this, () => Interlocked.Read(ref _ioUringSqPollWakeups))
+ {
+ DisplayName = "io_uring SQPOLL Wakeups",
+ };
+ _ioUringSqPollSubmissionsSkippedCounter ??= new PollingCounter(IoUringCounterNames.SqPollSubmissionsSkipped, this, () => Interlocked.Read(ref _ioUringSqPollSubmissionsSkipped))
+ {
+ DisplayName = "io_uring SQPOLL Submissions Skipped",
+ };
+
+ if (!IsEnabled(EventLevel.LogAlways, Keywords.IoUringDiagnostics))
+ {
+ return;
+ }
+
+ _ioUringAsyncCancelRequestCqesCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.AsyncCancelRequestCqes, this, () => Interlocked.Read(ref _ioUringAsyncCancelRequestCqes))
+ {
+ DisplayName = "io_uring Async-Cancel Request CQEs",
+ };
+ _ioUringCompletionRequeueFailuresCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.CompletionRequeueFailures, this, () => Interlocked.Read(ref _ioUringCompletionRequeueFailures))
+ {
+ DisplayName = "io_uring Completion Requeue Failures",
+ };
+ _ioUringPrepareQueueDepthCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.PrepareQueueDepth, this, () => Interlocked.Read(ref _ioUringPrepareQueueDepth))
+ {
+ DisplayName = "io_uring Prepare Queue Depth",
+ };
+ _ioUringCompletionSlotDrainRecoveriesCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.CompletionSlotDrainRecoveries, this, () => Interlocked.Read(ref _ioUringCompletionSlotDrainRecoveries))
+ {
+ DisplayName = "io_uring Completion Slot Drain Recoveries",
+ };
+ _ioUringProvidedBufferDepletionsCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.ProvidedBufferDepletions, this, () => Interlocked.Read(ref _ioUringProvidedBufferDepletions))
+ {
+ DisplayName = "io_uring Provided Buffer Depletions",
+ };
+ _ioUringProvidedBufferCurrentSizeCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.ProvidedBufferCurrentSize, this, () => Volatile.Read(ref _ioUringProvidedBufferCurrentSize))
+ {
+ DisplayName = "io_uring Provided Buffer Current Size",
+ };
+ _ioUringProvidedBufferRecyclesCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.ProvidedBufferRecycles, this, () => Interlocked.Read(ref _ioUringProvidedBufferRecycles))
+ {
+ DisplayName = "io_uring Provided Buffer Recycles",
+ };
+ _ioUringProvidedBufferResizesCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.ProvidedBufferResizes, this, () => Interlocked.Read(ref _ioUringProvidedBufferResizes))
+ {
+ DisplayName = "io_uring Provided Buffer Resizes",
+ };
+ _ioUringRegisteredBuffersInitialSuccessCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.RegisteredBuffersInitialSuccess, this, () => Interlocked.Read(ref _ioUringRegisteredBuffersInitialSuccess))
+ {
+ DisplayName = "io_uring Registered Buffers Initial Success",
+ };
+ _ioUringRegisteredBuffersInitialFailureCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.RegisteredBuffersInitialFailure, this, () => Interlocked.Read(ref _ioUringRegisteredBuffersInitialFailure))
+ {
+ DisplayName = "io_uring Registered Buffers Initial Failure",
+ };
+ _ioUringRegisteredBuffersReregistrationSuccessCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.RegisteredBuffersReregistrationSuccess, this, () => Interlocked.Read(ref _ioUringRegisteredBuffersReregistrationSuccess))
+ {
+ DisplayName = "io_uring Registered Buffers Re-Registration Success",
+ };
+ _ioUringRegisteredBuffersReregistrationFailureCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.RegisteredBuffersReregistrationFailure, this, () => Interlocked.Read(ref _ioUringRegisteredBuffersReregistrationFailure))
+ {
+ DisplayName = "io_uring Registered Buffers Re-Registration Failure",
+ };
+ _ioUringFixedRecvSelectedCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.FixedRecvSelected, this, () => Interlocked.Read(ref _ioUringFixedRecvSelected))
+ {
+ DisplayName = "io_uring Fixed Recv Selected",
+ };
+ _ioUringFixedRecvFallbacksCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.FixedRecvFallbacks, this, () => Interlocked.Read(ref _ioUringFixedRecvFallbacks))
+ {
+ DisplayName = "io_uring Fixed Recv Fallbacks",
+ };
+ _ioUringPersistentMultishotRecvReuseCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.PersistentMultishotRecvReuse, this, () => Interlocked.Read(ref _ioUringPersistentMultishotRecvReuse))
+ {
+ DisplayName = "io_uring Persistent Multishot Recv Reuse",
+ };
+ _ioUringPersistentMultishotRecvTerminationCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.PersistentMultishotRecvTermination, this, () => Interlocked.Read(ref _ioUringPersistentMultishotRecvTermination))
+ {
+ DisplayName = "io_uring Persistent Multishot Recv Terminations",
+ };
+ _ioUringPersistentMultishotRecvEarlyDataCounter ??= new PollingCounter(IoUringDiagnosticCounterNames.PersistentMultishotRecvEarlyData, this, () => Interlocked.Read(ref _ioUringPersistentMultishotRecvEarlyData))
+ {
+ DisplayName = "io_uring Persistent Multishot Recv Early Data",
+ };
}
}
}
diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs
new file mode 100644
index 00000000000000..c058336c247162
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs
@@ -0,0 +1,6366 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Buffers;
+using System.Collections.Generic;
+using System.Net;
+using System.Reflection;
+using System.Reflection.Emit;
+using System.Runtime.InteropServices;
+using System.Threading;
+using System.Threading.Tasks;
+using Microsoft.DotNet.RemoteExecutor;
+using Xunit;
+
+namespace System.Net.Sockets.Tests
+{
+ public class IoUring
+ {
+ // Environment variable names understood by the io_uring socket engine and its
+ // test hooks; set on remote test processes via CreateSocketEngineOptions.
+ private static class IoUringEnvironmentVariables
+ {
+ public const string Enabled = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING";
+ public const string ProvidedBufferSize = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PROVIDED_BUFFER_SIZE";
+ public const string AdaptiveBufferSizing = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ADAPTIVE_BUFFER_SIZING";
+ public const string RegisterBuffers = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_REGISTER_BUFFERS";
+ public const string SqPoll = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_SQPOLL";
+ public const string ZeroCopySend = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ZERO_COPY_SEND";
+ public const string DirectSqe = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_DIRECT_SQE";
+ public const string ForceEagainOnceMask = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_EAGAIN_ONCE_MASK";
+ public const string ForceEcanceledOnceMask = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_ECANCELED_ONCE_MASK";
+ public const string TestEventBufferCount = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_EVENT_BUFFER_COUNT";
+ public const string PrepareQueueCapacity = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PREPARE_QUEUE_CAPACITY";
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // Uses Linux-only io_uring publication internals.
+ public static async Task IoUringNonPinnableFallbackPublication_ConcurrentPublishers_EmitSingleDelta()
+ {
+ // Runs in a remote process because the test mutates internal static counters
+ // that would otherwise leak into other tests in this process.
+ await RemoteExecutor.Invoke(static () =>
+ {
+ const BindingFlags StaticNonPublic = BindingFlags.Static | BindingFlags.NonPublic;
+
+ // Reflection handles into SocketAsyncEngine's delta-publication state.
+ Type engineType = typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!;
+ MethodInfo getDeltaMethod = engineType.GetMethod("GetIoUringNonPinnablePrepareFallbackDelta", StaticNonPublic)!;
+ FieldInfo publishedCountField = engineType.GetField("s_ioUringPublishedNonPinnablePrepareFallbackCount", StaticNonPublic)!;
+ FieldInfo publishingGateField = engineType.GetField("s_ioUringPublishingNonPinnablePrepareFallback", StaticNonPublic)!;
+
+ Type contextType = typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncContext", throwOnError: true)!;
+ FieldInfo fallbackCountField = contextType.GetField("s_ioUringNonPinnablePrepareFallbackCount", StaticNonPublic)!;
+
+ // Capture the pre-test values so the finally block can restore them.
+ long originalPublished = (long)publishedCountField.GetValue(null)!;
+ int originalPublishingGate = (int)publishingGateField.GetValue(null)!;
+ long originalFallback = (long)fallbackCountField.GetValue(null)!;
+
+ try
+ {
+ const long firstFallbackCount = 17;
+ const int publisherCount = 16;
+ long[] deltas = new long[publisherCount];
+ using var start = new ManualResetEventSlim(initialState: false);
+ var tasks = new Task[publisherCount];
+
+ publishedCountField.SetValue(null, 0L);
+ publishingGateField.SetValue(null, 0);
+ fallbackCountField.SetValue(null, firstFallbackCount);
+
+ // All publishers race to read the delta after the start gate opens;
+ // exactly one of them should observe the entire pending amount.
+ for (int i = 0; i < publisherCount; i++)
+ {
+ int capturedIndex = i;
+ tasks[i] = Task.Run(() =>
+ {
+ start.Wait();
+ deltas[capturedIndex] = (long)getDeltaMethod.Invoke(null, null)!;
+ });
+ }
+
+ start.Set();
+ Task.WaitAll(tasks);
+
+ long deltaTotal = 0;
+ int nonZeroCount = 0;
+ long nonZeroValue = 0;
+ foreach (long delta in deltas)
+ {
+ deltaTotal += delta;
+ if (delta != 0)
+ {
+ nonZeroCount++;
+ nonZeroValue = delta;
+ }
+ }
+
+ // Single winner carried the full delta; everyone else saw zero.
+ Assert.Equal(firstFallbackCount, deltaTotal);
+ Assert.Equal(1, nonZeroCount);
+ Assert.Equal(firstFallbackCount, nonZeroValue);
+
+ // A later increase publishes only the new difference, then zero again.
+ const long secondFallbackCount = 23;
+ fallbackCountField.SetValue(null, secondFallbackCount);
+ Assert.Equal(secondFallbackCount - firstFallbackCount, (long)getDeltaMethod.Invoke(null, null)!);
+ Assert.Equal(0, (long)getDeltaMethod.Invoke(null, null)!);
+ }
+ finally
+ {
+ fallbackCountField.SetValue(null, originalFallback);
+ publishedCountField.SetValue(null, originalPublished);
+ publishingGateField.SetValue(null, originalPublishingGate);
+ }
+ }).DisposeAsync();
+ }
+
+ // Builds RemoteInvokeOptions whose environment selects the io_uring engine and
+ // its test knobs for a remote test process. A null value removes the variable
+ // so the remote process observes its default behavior.
+ // Fix: the mutual-exclusion ArgumentException previously reported the local
+ // validator's parameter name ("configuredEventBufferCountRaw"); it now reports
+ // the actual caller-visible parameter, testEventBufferCountRaw.
+ private static RemoteInvokeOptions CreateSocketEngineOptions(
+ string? ioUringValue = "1",
+ string? forceEagainOnceMask = null,
+ string? forceEcanceledOnceMask = null,
+ int? testEventBufferCount = null,
+ string? testEventBufferCountRaw = null,
+ int? prepareQueueCapacity = null,
+ int? providedBufferSize = null,
+ bool? adaptiveBufferSizingEnabled = null,
+ bool? registerBuffersEnabled = null,
+ bool? sqPollEnabled = null,
+ bool? directSqeEnabled = null,
+ bool? zeroCopySendEnabled = null)
+ {
+ // The two event-buffer-count knobs are mutually exclusive.
+ if (testEventBufferCount.HasValue && testEventBufferCountRaw is not null)
+ {
+ throw new ArgumentException(
+ "Specify either testEventBufferCount or testEventBufferCountRaw, not both.",
+ nameof(testEventBufferCountRaw));
+ }
+
+ // Maps a nullable bool to the "1"/"0" convention, or null (unset).
+ static string? Flag(bool? enabled) => enabled.HasValue ? (enabled.Value ? "1" : "0") : null;
+
+ RemoteInvokeOptions options = new RemoteInvokeOptions();
+
+ void SetOrRemove(string name, string? value)
+ {
+ if (value is null)
+ {
+ options.StartInfo.EnvironmentVariables.Remove(name);
+ }
+ else
+ {
+ options.StartInfo.EnvironmentVariables[name] = value;
+ }
+ }
+
+ SetOrRemove(IoUringEnvironmentVariables.Enabled, ioUringValue);
+ SetOrRemove(IoUringEnvironmentVariables.ProvidedBufferSize, providedBufferSize?.ToString());
+ SetOrRemove(IoUringEnvironmentVariables.AdaptiveBufferSizing, Flag(adaptiveBufferSizingEnabled));
+ SetOrRemove(IoUringEnvironmentVariables.RegisterBuffers, Flag(registerBuffersEnabled));
+ SetOrRemove(IoUringEnvironmentVariables.SqPoll, Flag(sqPollEnabled));
+ SetOrRemove(IoUringEnvironmentVariables.DirectSqe, Flag(directSqeEnabled));
+ SetOrRemove(IoUringEnvironmentVariables.ZeroCopySend, Flag(zeroCopySendEnabled));
+ SetOrRemove(IoUringEnvironmentVariables.ForceEagainOnceMask, string.IsNullOrEmpty(forceEagainOnceMask) ? null : forceEagainOnceMask);
+ SetOrRemove(IoUringEnvironmentVariables.ForceEcanceledOnceMask, string.IsNullOrEmpty(forceEcanceledOnceMask) ? null : forceEcanceledOnceMask);
+ SetOrRemove(IoUringEnvironmentVariables.TestEventBufferCount, testEventBufferCountRaw ?? testEventBufferCount?.ToString());
+ SetOrRemove(IoUringEnvironmentVariables.PrepareQueueCapacity, prepareQueueCapacity?.ToString());
+
+ options.TimeOut = (int)TimeSpan.FromMinutes(10).TotalMilliseconds;
+ return options;
+ }
+
+ // Bridges Task/ValueTask overloads to a single Task so shared await helpers apply to both.
+ private static Task ToTask(Task task) => task;
+ private static Task ToTask(ValueTask task) => task.AsTask();
+
+ // Awaits the task, failing the test if it does not complete within 15 seconds.
+ // Fix: the original declared `async Task` yet ended with `return await task;`,
+ // which does not compile (CS1997); split into a void-result overload and a
+ // generic overload that propagates the awaited task's result.
+ private static async Task AwaitWithTimeoutAsync(Task task, string operationName)
+ {
+ Task completed = await Task.WhenAny(task, Task.Delay(TimeSpan.FromSeconds(15)));
+ Assert.True(ReferenceEquals(task, completed), $"Timed out waiting for {operationName}");
+ await task;
+ }
+
+ // Generic counterpart: same timeout behavior, returns the task's result.
+ private static async Task<TResult> AwaitWithTimeoutAsync<TResult>(Task<TResult> task, string operationName)
+ {
+ Task completed = await Task.WhenAny(task, Task.Delay(TimeSpan.FromSeconds(15)));
+ Assert.True(ReferenceEquals(task, completed), $"Timed out waiting for {operationName}");
+ return await task;
+ }
+
+ // Asserts that the exception represents cancellation or an aborted/interrupted socket call.
+ private static void AssertCanceledOrInterrupted(Exception? ex)
+ {
+ Assert.NotNull(ex);
+
+ bool acceptable = ex switch
+ {
+ OperationCanceledException => true,
+ SocketException socketException =>
+ socketException.SocketErrorCode is SocketError.OperationAborted or SocketError.Interrupted,
+ _ => false,
+ };
+ Assert.True(acceptable, $"Unexpected exception: {ex}");
+ }
+
+ // Like AssertCanceledOrInterrupted, but also tolerates no exception at all
+ // and ObjectDisposedException (races with socket disposal are acceptable).
+ private static void AssertCanceledDisposedOrInterrupted(Exception? ex)
+ {
+ bool acceptable = ex switch
+ {
+ null => true,
+ ObjectDisposedException => true,
+ OperationCanceledException => true,
+ SocketException socketException =>
+ socketException.SocketErrorCode is SocketError.OperationAborted or SocketError.Interrupted,
+ _ => false,
+ };
+ Assert.True(acceptable, $"Unexpected exception: {ex}");
+ }
+
+ // Point-in-time copy of the native io_uring diagnostic counters used by tests.
+ private readonly struct IoUringNativeDiagnosticsSnapshot
+ {
+ public bool HasIoUringPort { get; }
+ public ulong AsyncCancelRequestCqeCount { get; }
+ public ulong AsyncCancelRequestCqeEnoentCount { get; }
+ public ulong AsyncCancelRequestCqeEalreadyCount { get; }
+ public ulong AsyncCancelRequestCqeOtherCount { get; }
+ public ulong SocketEventBufferFullCount { get; }
+ public ulong UnsupportedOpcodePrepareCount { get; }
+ public ulong CqOverflowCount { get; }
+
+ public IoUringNativeDiagnosticsSnapshot(
+ bool hasIoUringPort,
+ ulong asyncCancelRequestCqeCount,
+ ulong asyncCancelRequestCqeEnoentCount,
+ ulong asyncCancelRequestCqeEalreadyCount,
+ ulong asyncCancelRequestCqeOtherCount,
+ ulong socketEventBufferFullCount,
+ ulong unsupportedOpcodePrepareCount,
+ ulong cqOverflowCount)
+ {
+ HasIoUringPort = hasIoUringPort;
+ AsyncCancelRequestCqeCount = asyncCancelRequestCqeCount;
+ AsyncCancelRequestCqeEnoentCount = asyncCancelRequestCqeEnoentCount;
+ AsyncCancelRequestCqeEalreadyCount = asyncCancelRequestCqeEalreadyCount;
+ AsyncCancelRequestCqeOtherCount = asyncCancelRequestCqeOtherCount;
+ SocketEventBufferFullCount = socketEventBufferFullCount;
+ UnsupportedOpcodePrepareCount = unsupportedOpcodePrepareCount;
+ CqOverflowCount = cqOverflowCount;
+ }
+ }
+
+ // Point-in-time view of the provided-buffer ring state used by tests.
+ private readonly struct IoUringProvidedBufferSnapshot
+ {
+ public bool HasIoUringPort { get; }
+ public bool SupportsProvidedBufferRings { get; }
+ public bool HasProvidedBufferRing { get; }
+ public bool HasRegisteredBuffers { get; }
+ public bool AdaptiveBufferSizingEnabled { get; }
+ public int AvailableCount { get; }
+ public int InUseCount { get; }
+ public int TotalBufferCount { get; }
+ public int BufferSize { get; }
+ public int RecommendedBufferSize { get; }
+ public long RecycledCount { get; }
+ public long AllocationFailureCount { get; }
+
+ // A ring is usable only when a port exists, the kernel supports rings,
+ // a ring was actually created, and it holds at least one buffer.
+ public bool IsUsable =>
+ HasIoUringPort &&
+ SupportsProvidedBufferRings &&
+ HasProvidedBufferRing &&
+ TotalBufferCount > 0;
+
+ // Adaptive sizing additionally requires the feature to be switched on.
+ public bool IsAdaptiveSizingUsable => IsUsable && AdaptiveBufferSizingEnabled;
+
+ public IoUringProvidedBufferSnapshot(
+ bool hasIoUringPort,
+ bool supportsProvidedBufferRings,
+ bool hasProvidedBufferRing,
+ bool hasRegisteredBuffers,
+ bool adaptiveBufferSizingEnabled,
+ int availableCount,
+ int inUseCount,
+ int totalBufferCount,
+ int bufferSize,
+ int recommendedBufferSize,
+ long recycledCount,
+ long allocationFailureCount)
+ {
+ HasIoUringPort = hasIoUringPort;
+ SupportsProvidedBufferRings = supportsProvidedBufferRings;
+ HasProvidedBufferRing = hasProvidedBufferRing;
+ HasRegisteredBuffers = hasRegisteredBuffers;
+ AdaptiveBufferSizingEnabled = adaptiveBufferSizingEnabled;
+ AvailableCount = availableCount;
+ InUseCount = inUseCount;
+ TotalBufferCount = totalBufferCount;
+ BufferSize = bufferSize;
+ RecommendedBufferSize = recommendedBufferSize;
+ RecycledCount = recycledCount;
+ AllocationFailureCount = allocationFailureCount;
+ }
+ }
+
+ // Snapshot of zero-copy send capability and enablement flags.
+ private readonly struct IoUringZeroCopySendSnapshot
+ {
+ public bool HasIoUringPort { get; }
+ public bool SupportsSendZc { get; }
+ public bool SupportsSendMsgZc { get; }
+ public bool ZeroCopySendEnabled { get; }
+
+ public IoUringZeroCopySendSnapshot(
+ bool hasIoUringPort,
+ bool supportsSendZc,
+ bool supportsSendMsgZc,
+ bool zeroCopySendEnabled)
+ {
+ HasIoUringPort = hasIoUringPort;
+ SupportsSendZc = supportsSendZc;
+ SupportsSendMsgZc = supportsSendMsgZc;
+ ZeroCopySendEnabled = zeroCopySendEnabled;
+ }
+ }
+
+ // Snapshot of fixed-buffer receive capability flags.
+ private readonly struct IoUringFixedRecvSnapshot
+ {
+ public bool HasIoUringPort { get; }
+ public bool SupportsReadFixed { get; }
+ public bool HasRegisteredBuffers { get; }
+
+ // Fixed recv needs both IORING read-fixed support and registered buffers.
+ public bool FixedRecvEnabled => SupportsReadFixed && HasRegisteredBuffers;
+
+ public IoUringFixedRecvSnapshot(
+ bool hasIoUringPort,
+ bool supportsReadFixed,
+ bool hasRegisteredBuffers)
+ {
+ HasIoUringPort = hasIoUringPort;
+ SupportsReadFixed = supportsReadFixed;
+ HasRegisteredBuffers = hasRegisteredBuffers;
+ }
+ }
+
+ // Snapshot of whether an io_uring port exists and SQPOLL mode is enabled on it.
+ private readonly struct IoUringSqPollSnapshot
+ {
+ public bool HasIoUringPort { get; }
+ public bool SqPollEnabled { get; }
+
+ // SQPOLL is only meaningful when an io_uring port is actually present.
+ public bool IsActive => HasIoUringPort && SqPollEnabled;
+
+ public IoUringSqPollSnapshot(bool hasIoUringPort, bool sqPollEnabled)
+ {
+ HasIoUringPort = hasIoUringPort;
+ SqPollEnabled = sqPollEnabled;
+ }
+ }
+
+ // Snapshot of outstanding zero-copy pin holds and pending ZC notifications.
+ private readonly struct IoUringZeroCopyPinHoldSnapshot
+ {
+ public bool HasIoUringPort { get; }
+ public int ActivePinHolds { get; }
+ public int PendingNotificationCount { get; }
+
+ public IoUringZeroCopyPinHoldSnapshot(bool hasIoUringPort, int activePinHolds, int pendingNotificationCount)
+ {
+ HasIoUringPort = hasIoUringPort;
+ ActivePinHolds = activePinHolds;
+ PendingNotificationCount = pendingNotificationCount;
+ }
+ }
+
+ // Memory backing that refuses to be pinned, used to force the io_uring
+ // prepare path onto its non-pinnable fallback.
+ // Fix: the base type and GetSpan return type had lost their <byte> type
+ // arguments (there is no non-generic MemoryManager); restored.
+ private sealed class NonPinnableMemoryManager : MemoryManager<byte>
+ {
+ private readonly byte[] _buffer;
+
+ public NonPinnableMemoryManager(byte[] buffer)
+ {
+ _buffer = buffer;
+ }
+
+ public override Span<byte> GetSpan() => _buffer;
+
+ // Pinning is intentionally unsupported; callers must take the fallback path.
+ public override MemoryHandle Pin(int elementIndex = 0)
+ {
+ _ = elementIndex;
+ throw new NotSupportedException("Non-pinnable test memory.");
+ }
+
+ public override void Unpin()
+ {
+ }
+
+ protected override void Dispose(bool disposing)
+ {
+ }
+ }
+
+ // Pinnable memory backing that counts Pin/Unpin calls so tests can verify
+ // pin lifetime around io_uring operations.
+ // Fix: the base type and GetSpan return type had lost their <byte> type
+ // arguments (there is no non-generic MemoryManager); restored.
+ private sealed unsafe class TrackingPinnableMemoryManager : MemoryManager<byte>
+ {
+ private readonly byte[] _buffer;
+ private int _pinCount;
+ private int _unpinCount;
+
+ public TrackingPinnableMemoryManager(byte[] buffer)
+ {
+ _buffer = buffer;
+ }
+
+ public int PinCount => Volatile.Read(ref _pinCount);
+ public int UnpinCount => Volatile.Read(ref _unpinCount);
+
+ public override Span<byte> GetSpan() => _buffer;
+
+ public override MemoryHandle Pin(int elementIndex = 0)
+ {
+ if ((uint)elementIndex > (uint)_buffer.Length)
+ {
+ throw new ArgumentOutOfRangeException(nameof(elementIndex));
+ }
+
+ Interlocked.Increment(ref _pinCount);
+ // Pin the whole array and hand out a pointer offset to elementIndex;
+ // passing `this` makes MemoryHandle.Dispose call our Unpin.
+ GCHandle handle = GCHandle.Alloc(_buffer, GCHandleType.Pinned);
+ byte* pointer = (byte*)handle.AddrOfPinnedObject() + elementIndex;
+ return new MemoryHandle(pointer, handle, this);
+ }
+
+ public override void Unpin()
+ {
+ Interlocked.Increment(ref _unpinCount);
+ }
+
+ protected override void Dispose(bool disposing)
+ {
+ }
+ }
+
+ // Reads SocketsTelemetry.Log._ioUringPrepareNonPinnableFallbacks via reflection.
+ private static long GetIoUringPrepareNonPinnableFallbackCounterValue()
+ {
+ Type telemetryType = typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketsTelemetry", throwOnError: true)!;
+ object telemetryInstance = telemetryType.GetField("Log", BindingFlags.Public | BindingFlags.Static)!.GetValue(null)!;
+ object? counterValue = telemetryType
+ .GetField("_ioUringPrepareNonPinnableFallbacks", BindingFlags.NonPublic | BindingFlags.Instance)!
+ .GetValue(telemetryInstance);
+ return Convert.ToInt64(counterValue);
+ }
+
+ // Reads SocketsTelemetry.Log._ioUringCompletionRequeueFailures via reflection.
+ private static long GetIoUringCompletionRequeueFailureCounterValue()
+ {
+ Type telemetryType = typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketsTelemetry", throwOnError: true)!;
+ object telemetryInstance = telemetryType.GetField("Log", BindingFlags.Public | BindingFlags.Static)!.GetValue(null)!;
+ object? counterValue = telemetryType
+ .GetField("_ioUringCompletionRequeueFailures", BindingFlags.NonPublic | BindingFlags.Instance)!
+ .GetValue(telemetryInstance);
+ return Convert.ToInt64(counterValue);
+ }
+
+ // Resolves the internal SocketAsyncEngine type from the System.Net.Sockets assembly.
+ private static Type GetSocketAsyncEngineType() =>
+ typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!;
+
+ // Invokes a non-public static bool-returning method on SocketAsyncEngine.
+ private static bool InvokeSocketAsyncEngineBoolMethod(string methodName)
+ {
+ MethodInfo method = GetSocketAsyncEngineType()
+ .GetMethod(methodName, BindingFlags.NonPublic | BindingFlags.Static)!;
+ return (bool)method.Invoke(null, null)!;
+ }
+
+ // Invokes a non-public static int-returning method on SocketAsyncEngine.
+ private static int InvokeSocketAsyncEngineIntMethod(string methodName)
+ {
+ MethodInfo method = GetSocketAsyncEngineType()
+ .GetMethod(methodName, BindingFlags.NonPublic | BindingFlags.Static)!;
+ return (int)method.Invoke(null, null)!;
+ }
+
+ // Toggles an AppContext switch and verifies the engine method reflects each state.
+ private static void AssertBooleanAppContextSwitch(
+ string switchName,
+ string methodName,
+ bool expectedWhenSwitchTrue,
+ bool expectedWhenSwitchFalse)
+ {
+ // Check the "true" state first, then "false", matching the documented expectations.
+ foreach ((bool switchValue, bool expected) in new[]
+ {
+ (true, expectedWhenSwitchTrue),
+ (false, expectedWhenSwitchFalse),
+ })
+ {
+ AppContext.SetSwitch(switchName, switchValue);
+ Assert.Equal(expected, InvokeSocketAsyncEngineBoolMethod(methodName));
+ }
+ }
+
+ // Best-effort read of a SocketsTelemetry counter field; returns 0 when the
+ // singleton or the named field cannot be found.
+ private static ulong GetIoUringTelemetryCounterValue(string fieldName)
+ {
+ Type telemetryType = typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketsTelemetry", throwOnError: true)!;
+ object? telemetry = telemetryType.GetField("Log", BindingFlags.Public | BindingFlags.Static)?.GetValue(null);
+ if (telemetry is null)
+ {
+ return 0;
+ }
+
+ object? value = telemetryType.GetField(fieldName, BindingFlags.NonPublic | BindingFlags.Instance)?.GetValue(telemetry);
+ return value is null ? 0UL : Convert.ToUInt64(value);
+ }
+
+ // Decode tables mapping an IL byte (or the byte after the 0xFE prefix) to its OpCode.
+ private static readonly OpCode[] s_singleByteOpCodes = BuildSingleByteOpCodeTable();
+ private static readonly OpCode[] s_multiByteOpCodes = BuildMultiByteOpCodeTable();
+
+ // Builds a 256-entry table of single-byte IL opcodes indexed by their encoding.
+ private static OpCode[] BuildSingleByteOpCodeTable()
+ {
+ var table = new OpCode[256];
+ foreach (FieldInfo field in typeof(OpCodes).GetFields(BindingFlags.Public | BindingFlags.Static))
+ {
+ if (field.GetValue(null) is OpCode opCode)
+ {
+ // Single-byte opcodes have no high byte in their encoded value.
+ ushort encoded = unchecked((ushort)opCode.Value);
+ if (encoded <= 0xFF)
+ {
+ table[encoded] = opCode;
+ }
+ }
+ }
+
+ return table;
+ }
+
+ // Builds a 256-entry table of two-byte (0xFE-prefixed) IL opcodes indexed by their low byte.
+ private static OpCode[] BuildMultiByteOpCodeTable()
+ {
+ var table = new OpCode[256];
+ foreach (FieldInfo field in typeof(OpCodes).GetFields(BindingFlags.Public | BindingFlags.Static))
+ {
+ if (field.GetValue(null) is OpCode opCode)
+ {
+ // Two-byte opcodes encode as 0xFExx; index by the low byte.
+ ushort encoded = unchecked((ushort)opCode.Value);
+ if ((encoded & 0xFF00) == 0xFE00)
+ {
+ table[encoded & 0xFF] = opCode;
+ }
+ }
+ }
+
+ return table;
+ }
+
+ // Scans a raw IL stream for a call/callvirt whose inline method token equals
+ // targetMetadataToken; returns the instruction's offset, or -1 if it is absent
+ // or the stream is malformed.
+ // Fix: the `il` parameter had lost its <byte> type argument (unbound
+ // ReadOnlySpan does not compile); restored.
+ private static int FindCallInstructionOffset(ReadOnlySpan<byte> il, int targetMetadataToken)
+ {
+ int offset = 0;
+ while (offset < il.Length)
+ {
+ int instructionOffset = offset;
+ OpCode opCode;
+ byte first = il[offset++];
+ if (first == 0xFE)
+ {
+ // Two-byte opcode: 0xFE prefix followed by the low byte.
+ if (offset >= il.Length)
+ {
+ break;
+ }
+
+ opCode = s_multiByteOpCodes[il[offset++]];
+ }
+ else
+ {
+ opCode = s_singleByteOpCodes[first];
+ }
+
+ int operandSize = GetIlOperandSize(opCode.OperandType, il, offset);
+ if (operandSize < 0 || offset + operandSize > il.Length)
+ {
+ // Unknown opcode or truncated operand: stop scanning.
+ break;
+ }
+
+ if ((opCode == OpCodes.Call || opCode == OpCodes.Callvirt) &&
+ opCode.OperandType == OperandType.InlineMethod &&
+ operandSize == 4)
+ {
+ // Inline method tokens are little-endian int32.
+ int metadataToken = BitConverter.ToInt32(il.Slice(offset, 4));
+ if (metadataToken == targetMetadataToken)
+ {
+ return instructionOffset;
+ }
+ }
+
+ offset += operandSize;
+ }
+
+ return -1;
+ }
+
+ // Returns the operand size in bytes for an IL opcode's operand type, or -1
+ // for an unknown type or a truncated InlineSwitch count.
+ // Fix: the `il` parameter had lost its <byte> type argument (unbound
+ // ReadOnlySpan does not compile); restored.
+ private static int GetIlOperandSize(OperandType operandType, ReadOnlySpan<byte> il, int operandOffset)
+ {
+ return operandType switch
+ {
+ OperandType.InlineNone => 0,
+ OperandType.ShortInlineBrTarget => 1,
+ OperandType.ShortInlineI => 1,
+ OperandType.ShortInlineVar => 1,
+ OperandType.InlineVar => 2,
+ OperandType.InlineI => 4,
+ OperandType.InlineBrTarget => 4,
+ OperandType.InlineField => 4,
+ OperandType.InlineMethod => 4,
+ OperandType.InlineSig => 4,
+ OperandType.InlineString => 4,
+ OperandType.InlineTok => 4,
+ OperandType.InlineType => 4,
+ OperandType.ShortInlineR => 4,
+ OperandType.InlineI8 => 8,
+ OperandType.InlineR => 8,
+ // switch: 4-byte target count followed by that many 4-byte targets.
+ OperandType.InlineSwitch => operandOffset + 4 <= il.Length
+ ? 4 + (BitConverter.ToInt32(il.Slice(operandOffset, 4)) * 4)
+ : -1,
+ _ => -1,
+ };
+ }
+
+ // Reads SocketAsyncEngine.GetIoUringPollReadinessCqeCount via reflection;
+ // returns 0 when the hook does not exist in this build.
+ private static long GetIoUringPollReadinessCqeCount()
+ {
+ MethodInfo? countMethod = typeof(Socket).Assembly
+ .GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!
+ .GetMethod(
+ "GetIoUringPollReadinessCqeCount",
+ BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Static);
+
+ return countMethod is null ? 0 : Convert.ToInt64(countMethod.Invoke(null, null));
+ }
+
+ // Reads SocketAsyncEngine.GetIoUringPendingRetryQueuedToPrepareQueueCount via
+ // reflection; returns 0 when the hook does not exist in this build.
+ private static long GetIoUringPendingRetryQueuedToPrepareQueueCount()
+ {
+ MethodInfo? countMethod = typeof(Socket).Assembly
+ .GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!
+ .GetMethod(
+ "GetIoUringPendingRetryQueuedToPrepareQueueCount",
+ BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Static);
+
+ return countMethod is null ? 0 : Convert.ToInt64(countMethod.Invoke(null, null));
+ }
+
+ // True when at least one active io_uring-mode engine reports multishot-recv support.
+ private static bool IsIoUringMultishotRecvSupported()
+ {
+ Type engineType = typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!;
+ FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!;
+ PropertyInfo ioUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!;
+ FieldInfo multishotRecvField = engineType.GetField("_supportsMultishotRecv", BindingFlags.NonPublic | BindingFlags.Instance)!;
+
+ foreach (object? engine in (Array)enginesField.GetValue(null)!)
+ {
+ if (engine is not null &&
+ (bool)ioUringEnabledProperty.GetValue(engine)! &&
+ (bool)multishotRecvField.GetValue(engine)!)
+ {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ // True when at least one active io_uring-mode engine reports multishot-accept support.
+ // Probes both a property and a field spelling — presumably to tolerate either
+ // implementation shape of SocketAsyncEngine; confirm against the product code.
+ private static bool IsIoUringMultishotAcceptSupported()
+ {
+ Type engineType = typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!;
+ FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!;
+ PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!;
+ PropertyInfo? supportsMultishotAcceptProperty = engineType.GetProperty("SupportsMultishotAccept", BindingFlags.NonPublic | BindingFlags.Instance);
+ FieldInfo? supportsMultishotAcceptField = engineType.GetField("_supportsMultishotAccept", BindingFlags.NonPublic | BindingFlags.Instance);
+
+ foreach (object? engine in (Array)enginesField.GetValue(null)!)
+ {
+ // Skip unallocated slots and engines not running in io_uring completion mode.
+ if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)!)
+ {
+ continue;
+ }
+
+ if (supportsMultishotAcceptProperty is not null && (bool)supportsMultishotAcceptProperty.GetValue(engine)!)
+ {
+ return true;
+ }
+
+ if (supportsMultishotAcceptField is not null && (bool)supportsMultishotAcceptField.GetValue(engine)!)
+ {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ // Fetches the socket's internal SocketAsyncContext through the SafeSocketHandle's
+ // non-public AsyncContext property; returns false when it is unavailable or the
+ // socket has been disposed.
+ private static bool TryGetSocketAsyncContextForTest(Socket socket, out object asyncContext)
+ {
+ asyncContext = null!;
+
+ try
+ {
+ var socketHandle = socket.SafeHandle;
+ Type safeSocketHandleType = socketHandle.GetType();
+ PropertyInfo? asyncContextProperty = safeSocketHandleType.GetProperty("AsyncContext", BindingFlags.NonPublic | BindingFlags.Instance);
+ if (asyncContextProperty?.GetValue(socketHandle) is not object context)
+ {
+ return false;
+ }
+
+ asyncContext = context;
+ return true;
+ }
+ catch (ObjectDisposedException)
+ {
+ // Socket.SafeHandle throws once the socket is disposed; treat that as "no context".
+ return false;
+ }
+ }
+
+ // Reports whether the listener's SocketAsyncContext currently has a multishot
+ // accept armed. Probes a property first, then falls back to an int field
+ // (nonzero == armed) — presumably to tolerate either implementation shape.
+ private static bool IsListenerMultishotAcceptArmed(Socket listener)
+ {
+ if (!TryGetSocketAsyncContextForTest(listener, out object asyncContext))
+ {
+ return false;
+ }
+
+ Type asyncContextType = asyncContext.GetType();
+ PropertyInfo? armedProperty = asyncContextType.GetProperty(
+ "IsMultishotAcceptArmed",
+ BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance);
+ if (armedProperty is not null)
+ {
+ object? value = armedProperty.GetValue(asyncContext);
+ return value is bool armed && armed;
+ }
+
+ FieldInfo? armedField = asyncContextType.GetField("_multishotAcceptArmed", BindingFlags.NonPublic | BindingFlags.Instance);
+ return armedField?.GetValue(asyncContext) is int armedState && armedState != 0;
+ }
+
+ // Returns the number of entries in the listener context's multishot-accept
+ // queue, or 0 at any point where the context, field, queue, or Count property
+ // cannot be resolved.
+ private static int GetListenerMultishotAcceptQueueCount(Socket listener)
+ {
+ if (!TryGetSocketAsyncContextForTest(listener, out object asyncContext))
+ {
+ return 0;
+ }
+
+ FieldInfo? queueField = asyncContext.GetType().GetField("_multishotAcceptQueue", BindingFlags.NonPublic | BindingFlags.Instance);
+ if (queueField is null)
+ {
+ return 0;
+ }
+
+ object? queue = queueField.GetValue(asyncContext);
+ if (queue is null)
+ {
+ return 0;
+ }
+
+ PropertyInfo? countProperty = queue.GetType().GetProperty("Count", BindingFlags.Public | BindingFlags.Instance);
+ return countProperty?.GetValue(queue) is int count ? count : 0;
+ }
+
+ private static async Task WaitForMultishotAcceptArmedStateAsync(Socket listener, bool expectedArmed, int timeoutMilliseconds = 5000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (IsListenerMultishotAcceptArmed(listener) == expectedArmed)
+ {
+ return true;
+ }
+
+ await Task.Delay(20);
+ }
+
+ return IsListenerMultishotAcceptArmed(listener) == expectedArmed;
+ }
+
+ private static bool IsPersistentMultishotRecvArmed(Socket socket)
+ {
+ if (!TryGetSocketAsyncContextForTest(socket, out object asyncContext))
+ {
+ return false;
+ }
+
+ MethodInfo? armedMethod = asyncContext.GetType().GetMethod(
+ "IsPersistentMultishotRecvArmed",
+ BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance);
+ if (armedMethod is null)
+ {
+ return false;
+ }
+
+ return (bool)armedMethod.Invoke(asyncContext, null)!;
+ }
+
+ private static async Task WaitForPersistentMultishotRecvArmedStateAsync(Socket socket, bool expectedArmed, int timeoutMilliseconds = 5000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (IsPersistentMultishotRecvArmed(socket) == expectedArmed)
+ {
+ return true;
+ }
+
+ await Task.Delay(20);
+ }
+
+ return IsPersistentMultishotRecvArmed(socket) == expectedArmed;
+ }
+
+ private static async Task WaitForZeroCopyPinHoldSnapshotAsync(
+ Func predicate,
+ int timeoutMilliseconds = 5000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ IoUringZeroCopyPinHoldSnapshot snapshot = GetIoUringZeroCopyPinHoldSnapshot();
+ while (DateTime.UtcNow < deadline)
+ {
+ if (predicate(snapshot))
+ {
+ return snapshot;
+ }
+
+ await Task.Delay(20);
+ snapshot = GetIoUringZeroCopyPinHoldSnapshot();
+ }
+
+ return snapshot;
+ }
+
+ private static async Task AssertConnectedPairRoundTripAsync(Socket client, Socket server, byte marker)
+ {
+ byte[] payload = new byte[] { marker };
+ byte[] receiveBuffer = new byte[1];
+ Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
+ Assert.Equal(1, await server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ Assert.Equal(marker, receiveBuffer[0]);
+ }
+
+ private static async Task AssertPinsReleasedAsync(TrackingPinnableMemoryManager manager)
+ {
+ DateTime start = DateTime.UtcNow;
+ while (manager.PinCount != manager.UnpinCount)
+ {
+ if (DateTime.UtcNow - start > TimeSpan.FromSeconds(10))
+ {
+ break;
+ }
+
+ await Task.Delay(20);
+ }
+
+ Assert.True(manager.PinCount > 0, "Expected at least one pin.");
+ Assert.Equal(manager.PinCount, manager.UnpinCount);
+ }
+
    /// <summary>
    /// Aggregates native io_uring diagnostic counters (async-cancel CQE counts,
    /// socket-event-buffer-full, unsupported-opcode, CQ-overflow) across every
    /// SocketAsyncEngine running in io_uring completion mode, reaching into
    /// System.Net.Sockets internals via reflection. When the native diagnostics
    /// interop surface is absent, falls back to managed telemetry counters for the
    /// subset of metrics still exposed.
    /// </summary>
    private static IoUringNativeDiagnosticsSnapshot GetIoUringNativeDiagnosticsSnapshot()
    {
        Assembly socketsAssembly = typeof(Socket).Assembly;
        Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!;
        // The interop type may be absent (e.g. trimmed or different build); probe without throwing.
        Type? interopSysType = socketsAssembly.GetType("Interop+Sys", throwOnError: false);

        FieldInfo? enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static);
        PropertyInfo? isIoUringEnabledProp = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance);
        FieldInfo? portField = engineType.GetField("_port", BindingFlags.NonPublic | BindingFlags.Instance);
        if (enginesField is null || isIoUringEnabledProp is null || portField is null)
        {
            // Required engine internals not found: report "no io_uring port" with zeroed counters.
            return new IoUringNativeDiagnosticsSnapshot(false, 0, 0, 0, 0, 0, 0, 0);
        }

        MethodInfo? tryGetDiagnosticsMethod = interopSysType?.GetMethod(
            "TryGetIoUringSocketEventPortDiagnostics",
            BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Static);

        // Recover the diagnostics struct type from the method's by-ref second parameter.
        Type? diagnosticsType = null;
        if (tryGetDiagnosticsMethod is not null)
        {
            ParameterInfo[] parameters = tryGetDiagnosticsMethod.GetParameters();
            if (parameters.Length >= 2)
            {
                diagnosticsType = parameters[1].ParameterType.GetElementType();
            }
        }

        FieldInfo? asyncCancelRequestCqeCountField = diagnosticsType?.GetField("AsyncCancelRequestCqeCount", BindingFlags.Public | BindingFlags.Instance);
        FieldInfo? asyncCancelRequestCqeEnoentCountField = diagnosticsType?.GetField("AsyncCancelRequestCqeEnoentCount", BindingFlags.Public | BindingFlags.Instance);
        FieldInfo? asyncCancelRequestCqeEalreadyCountField = diagnosticsType?.GetField("AsyncCancelRequestCqeEalreadyCount", BindingFlags.Public | BindingFlags.Instance);
        FieldInfo? asyncCancelRequestCqeOtherCountField = diagnosticsType?.GetField("AsyncCancelRequestCqeOtherCount", BindingFlags.Public | BindingFlags.Instance);
        FieldInfo? socketEventBufferFullCountField = diagnosticsType?.GetField("SocketEventBufferFullCount", BindingFlags.Public | BindingFlags.Instance);
        FieldInfo? unsupportedOpcodePrepareCountField = diagnosticsType?.GetField("UnsupportedOpcodePrepareCount", BindingFlags.Public | BindingFlags.Instance);
        FieldInfo? cqOverflowCountField = diagnosticsType?.GetField("CqOverflowCount", BindingFlags.Public | BindingFlags.Instance);

        bool hasIoUringPort = false;
        ulong asyncCancelRequestCqeCount = 0;
        ulong asyncCancelRequestCqeEnoentCount = 0;
        ulong asyncCancelRequestCqeEalreadyCount = 0;
        ulong asyncCancelRequestCqeOtherCount = 0;
        ulong socketEventBufferFullCount = 0;
        ulong unsupportedOpcodePrepareCount = 0;
        ulong cqOverflowCount = 0;

        // The native path is only usable when the method AND every expected field resolved.
        bool hasNativeDiagnosticsInterop =
            tryGetDiagnosticsMethod is not null &&
            asyncCancelRequestCqeCountField is not null &&
            asyncCancelRequestCqeEnoentCountField is not null &&
            asyncCancelRequestCqeEalreadyCountField is not null &&
            asyncCancelRequestCqeOtherCountField is not null &&
            socketEventBufferFullCountField is not null &&
            unsupportedOpcodePrepareCountField is not null &&
            cqOverflowCountField is not null;

        if (enginesField.GetValue(null) is not Array engines)
        {
            return new IoUringNativeDiagnosticsSnapshot(false, 0, 0, 0, 0, 0, 0, 0);
        }

        // Sum counters over every engine whose completion mode is io_uring.
        foreach (object? engine in engines)
        {
            if (engine is null ||
                isIoUringEnabledProp.GetValue(engine) is not bool isIoUringEnabled ||
                !isIoUringEnabled)
            {
                continue;
            }

            hasIoUringPort = true;
            if (!hasNativeDiagnosticsInterop)
            {
                continue;
            }

            if (portField.GetValue(engine) is not IntPtr port)
            {
                continue;
            }

            // args[1] is the out diagnostics struct; a zero error code means success.
            object?[] args = new object?[] { port, null };
            object? error = tryGetDiagnosticsMethod!.Invoke(null, args);
            if (error is null || Convert.ToInt32(error) != 0 || args[1] is null)
            {
                continue;
            }

            object diagnostics = args[1]!;
            asyncCancelRequestCqeCount += Convert.ToUInt64(asyncCancelRequestCqeCountField!.GetValue(diagnostics));
            asyncCancelRequestCqeEnoentCount += Convert.ToUInt64(asyncCancelRequestCqeEnoentCountField!.GetValue(diagnostics));
            asyncCancelRequestCqeEalreadyCount += Convert.ToUInt64(asyncCancelRequestCqeEalreadyCountField!.GetValue(diagnostics));
            asyncCancelRequestCqeOtherCount += Convert.ToUInt64(asyncCancelRequestCqeOtherCountField!.GetValue(diagnostics));
            socketEventBufferFullCount += Convert.ToUInt64(socketEventBufferFullCountField!.GetValue(diagnostics));
            unsupportedOpcodePrepareCount += Convert.ToUInt64(unsupportedOpcodePrepareCountField!.GetValue(diagnostics));
            cqOverflowCount += Convert.ToUInt64(cqOverflowCountField!.GetValue(diagnostics));
        }

        if (hasIoUringPort && !hasNativeDiagnosticsInterop)
        {
            // Native diagnostics interop is not available in the managed io_uring path.
            // Fall back to managed telemetry counters for the subset of metrics still exposed.
            asyncCancelRequestCqeCount = GetIoUringTelemetryCounterValue("_ioUringAsyncCancelRequestCqes");
            socketEventBufferFullCount = GetIoUringTelemetryCounterValue("_ioUringSocketEventBufferFull");
            cqOverflowCount = GetIoUringTelemetryCounterValue("_ioUringCqOverflow");
        }

        return new IoUringNativeDiagnosticsSnapshot(
            hasIoUringPort,
            asyncCancelRequestCqeCount,
            asyncCancelRequestCqeEnoentCount,
            asyncCancelRequestCqeEalreadyCount,
            asyncCancelRequestCqeOtherCount,
            socketEventBufferFullCount,
            unsupportedOpcodePrepareCount,
            cqOverflowCount);
    }
+
    /// <summary>
    /// Builds a snapshot of io_uring provided-buffer-ring state across all
    /// SocketAsyncEngine instances via reflection: capability flags are OR-ed,
    /// counts are summed, and buffer sizes take the per-engine maximum.
    /// Engines without a ring (or whose ring type lacks the expected members)
    /// contribute only the flags observed up to that point.
    /// </summary>
    private static IoUringProvidedBufferSnapshot GetIoUringProvidedBufferSnapshot()
    {
        Assembly socketsAssembly = typeof(Socket).Assembly;
        Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!;

        FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!;
        PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!;
        FieldInfo supportsProvidedBufferRingsField = engineType.GetField("_supportsProvidedBufferRings", BindingFlags.NonPublic | BindingFlags.Instance)!;
        FieldInfo providedBufferRingField = engineType.GetField("_ioUringProvidedBufferRing", BindingFlags.NonPublic | BindingFlags.Instance)!;
        FieldInfo registeredBuffersField = engineType.GetField("_ioUringBuffersRegistered", BindingFlags.NonPublic | BindingFlags.Instance)!;
        FieldInfo adaptiveSizingEnabledField = engineType.GetField("_adaptiveBufferSizingEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!;

        bool hasIoUringPort = false;
        bool supportsProvidedBufferRings = false;
        bool hasProvidedBufferRing = false;
        bool hasRegisteredBuffers = false;
        bool adaptiveBufferSizingEnabled = false;
        int availableCount = 0;
        int inUseCount = 0;
        int totalBufferCount = 0;
        int bufferSize = 0;
        int recommendedBufferSize = 0;
        long recycledCount = 0;
        long allocationFailureCount = 0;

        foreach (object? engine in (Array)enginesField.GetValue(null)!)
        {
            // Only io_uring-mode engines contribute to the snapshot.
            if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)!)
            {
                continue;
            }

            hasIoUringPort = true;
            if (!(bool)supportsProvidedBufferRingsField.GetValue(engine)!)
            {
                continue;
            }

            supportsProvidedBufferRings = true;
            if ((bool)adaptiveSizingEnabledField.GetValue(engine)!)
            {
                adaptiveBufferSizingEnabled = true;
            }
            if ((bool)registeredBuffersField.GetValue(engine)!)
            {
                hasRegisteredBuffers = true;
            }
            object? providedBufferRing = providedBufferRingField.GetValue(engine);
            if (providedBufferRing is null)
            {
                continue;
            }

            hasProvidedBufferRing = true;
            // Resolve the ring's members lazily; the type is internal and may evolve.
            Type providedBufferRingType = providedBufferRing.GetType();
            PropertyInfo? availableCountProperty = providedBufferRingType.GetProperty("AvailableCount", BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance);
            PropertyInfo? inUseCountProperty = providedBufferRingType.GetProperty("InUseCount", BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance);
            PropertyInfo? recycledCountProperty = providedBufferRingType.GetProperty("RecycledCount", BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance);
            PropertyInfo? allocationFailureCountProperty = providedBufferRingType.GetProperty("AllocationFailureCount", BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance);
            PropertyInfo? bufferSizeProperty = providedBufferRingType.GetProperty("BufferSize", BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance);
            PropertyInfo? recommendedBufferSizeProperty = providedBufferRingType.GetProperty("RecommendedBufferSize", BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance);
            FieldInfo? bufferStatesField = providedBufferRingType.GetField("_bufferStates", BindingFlags.NonPublic | BindingFlags.Instance);

            // Skip the ring entirely unless every expected member is present,
            // so the counts stay internally consistent.
            if (availableCountProperty is null ||
                inUseCountProperty is null ||
                recycledCountProperty is null ||
                allocationFailureCountProperty is null ||
                bufferSizeProperty is null ||
                recommendedBufferSizeProperty is null ||
                bufferStatesField is null)
            {
                continue;
            }

            availableCount += Convert.ToInt32(availableCountProperty.GetValue(providedBufferRing));
            inUseCount += Convert.ToInt32(inUseCountProperty.GetValue(providedBufferRing));
            recycledCount += Convert.ToInt64(recycledCountProperty.GetValue(providedBufferRing));
            allocationFailureCount += Convert.ToInt64(allocationFailureCountProperty.GetValue(providedBufferRing));
            bufferSize = Math.Max(bufferSize, Convert.ToInt32(bufferSizeProperty.GetValue(providedBufferRing)));
            recommendedBufferSize = Math.Max(recommendedBufferSize, Convert.ToInt32(recommendedBufferSizeProperty.GetValue(providedBufferRing)));

            // One state byte per buffer, so the array length is the buffer count.
            byte[] bufferStates = (byte[])bufferStatesField.GetValue(providedBufferRing)!;
            totalBufferCount += bufferStates.Length;
        }

        return new IoUringProvidedBufferSnapshot(
            hasIoUringPort,
            supportsProvidedBufferRings,
            hasProvidedBufferRing,
            hasRegisteredBuffers,
            adaptiveBufferSizingEnabled,
            availableCount,
            inUseCount,
            totalBufferCount,
            bufferSize,
            recommendedBufferSize,
            recycledCount,
            allocationFailureCount);
    }
+
+ private static IoUringZeroCopySendSnapshot GetIoUringZeroCopySendSnapshot()
+ {
+ Assembly socketsAssembly = typeof(Socket).Assembly;
+ Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!;
+
+ FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!;
+ PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!;
+ FieldInfo supportsSendZcField = engineType.GetField("_supportsOpSendZc", BindingFlags.NonPublic | BindingFlags.Instance)!;
+ FieldInfo supportsSendMsgZcField = engineType.GetField("_supportsOpSendMsgZc", BindingFlags.NonPublic | BindingFlags.Instance)!;
+ FieldInfo zeroCopySendEnabledField = engineType.GetField("_zeroCopySendEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!;
+
+ bool hasIoUringPort = false;
+ bool supportsSendZc = false;
+ bool supportsSendMsgZc = false;
+ bool zeroCopySendEnabled = false;
+
+ foreach (object? engine in (Array)enginesField.GetValue(null)!)
+ {
+ if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)!)
+ {
+ continue;
+ }
+
+ hasIoUringPort = true;
+ if ((bool)supportsSendZcField.GetValue(engine)!)
+ {
+ supportsSendZc = true;
+ }
+
+ if ((bool)supportsSendMsgZcField.GetValue(engine)!)
+ {
+ supportsSendMsgZc = true;
+ }
+
+ if ((bool)zeroCopySendEnabledField.GetValue(engine)!)
+ {
+ zeroCopySendEnabled = true;
+ }
+ }
+
+ return new IoUringZeroCopySendSnapshot(
+ hasIoUringPort,
+ supportsSendZc,
+ supportsSendMsgZc,
+ zeroCopySendEnabled);
+ }
+
+ private static IoUringFixedRecvSnapshot GetIoUringFixedRecvSnapshot()
+ {
+ Assembly socketsAssembly = typeof(Socket).Assembly;
+ Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!;
+
+ FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!;
+ PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!;
+ FieldInfo supportsReadFixedField = engineType.GetField("_supportsOpReadFixed", BindingFlags.NonPublic | BindingFlags.Instance)!;
+ FieldInfo registeredBuffersField = engineType.GetField("_ioUringBuffersRegistered", BindingFlags.NonPublic | BindingFlags.Instance)!;
+
+ bool hasIoUringPort = false;
+ bool supportsReadFixed = false;
+ bool hasRegisteredBuffers = false;
+
+ foreach (object? engine in (Array)enginesField.GetValue(null)!)
+ {
+ if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)!)
+ {
+ continue;
+ }
+
+ hasIoUringPort = true;
+ supportsReadFixed |= (bool)supportsReadFixedField.GetValue(engine)!;
+ hasRegisteredBuffers |= (bool)registeredBuffersField.GetValue(engine)!;
+ }
+
+ return new IoUringFixedRecvSnapshot(
+ hasIoUringPort,
+ supportsReadFixed,
+ hasRegisteredBuffers);
+ }
+
+ private static IoUringSqPollSnapshot GetIoUringSqPollSnapshot()
+ {
+ Assembly socketsAssembly = typeof(Socket).Assembly;
+ Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!;
+
+ FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!;
+ PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!;
+ FieldInfo sqPollEnabledField = engineType.GetField("_sqPollEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!;
+
+ bool hasIoUringPort = false;
+ bool sqPollEnabled = false;
+ foreach (object? engine in (Array)enginesField.GetValue(null)!)
+ {
+ if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)!)
+ {
+ continue;
+ }
+
+ hasIoUringPort = true;
+ sqPollEnabled |= (bool)sqPollEnabledField.GetValue(engine)!;
+ }
+
+ return new IoUringSqPollSnapshot(hasIoUringPort, sqPollEnabled);
+ }
+
+ private static bool IsAnyIoUringSqPollEngineNeedingWakeup()
+ {
+ Assembly socketsAssembly = typeof(Socket).Assembly;
+ Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!;
+
+ FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!;
+ PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!;
+ FieldInfo sqPollEnabledField = engineType.GetField("_sqPollEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!;
+ MethodInfo sqNeedWakeupMethod = engineType.GetMethod("SqNeedWakeup", BindingFlags.NonPublic | BindingFlags.Instance)!;
+
+ foreach (object? engine in (Array)enginesField.GetValue(null)!)
+ {
+ if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)! || !(bool)sqPollEnabledField.GetValue(engine)!)
+ {
+ continue;
+ }
+
+ if (sqNeedWakeupMethod.Invoke(engine, null) is bool needsWakeup && needsWakeup)
+ {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
    /// <summary>
    /// For the first io_uring engine with SQPOLL enabled, cross-checks the managed
    /// SqNeedWakeup() result against the raw IORING_SQ_NEED_WAKEUP bit (0x1) read
    /// directly from the mapped SQ flags word. Returns true once one engine was
    /// validated; false when no eligible engine exists.
    /// </summary>
    private static bool ValidateSqNeedWakeupMatchesRawSqFlagBit()
    {
        Assembly socketsAssembly = typeof(Socket).Assembly;
        Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!;

        FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!;
        PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!;
        FieldInfo sqPollEnabledField = engineType.GetField("_sqPollEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!;
        FieldInfo sqFlagsPtrField = engineType.GetField("_managedSqFlagsPtr", BindingFlags.NonPublic | BindingFlags.Instance)!;
        MethodInfo sqNeedWakeupMethod = engineType.GetMethod("SqNeedWakeup", BindingFlags.NonPublic | BindingFlags.Instance)!;

        foreach (object? engine in (Array)enginesField.GetValue(null)!)
        {
            // Only io_uring engines with SQPOLL on have a meaningful wakeup flag.
            if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)! || !(bool)sqPollEnabledField.GetValue(engine)!)
            {
                continue;
            }

            bool methodValue = (bool)sqNeedWakeupMethod.Invoke(engine, null)!;
            object? pointerBoxed = sqFlagsPtrField.GetValue(engine);
            if (pointerBoxed is null)
            {
                // No mapped flags pointer: the conservative contract is "always wake up".
                Assert.True(methodValue, "SqNeedWakeup should return true when SQ flags pointer is unavailable.");
                return true;
            }

            unsafe
            {
                // 0x1 is the IORING_SQ_NEED_WAKEUP bit in the SQ ring flags word;
                // a null pointer again falls back to "needs wakeup".
                uint* ptr = (uint*)Pointer.Unbox(pointerBoxed);
                bool rawValue = ptr == null || (Volatile.Read(ref *ptr) & 0x1u) != 0;
                Assert.Equal(rawValue, methodValue);
            }

            // Validate only the first eligible engine.
            return true;
        }

        return false;
    }
+
+ private static void EnableSqPollAppContextOptIn() =>
+ AppContext.SetSwitch("System.Net.Sockets.IoUring.EnableSqPoll", true);
+
    /// <summary>
    /// Counts, across all io_uring engines, (1) active zero-copy pin holds — slots
    /// in the _zeroCopyPinHolds array holding a non-default MemoryHandle — and
    /// (2) completion slots whose ZeroCopyNotificationPending flag is set.
    /// </summary>
    private static IoUringZeroCopyPinHoldSnapshot GetIoUringZeroCopyPinHoldSnapshot()
    {
        Assembly socketsAssembly = typeof(Socket).Assembly;
        Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!;

        FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!;
        PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!;
        FieldInfo zeroCopyPinHoldsField = engineType.GetField("_zeroCopyPinHolds", BindingFlags.NonPublic | BindingFlags.Instance)!;
        FieldInfo completionSlotsField = engineType.GetField("_completionSlots", BindingFlags.NonPublic | BindingFlags.Instance)!;

        bool hasIoUringPort = false;
        int activePinHolds = 0;
        int pendingNotificationCount = 0;

        foreach (object? engine in (Array)enginesField.GetValue(null)!)
        {
            if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)!)
            {
                continue;
            }

            hasIoUringPort = true;

            if (zeroCopyPinHoldsField.GetValue(engine) is MemoryHandle[] pinHolds)
            {
                foreach (MemoryHandle pinHold in pinHolds)
                {
                    // A default(MemoryHandle) marks an empty slot; anything else is an active pin.
                    if (!pinHold.Equals(default(MemoryHandle)))
                    {
                        activePinHolds++;
                    }
                }
            }

            if (completionSlotsField.GetValue(engine) is Array completionSlots)
            {
                // The slot type is internal; resolve its pending flag per engine.
                Type? completionSlotType = completionSlots.GetType().GetElementType();
                FieldInfo? zeroCopyPendingField = completionSlotType?.GetField("ZeroCopyNotificationPending", BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance);
                if (zeroCopyPendingField is null)
                {
                    continue;
                }

                foreach (object? slot in completionSlots)
                {
                    if (slot is not null && (bool)zeroCopyPendingField.GetValue(slot)!)
                    {
                        pendingNotificationCount++;
                    }
                }
            }
        }

        return new IoUringZeroCopyPinHoldSnapshot(
            hasIoUringPort,
            activePinHolds,
            pendingNotificationCount);
    }
+
    /// <summary>
    /// Test hook: artificially exhausts the first eligible engine's provided
    /// buffer ring by marking every buffer state as checked-out and zeroing the
    /// available count. Returns true (and the number of buffers forced) when a
    /// ring was mutated; false when no engine/ring/fields were found.
    /// NOTE(review): directly mutates engine-internal state — callers must undo
    /// this (see TryRecycleForcedIoUringProvidedBufferRingForTest) before further use.
    /// </summary>
    private static bool TryForceIoUringProvidedBufferRingExhaustionForTest(out int forcedBufferCount)
    {
        Assembly socketsAssembly = typeof(Socket).Assembly;
        Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!;

        FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!;
        PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!;
        FieldInfo supportsProvidedBufferRingsField = engineType.GetField("_supportsProvidedBufferRings", BindingFlags.NonPublic | BindingFlags.Instance)!;
        FieldInfo providedBufferRingField = engineType.GetField("_ioUringProvidedBufferRing", BindingFlags.NonPublic | BindingFlags.Instance)!;

        foreach (object? engine in (Array)enginesField.GetValue(null)!)
        {
            if (engine is null ||
                !(bool)isIoUringEnabledProperty.GetValue(engine)! ||
                !(bool)supportsProvidedBufferRingsField.GetValue(engine)!)
            {
                continue;
            }

            object? providedBufferRing = providedBufferRingField.GetValue(engine);
            if (providedBufferRing is null)
            {
                continue;
            }

            Type providedBufferRingType = providedBufferRing.GetType();
            FieldInfo? bufferStatesField = providedBufferRingType.GetField("_bufferStates", BindingFlags.NonPublic | BindingFlags.Instance);
            FieldInfo? availableCountField = providedBufferRingType.GetField("_availableCount", BindingFlags.NonPublic | BindingFlags.Instance);
            FieldInfo? inUseCountField = providedBufferRingType.GetField("_inUseCount", BindingFlags.NonPublic | BindingFlags.Instance);

            if (bufferStatesField is null || availableCountField is null || inUseCountField is null)
            {
                continue;
            }

            // Mark every buffer as checked out (state value 2), then make the
            // counters agree: nothing available, everything in use.
            byte[] bufferStates = (byte[])bufferStatesField.GetValue(providedBufferRing)!;
            for (int i = 0; i < bufferStates.Length; i++)
            {
                bufferStates[i] = 2; // BufferStateCheckedOut
            }

            availableCountField.SetValue(providedBufferRing, 0);
            inUseCountField.SetValue(providedBufferRing, bufferStates.Length);
            forcedBufferCount = bufferStates.Length;
            return true;
        }

        forcedBufferCount = 0;
        return false;
    }
+
+ private static bool TryRecycleForcedIoUringProvidedBufferRingForTest(out int recycledBufferCount)
+ {
+ Assembly socketsAssembly = typeof(Socket).Assembly;
+ Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!;
+
+ FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!;
+ PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!;
+ FieldInfo supportsProvidedBufferRingsField = engineType.GetField("_supportsProvidedBufferRings", BindingFlags.NonPublic | BindingFlags.Instance)!;
+ FieldInfo providedBufferRingField = engineType.GetField("_ioUringProvidedBufferRing", BindingFlags.NonPublic | BindingFlags.Instance)!;
+
+ foreach (object? engine in (Array)enginesField.GetValue(null)!)
+ {
+ if (engine is null ||
+ !(bool)isIoUringEnabledProperty.GetValue(engine)! ||
+ !(bool)supportsProvidedBufferRingsField.GetValue(engine)!)
+ {
+ continue;
+ }
+
+ object? providedBufferRing = providedBufferRingField.GetValue(engine);
+ if (providedBufferRing is null)
+ {
+ continue;
+ }
+
+ MethodInfo? recycleMethod = providedBufferRing.GetType().GetMethod(
+ "RecycleCheckedOutBuffersForTeardown",
+ BindingFlags.NonPublic | BindingFlags.Instance);
+ if (recycleMethod is null)
+ {
+ continue;
+ }
+
+ recycledBufferCount = Convert.ToInt32(recycleMethod.Invoke(providedBufferRing, null));
+ return true;
+ }
+
+ recycledBufferCount = 0;
+ return false;
+ }
+
+ private static ulong CounterDelta(ulong before, ulong after) =>
+ after >= before ? after - before : after;
+
+ private static async Task WithIoUringNativeDiagnosticsSnapshotDeltaAsync(
+ Func scenario,
+ Action validateDelta,
+ int settleDelayMilliseconds = 0,
+ bool skipScenarioWhenIoUringUnavailable = false)
+ {
+ IoUringNativeDiagnosticsSnapshot diagnosticsBefore = GetIoUringNativeDiagnosticsSnapshot();
+ if (skipScenarioWhenIoUringUnavailable && !diagnosticsBefore.HasIoUringPort)
+ {
+ return;
+ }
+
+ await scenario();
+
+ if (settleDelayMilliseconds > 0)
+ {
+ await Task.Delay(settleDelayMilliseconds);
+ }
+
+ IoUringNativeDiagnosticsSnapshot diagnosticsAfter = GetIoUringNativeDiagnosticsSnapshot();
+ if (!diagnosticsBefore.HasIoUringPort && !diagnosticsAfter.HasIoUringPort)
+ {
+ return;
+ }
+
+ validateDelta(diagnosticsBefore, diagnosticsAfter);
+ }
+
+ private static Task StartReceiveMessageFromAsync(Socket socket, SocketAsyncEventArgs eventArgs)
+ => StartSocketAsyncEventArgsOperation(socket, eventArgs, static (s, args) => s.ReceiveMessageFromAsync(args));
+
+ private static Task StartSocketAsyncEventArgsOperation(
+ Socket socket,
+ SocketAsyncEventArgs eventArgs,
+ Func startOperation)
+ {
+ var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
+ EventHandler handler = null!;
+ handler = (_, completedArgs) =>
+ {
+ eventArgs.Completed -= handler;
+ tcs.TrySetResult(completedArgs);
+ };
+
+ eventArgs.Completed += handler;
+ if (!startOperation(socket, eventArgs))
+ {
+ eventArgs.Completed -= handler;
+ tcs.TrySetResult(eventArgs);
+ }
+
+ return tcs.Task;
+ }
+
+ private static async Task<(Socket Listener, Socket Client, Socket Server)> CreateConnectedTcpSocketTrioAsync(int listenBacklog = 1)
+ {
+ Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ try
+ {
+ listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ listener.Listen(listenBacklog);
+
+ Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ try
+ {
+ Task acceptTask = listener.AcceptAsync();
+ await client.ConnectAsync((IPEndPoint)listener.LocalEndPoint!);
+ Socket server = await acceptTask;
+ return (listener, client, server);
+ }
+ catch
+ {
+ client.Dispose();
+ throw;
+ }
+ }
+ catch
+ {
+ listener.Dispose();
+ throw;
+ }
+ }
+
+ private static async Task<(Socket Client, Socket Server)> AcceptConnectedTcpPairAsync(Socket listener, IPEndPoint endpoint)
+ {
+ Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ try
+ {
+ Task acceptTask = listener.AcceptAsync();
+ await client.ConnectAsync(endpoint);
+ Socket server = await acceptTask;
+ return (client, server);
+ }
+ catch
+ {
+ client.Dispose();
+ throw;
+ }
+ }
+
+ private static async Task RunTcpRoundTripAsync(int iterations)
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] sendBuffer = new byte[] { 1 };
+ byte[] receiveBuffer = new byte[1];
+
+ for (int i = 0; i < iterations; i++)
+ {
+ var serverReceiveTask = server.ReceiveAsync(receiveBuffer, SocketFlags.None);
+ await Task.Yield();
+
+ int clientSent = await client.SendAsync(sendBuffer, SocketFlags.None);
+ Assert.Equal(1, clientSent);
+
+ int serverReceived = await serverReceiveTask;
+ Assert.Equal(1, serverReceived);
+ Assert.Equal(sendBuffer[0], receiveBuffer[0]);
+
+ var clientReceiveTask = client.ReceiveAsync(receiveBuffer, SocketFlags.None);
+ await Task.Yield();
+
+ int serverSent = await server.SendAsync(sendBuffer, SocketFlags.None);
+ Assert.Equal(1, serverSent);
+
+ int clientReceived = await clientReceiveTask;
+ Assert.Equal(1, clientReceived);
+ Assert.Equal(sendBuffer[0], receiveBuffer[0]);
+
+ unchecked
+ {
+ sendBuffer[0]++;
+ }
+ }
+ }
+
+ private static async Task RunBufferListSendRoundTripAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] payload = new byte[] { 0x11, 0x22, 0x33, 0x44, 0x55 };
+ var sendBuffers = new List>
+ {
+ new ArraySegment(payload, 0, 2),
+ new ArraySegment(payload, 2, 1),
+ new ArraySegment(payload, 3, 2)
+ };
+
+ byte[] receiveBuffer = new byte[payload.Length];
+ Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+
+ int sent = await client.SendAsync(sendBuffers, SocketFlags.None);
+ Assert.Equal(payload.Length, sent);
+ Assert.Equal(payload.Length, await receiveTask);
+ Assert.Equal(payload, receiveBuffer);
+ }
+
+ private static async Task RunReceiveMessageFromRoundTripAsync()
+ {
+ using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+
+ receiver.SetSocketOption(SocketOptionLevel.IP, SocketOptionName.PacketInformation, true);
+ receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ sender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+ byte[] payload = new byte[] { 0x91, 0x92, 0x93 };
+ byte[] receiveBuffer = new byte[payload.Length];
+ EndPoint remoteEndPoint = new IPEndPoint(IPAddress.Any, 0);
+
+ var receiveTask = receiver.ReceiveMessageFromAsync(receiveBuffer, SocketFlags.None, remoteEndPoint);
+ await Task.Yield();
+
+ int sent = await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!);
+ Assert.Equal(payload.Length, sent);
+
+ SocketReceiveMessageFromResult result = await receiveTask;
+ Assert.Equal(payload.Length, result.ReceivedBytes);
+ Assert.Equal(payload, receiveBuffer);
+ Assert.Equal(sender.LocalEndPoint, result.RemoteEndPoint);
+ }
+
+ private static async Task RunReceiveMessageFromPacketInformationRoundTripAsync(bool useIpv6)
+ {
+ if (useIpv6 && !Socket.OSSupportsIPv6)
+ {
+ return;
+ }
+
+ AddressFamily addressFamily = useIpv6 ? AddressFamily.InterNetworkV6 : AddressFamily.InterNetwork;
+ SocketOptionLevel optionLevel = useIpv6 ? SocketOptionLevel.IPv6 : SocketOptionLevel.IP;
+ IPAddress loopbackAddress = useIpv6 ? IPAddress.IPv6Loopback : IPAddress.Loopback;
+ IPAddress anyAddress = useIpv6 ? IPAddress.IPv6Any : IPAddress.Any;
+
+ using Socket receiver = new Socket(addressFamily, SocketType.Dgram, ProtocolType.Udp);
+ using Socket sender = new Socket(addressFamily, SocketType.Dgram, ProtocolType.Udp);
+
+ receiver.SetSocketOption(optionLevel, SocketOptionName.PacketInformation, true);
+ receiver.Bind(new IPEndPoint(loopbackAddress, 0));
+ sender.Bind(new IPEndPoint(loopbackAddress, 0));
+
+ byte[] payload = useIpv6 ?
+ new byte[] { 0xA1, 0xA2, 0xA3 } :
+ new byte[] { 0x90, 0x91, 0x92, 0x93 };
+ byte[] receiveBuffer = new byte[payload.Length];
+ EndPoint remoteEndPoint = new IPEndPoint(anyAddress, 0);
+
+ Task receiveTask =
+ ToTask(receiver.ReceiveMessageFromAsync(receiveBuffer, SocketFlags.None, remoteEndPoint));
+ await Task.Yield();
+
+ int sent = await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!);
+ Assert.Equal(payload.Length, sent);
+
+ SocketReceiveMessageFromResult result = await receiveTask;
+ Assert.Equal(payload.Length, result.ReceivedBytes);
+ Assert.Equal(payload, receiveBuffer);
+ Assert.Equal(sender.LocalEndPoint, result.RemoteEndPoint);
+ Assert.Equal(((IPEndPoint)sender.LocalEndPoint!).Address, result.PacketInformation.Address);
+ }
+
+ private static async Task RunNonPinnableMemorySendFallbackScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] payload = new byte[] { 0x71, 0x72, 0x73, 0x74 };
+ using var nonPinnableMemory = new NonPinnableMemoryManager(payload);
+ byte[] receiveBuffer = new byte[payload.Length];
+
+ Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ int sent = await client.SendAsync(nonPinnableMemory.Memory, SocketFlags.None);
+ Assert.Equal(payload.Length, sent);
+ Assert.Equal(payload.Length, await receiveTask);
+ Assert.Equal(payload, receiveBuffer);
+ }
+
+ private static async Task RunNonPinnableMemoryReceiveFallbackScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] receiveBuffer = new byte[4];
+ using var nonPinnableMemory = new NonPinnableMemoryManager(receiveBuffer);
+ byte[] payload = new byte[] { 0x81, 0x82, 0x83, 0x84 };
+
+ Task receiveTask = ToTask(server.ReceiveAsync(nonPinnableMemory.Memory, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(payload.Length, await client.SendAsync(payload, SocketFlags.None));
+ Assert.Equal(payload.Length, await receiveTask);
+ Assert.Equal(payload, receiveBuffer);
+ }
+
+ private static Task RunNonPinnableMemoryFallbackScenarioAsync(bool receivePath) =>
+ receivePath ? RunNonPinnableMemoryReceiveFallbackScenarioAsync() : RunNonPinnableMemorySendFallbackScenarioAsync();
+
        /// <summary>
        /// Exercises both non-pinnable fallback paths (send and receive) and asserts that the
        /// io_uring "prepare non-pinnable fallback" counter increased as a result. Skipped when
        /// io_uring is unavailable (via skipScenarioWhenIoUringUnavailable).
        /// </summary>
        private static async Task RunNonPinnableFallbackTelemetryScenarioAsync()
        {
            long before = 0;
            long after = 0;

            // The counter is sampled immediately before and after the two fallback scenarios,
            // inside the diagnostics snapshot-delta scope; the assertion runs in the verify callback.
            await WithIoUringNativeDiagnosticsSnapshotDeltaAsync(
                async () =>
                {
                    before = GetIoUringPrepareNonPinnableFallbackCounterValue();
                    await RunNonPinnableMemorySendFallbackScenarioAsync();
                    await RunNonPinnableMemoryReceiveFallbackScenarioAsync();
                    after = GetIoUringPrepareNonPinnableFallbackCounterValue();
                },
                (_, _) =>
                {
                    Assert.True(
                        after > before,
                        $"Expected io_uring non-pinnable fallback telemetry to increase. before={before}, after={after}");
                },
                skipScenarioWhenIoUringUnavailable: true);
        }
+
+ private static async Task RunPinnableMemoryPinReleaseLifecycleScenarioAsync()
+ {
+ await WithIoUringNativeDiagnosticsSnapshotDeltaAsync(
+ async () =>
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ // Completion path: receive completes with data and must release pin.
+ byte[] completionPayload = new byte[] { 0x91 };
+ using var completionMemory = new TrackingPinnableMemoryManager(new byte[completionPayload.Length]);
+ Task completionReceive = ToTask(server.ReceiveAsync(completionMemory.Memory, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(completionPayload, SocketFlags.None));
+ Assert.Equal(1, await completionReceive);
+ Assert.Equal(completionPayload, completionMemory.GetSpan().ToArray());
+ await AssertPinsReleasedAsync(completionMemory);
+
+ // Cancellation path: pending receive canceled by token must release pin.
+ using var cancellationMemory = new TrackingPinnableMemoryManager(new byte[16]);
+ using (var cts = new CancellationTokenSource())
+ {
+ Task canceledReceive = ToTask(server.ReceiveAsync(cancellationMemory.Memory, SocketFlags.None, cts.Token));
+ await Task.Delay(20);
+ cts.Cancel();
+
+ Exception? canceledException = await Record.ExceptionAsync(async () => await canceledReceive);
+ AssertCanceledOrInterrupted(canceledException);
+ }
+
+ await AssertPinsReleasedAsync(cancellationMemory);
+
+ // Teardown/abort path: pending receive interrupted by close must release pin.
+ using var teardownMemory = new TrackingPinnableMemoryManager(new byte[16]);
+ Task teardownReceive = ToTask(server.ReceiveAsync(teardownMemory.Memory, SocketFlags.None));
+ await Task.Yield();
+ client.Dispose();
+ server.Dispose();
+
+ Exception? teardownException = await Record.ExceptionAsync(async () => await teardownReceive);
+ AssertCanceledDisposedOrInterrupted(teardownException);
+ await AssertPinsReleasedAsync(teardownMemory);
+ },
+ static (_, _) => { },
+ skipScenarioWhenIoUringUnavailable: true);
+ }
+
+ private static async Task RunProvidedBufferRegistrationLifecycleScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] receiveBuffer = new byte[1];
+ Task initialReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0xA1 }, SocketFlags.None));
+ Assert.Equal(1, await initialReceive);
+
+ IoUringProvidedBufferSnapshot initialSnapshot = GetIoUringProvidedBufferSnapshot();
+ if (!initialSnapshot.IsUsable)
+ {
+ return;
+ }
+
+ Assert.Equal(initialSnapshot.TotalBufferCount, initialSnapshot.AvailableCount + initialSnapshot.InUseCount);
+ Assert.Equal(0, initialSnapshot.InUseCount);
+
+ using (var cts = new CancellationTokenSource())
+ {
+ Task canceledReceive = ToTask(server.ReceiveAsync(new byte[1], SocketFlags.None, cts.Token));
+ await Task.Yield();
+ cts.Cancel();
+
+ Exception? canceledException = await Record.ExceptionAsync(async () => await canceledReceive);
+ AssertCanceledOrInterrupted(canceledException);
+ }
+
+ await Task.Delay(50);
+ IoUringProvidedBufferSnapshot postCancellationSnapshot = GetIoUringProvidedBufferSnapshot();
+ Assert.Equal(initialSnapshot.TotalBufferCount, postCancellationSnapshot.TotalBufferCount);
+ Assert.Equal(postCancellationSnapshot.TotalBufferCount, postCancellationSnapshot.AvailableCount + postCancellationSnapshot.InUseCount);
+ Assert.Equal(0, postCancellationSnapshot.InUseCount);
+ }
+
+ private static async Task RunProvidedBufferSelectReceiveScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
+ if (!beforeSnapshot.IsUsable)
+ {
+ return;
+ }
+
+ ulong recycleBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles");
+ ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");
+
+ byte[] receiveBuffer = new byte[1];
+ Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0xB2 }, SocketFlags.None));
+ Assert.Equal(1, await receiveTask);
+ Assert.Equal(0xB2, receiveBuffer[0]);
+
+ IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot();
+ ulong recycleAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles");
+ ulong depletionAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");
+
+ Assert.True(recycleAfter > recycleBefore, "Expected provided-buffer recycle counter to increase after a completion.");
+ Assert.Equal(depletionBefore, depletionAfter);
+ Assert.Equal(afterSnapshot.TotalBufferCount, afterSnapshot.AvailableCount + afterSnapshot.InUseCount);
+ Assert.Equal(0, afterSnapshot.InUseCount);
+ }
+
+ private static async Task RunProvidedBufferRecycleReuseScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
+ if (!beforeSnapshot.IsUsable)
+ {
+ return;
+ }
+
+ ulong recycleBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles");
+ ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");
+ long allocationFailuresBefore = beforeSnapshot.AllocationFailureCount;
+
+ int iterations = Math.Max(beforeSnapshot.TotalBufferCount + 64, 512);
+ byte[] receiveBuffer = new byte[1];
+ byte[] payload = new byte[1];
+
+ for (int i = 0; i < iterations; i++)
+ {
+ Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+
+ payload[0] = unchecked((byte)i);
+ Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
+ Assert.Equal(1, await receiveTask);
+ Assert.Equal(payload[0], receiveBuffer[0]);
+ }
+
+ IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot();
+ ulong recycleAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles");
+ ulong depletionAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");
+
+ Assert.True(
+ recycleAfter >= recycleBefore + (ulong)iterations,
+ $"Expected at least {iterations} provided-buffer recycle increments. before={recycleBefore}, after={recycleAfter}");
+ Assert.Equal(depletionBefore, depletionAfter);
+ Assert.Equal(allocationFailuresBefore, afterSnapshot.AllocationFailureCount);
+ Assert.Equal(beforeSnapshot.TotalBufferCount, afterSnapshot.TotalBufferCount);
+ Assert.Equal(0, afterSnapshot.InUseCount);
+ Assert.Equal(afterSnapshot.TotalBufferCount, afterSnapshot.AvailableCount);
+ }
+
+ private static async Task RunProvidedBufferExhaustionScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] warmupBuffer = new byte[1];
+ Task warmupReceive = ToTask(server.ReceiveAsync(warmupBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0xC1 }, SocketFlags.None));
+ Assert.Equal(1, await warmupReceive);
+
+ IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
+ if (!snapshot.IsUsable)
+ {
+ return;
+ }
+
+ ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");
+ Assert.True(TryForceIoUringProvidedBufferRingExhaustionForTest(out int forcedBufferCount));
+ Assert.True(forcedBufferCount > 0);
+
+ byte[] receiveBuffer = new byte[1];
+ Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0xC2 }, SocketFlags.None));
+ Task completed = await Task.WhenAny(receiveTask, Task.Delay(TimeSpan.FromSeconds(15)));
+ Assert.Same(receiveTask, completed);
+
+ Exception? receiveException = await Record.ExceptionAsync(async () => await receiveTask);
+ SocketException socketException = Assert.IsType(receiveException);
+ Assert.Equal(SocketError.NoBufferSpaceAvailable, socketException.SocketErrorCode);
+ Assert.True(
+ GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions") > depletionBefore,
+ "Expected provided-buffer depletion counter to increase when ring buffers are forced unavailable.");
+ }
+
+ private static async Task RunProvidedBufferMixedWorkloadScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
+ if (!beforeSnapshot.IsUsable)
+ {
+ return;
+ }
+
+ using Socket udpReceiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ using Socket udpSender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ udpReceiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ udpSender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+ ulong recycleBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles");
+ ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");
+
+ byte[] tcpReceiveBuffer = new byte[1];
+ byte[] udpReceiveBuffer = new byte[2];
+
+ Task tcpReceive = ToTask(server.ReceiveAsync(tcpReceiveBuffer, SocketFlags.None));
+ Task udpReceive = ToTask(
+ udpReceiver.ReceiveFromAsync(
+ udpReceiveBuffer,
+ SocketFlags.None,
+ new IPEndPoint(IPAddress.Any, 0)));
+ await Task.Yield();
+
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0xD1 }, SocketFlags.None));
+ Assert.Equal(2, await udpSender.SendToAsync(new byte[] { 0xE1, 0xE2 }, SocketFlags.None, udpReceiver.LocalEndPoint!));
+
+ Assert.Equal(1, await tcpReceive);
+ Assert.Equal(0xD1, tcpReceiveBuffer[0]);
+
+ SocketReceiveFromResult udpResult = await udpReceive;
+ Assert.Equal(2, udpResult.ReceivedBytes);
+ Assert.Equal(0xE1, udpReceiveBuffer[0]);
+ Assert.Equal(0xE2, udpReceiveBuffer[1]);
+
+ IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot();
+ ulong recycleAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles");
+ ulong depletionAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");
+
+ Assert.True(recycleAfter > recycleBefore, "Expected provided-buffer recycle counter to increase in mixed workload.");
+ Assert.Equal(depletionBefore, depletionAfter);
+ Assert.Equal(afterSnapshot.TotalBufferCount, afterSnapshot.AvailableCount + afterSnapshot.InUseCount);
+ Assert.Equal(0, afterSnapshot.InUseCount);
+ }
+
+ private static async Task SendExactlyAsync(Socket socket, ReadOnlyMemory buffer)
+ {
+ int totalSent = 0;
+ while (totalSent < buffer.Length)
+ {
+ int sent = await socket.SendAsync(buffer.Slice(totalSent), SocketFlags.None);
+ Assert.True(sent > 0, "Socket.SendAsync returned 0 before sending all bytes.");
+ totalSent += sent;
+ }
+ }
+
+ private static async Task ReceiveExactlyAsync(Socket socket, Memory buffer)
+ {
+ int totalReceived = 0;
+ while (totalReceived < buffer.Length)
+ {
+ int received = await socket.ReceiveAsync(buffer.Slice(totalReceived), SocketFlags.None);
+ Assert.True(received > 0, "Socket.ReceiveAsync returned 0 before receiving all expected bytes.");
+ totalReceived += received;
+ }
+ }
+
+ private static async Task WaitForProvidedBufferSnapshotAsync(
+ Func predicate,
+ int timeoutMilliseconds = 10000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
+ while (DateTime.UtcNow < deadline)
+ {
+ if (predicate(snapshot))
+ {
+ return snapshot;
+ }
+
+ await Task.Delay(50);
+ snapshot = GetIoUringProvidedBufferSnapshot();
+ }
+
+ return snapshot;
+ }
+
        /// <summary>
        /// Drives a sustained workload of small (64-byte) messages and verifies adaptive
        /// provided-buffer sizing eventually recommends (or applies) a buffer size smaller
        /// than the initial one. No-op when adaptive sizing is not usable in this environment.
        /// </summary>
        private static async Task RunAdaptiveProvidedBufferSmallMessageShrinkScenarioAsync()
        {
            var trio = await CreateConnectedTcpSocketTrioAsync();
            using Socket _ = trio.Listener;
            using Socket client = trio.Client;
            using Socket server = trio.Server;

            IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
            if (!beforeSnapshot.IsAdaptiveSizingUsable)
            {
                return;
            }

            int initialBufferSize = beforeSnapshot.BufferSize;
            Assert.True(initialBufferSize > 0);

            const int payloadSize = 64;
            byte[] sendBuffer = new byte[payloadSize];
            byte[] receiveBuffer = new byte[payloadSize];

            // 320 iterations of small traffic to give the sizing heuristic enough samples.
            for (int i = 0; i < 320; i++)
            {
                sendBuffer.AsSpan().Fill(unchecked((byte)i));
                Task receiveTask = ReceiveExactlyAsync(server, receiveBuffer);
                await SendExactlyAsync(client, sendBuffer);
                await receiveTask;
                Assert.Equal(sendBuffer, receiveBuffer);
            }

            // Either the recommendation or the live buffer size shrinking satisfies the scenario.
            IoUringProvidedBufferSnapshot afterSnapshot = await WaitForProvidedBufferSnapshotAsync(
                snapshot => snapshot.IsAdaptiveSizingUsable &&
                    (snapshot.RecommendedBufferSize < initialBufferSize || snapshot.BufferSize < initialBufferSize));

            Assert.True(
                afterSnapshot.RecommendedBufferSize < initialBufferSize || afterSnapshot.BufferSize < initialBufferSize,
                $"Expected adaptive recommendation to shrink from {initialBufferSize}. " +
                $"actual buffer={afterSnapshot.BufferSize}, recommended={afterSnapshot.RecommendedBufferSize}");
        }
+
        /// <summary>
        /// Drives a sustained workload of messages exactly the current buffer size and verifies
        /// adaptive sizing eventually recommends (or applies) a larger buffer. No-op when
        /// adaptive sizing is not usable in this environment.
        /// </summary>
        private static async Task RunAdaptiveProvidedBufferLargeMessageGrowScenarioAsync()
        {
            var trio = await CreateConnectedTcpSocketTrioAsync();
            using Socket _ = trio.Listener;
            using Socket client = trio.Client;
            using Socket server = trio.Server;

            IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
            if (!beforeSnapshot.IsAdaptiveSizingUsable)
            {
                return;
            }

            int initialBufferSize = beforeSnapshot.BufferSize;
            Assert.True(initialBufferSize > 0);

            // Payload equals the current buffer size, so each receive fills a whole buffer —
            // the pressure signal expected to drive the size upward.
            int payloadSize = initialBufferSize;
            byte[] sendBuffer = new byte[payloadSize];
            byte[] receiveBuffer = new byte[payloadSize];
            sendBuffer.AsSpan().Fill(0x5A);

            for (int i = 0; i < 320; i++)
            {
                Task receiveTask = ReceiveExactlyAsync(server, receiveBuffer);
                await SendExactlyAsync(client, sendBuffer);
                await receiveTask;
                Assert.Equal(sendBuffer, receiveBuffer);
            }

            IoUringProvidedBufferSnapshot afterSnapshot = await WaitForProvidedBufferSnapshotAsync(
                snapshot => snapshot.IsAdaptiveSizingUsable &&
                    (snapshot.RecommendedBufferSize > initialBufferSize || snapshot.BufferSize > initialBufferSize));

            Assert.True(
                afterSnapshot.RecommendedBufferSize > initialBufferSize || afterSnapshot.BufferSize > initialBufferSize,
                $"Expected adaptive recommendation to grow from {initialBufferSize}. " +
                $"actual buffer={afterSnapshot.BufferSize}, recommended={afterSnapshot.RecommendedBufferSize}");
        }
+
        /// <summary>
        /// Alternates small (64-byte) and buffer-sized messages and verifies the adaptive
        /// recommendation stays at the initial size — a mixed workload should not trigger a
        /// resize in either direction. No-op when adaptive sizing is not usable.
        /// </summary>
        private static async Task RunAdaptiveProvidedBufferMixedWorkloadStableScenarioAsync()
        {
            var trio = await CreateConnectedTcpSocketTrioAsync();
            using Socket _ = trio.Listener;
            using Socket client = trio.Client;
            using Socket server = trio.Server;

            IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
            if (!beforeSnapshot.IsAdaptiveSizingUsable)
            {
                return;
            }

            int initialBufferSize = beforeSnapshot.BufferSize;
            Assert.True(initialBufferSize > 0);

            byte[] smallSend = new byte[64];
            byte[] smallReceive = new byte[64];
            byte[] largeSend = new byte[initialBufferSize];
            byte[] largeReceive = new byte[initialBufferSize];
            smallSend.AsSpan().Fill(0x11);
            largeSend.AsSpan().Fill(0x77);

            for (int i = 0; i < 320; i++)
            {
                // Alternate strictly: odd iterations large, even iterations small.
                bool useLarge = (i & 1) == 1;
                byte[] send = useLarge ? largeSend : smallSend;
                byte[] receive = useLarge ? largeReceive : smallReceive;

                Task receiveTask = ReceiveExactlyAsync(server, receive);
                await SendExactlyAsync(client, send);
                await receiveTask;
                Assert.Equal(send, receive);
            }

            // Brief settle time before sampling, then assert the recommendation did not move.
            await Task.Delay(250);
            IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot();
            Assert.True(afterSnapshot.IsAdaptiveSizingUsable);
            Assert.Equal(initialBufferSize, afterSnapshot.RecommendedBufferSize);
        }
+
        /// <summary>
        /// Keeps small traffic flowing while an adaptive shrink/resize swap happens, asserting
        /// every payload survives the swap intact and the active ring's buffer size actually
        /// shrinks (waits up to 15s). No-op when adaptive sizing is not usable.
        /// </summary>
        private static async Task RunAdaptiveProvidedBufferResizeSwapNoDataLossScenarioAsync()
        {
            var trio = await CreateConnectedTcpSocketTrioAsync();
            using Socket _ = trio.Listener;
            using Socket client = trio.Client;
            using Socket server = trio.Server;

            IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
            if (!beforeSnapshot.IsAdaptiveSizingUsable)
            {
                return;
            }

            int initialBufferSize = beforeSnapshot.BufferSize;
            Assert.True(initialBufferSize > 0);

            const int payloadSize = 64;
            byte[] sendBuffer = new byte[payloadSize];
            byte[] receiveBuffer = new byte[payloadSize];
            // Every round-trip is verified byte-for-byte, so any data loss during the ring
            // swap would fail here rather than in the final size assertion.
            for (int i = 0; i < 384; i++)
            {
                sendBuffer.AsSpan().Fill(unchecked((byte)i));
                Task receiveTask = ReceiveExactlyAsync(server, receiveBuffer);
                await SendExactlyAsync(client, sendBuffer);
                await receiveTask;
                Assert.Equal(sendBuffer, receiveBuffer);
            }

            IoUringProvidedBufferSnapshot afterSnapshot = await WaitForProvidedBufferSnapshotAsync(
                snapshot => snapshot.IsAdaptiveSizingUsable && snapshot.BufferSize < initialBufferSize,
                timeoutMilliseconds: 15000);

            Assert.True(
                afterSnapshot.BufferSize < initialBufferSize,
                $"Expected adaptive resize swap to shrink active ring. initial={initialBufferSize}, current={afterSnapshot.BufferSize}");
        }
+
        /// <summary>
        /// With adaptive sizing disabled, runs a sustained small-message workload and verifies
        /// neither the active buffer size nor the recommended size changes. No-op when the
        /// provided-buffer ring is not usable.
        /// </summary>
        private static async Task RunAdaptiveProvidedBufferDisabledScenarioAsync()
        {
            var trio = await CreateConnectedTcpSocketTrioAsync();
            using Socket _ = trio.Listener;
            using Socket client = trio.Client;
            using Socket server = trio.Server;

            IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
            if (!beforeSnapshot.IsUsable)
            {
                return;
            }

            // Precondition: the scenario is only meaningful when sizing is actually disabled.
            Assert.False(beforeSnapshot.AdaptiveBufferSizingEnabled);

            int initialBufferSize = beforeSnapshot.BufferSize;
            int initialRecommendedSize = beforeSnapshot.RecommendedBufferSize;

            const int payloadSize = 64;
            byte[] sendBuffer = new byte[payloadSize];
            byte[] receiveBuffer = new byte[payloadSize];
            sendBuffer.AsSpan().Fill(0xA5);

            for (int i = 0; i < 320; i++)
            {
                Task receiveTask = ReceiveExactlyAsync(server, receiveBuffer);
                await SendExactlyAsync(client, sendBuffer);
                await receiveTask;
                Assert.Equal(sendBuffer, receiveBuffer);
            }

            // Brief settle time, then both sizes must be exactly where they started.
            await Task.Delay(250);
            IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot();
            Assert.True(afterSnapshot.IsUsable);
            Assert.False(afterSnapshot.AdaptiveBufferSizingEnabled);
            Assert.Equal(initialBufferSize, afterSnapshot.BufferSize);
            Assert.Equal(initialRecommendedSize, afterSnapshot.RecommendedBufferSize);
        }
+
+ private static async Task RunAdaptiveProvidedBufferSizingStateScenarioAsync(bool expectedEnabled)
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ // Warm up receive path so io_uring provided-buffer ring state is initialized.
+ byte[] receiveBuffer = new byte[1];
+ Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0x42 }, SocketFlags.None));
+ Assert.Equal(1, await receiveTask);
+
+ IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
+ if (!snapshot.IsUsable)
+ {
+ return;
+ }
+
+ Assert.Equal(expectedEnabled, snapshot.AdaptiveBufferSizingEnabled);
+ }
+
+ private static async Task RunProvidedBufferKernelRegistrationDisabledScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ // Warm up receive path so io_uring provided-buffer ring state is initialized.
+ byte[] receiveBuffer = new byte[1];
+ Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0x42 }, SocketFlags.None));
+ Assert.Equal(1, await receiveTask);
+
+ IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
+ if (!snapshot.IsUsable)
+ {
+ return;
+ }
+
+ Assert.False(snapshot.HasRegisteredBuffers);
+ }
+
+ private static async Task RunProvidedBufferKernelRegistrationSuccessScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ // Warm up receive path so io_uring provided-buffer ring state and telemetry are initialized.
+ byte[] receiveBuffer = new byte[1];
+ Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0x42 }, SocketFlags.None));
+ Assert.Equal(1, await receiveTask);
+
+ IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
+ if (!snapshot.IsUsable)
+ {
+ return;
+ }
+
+ ulong successCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialSuccess");
+ ulong failureCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialFailure");
+ Assert.True(
+ successCount + failureCount > 0,
+ "Expected at least one registered-buffer initialization attempt.");
+
+ // Best-effort success-path assertion: only enforce when registration succeeded on this machine.
+ if (!snapshot.HasRegisteredBuffers)
+ {
+ return;
+ }
+
+ Assert.True(successCount > 0, "Expected success telemetry when registered buffers are active.");
+ }
+
+ private static async Task RunProvidedBufferKernelRegistrationFailureNonFatalScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ // Warm up receive path so io_uring provided-buffer ring state and telemetry are initialized.
+ byte[] receiveBuffer = new byte[1];
+ Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0x42 }, SocketFlags.None));
+ Assert.Equal(1, await receiveTask);
+
+ IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
+ if (!snapshot.IsUsable || snapshot.HasRegisteredBuffers)
+ {
+ // No observed registration failure in this environment.
+ return;
+ }
+
+ // Registration is not active: verify provided-buffer receive path still works.
+ byte[] payload = new byte[4096];
+ byte[] received = new byte[payload.Length];
+ for (int i = 0; i < payload.Length; i++)
+ {
+ payload[i] = unchecked((byte)(i + 31));
+ }
+
+ Task receiveAllTask = ReceiveExactlyAsync(server, received);
+ await SendExactlyAsync(client, payload);
+ await receiveAllTask;
+ Assert.Equal(payload, received);
+
+ ulong failureCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialFailure");
+ Assert.True(failureCount > 0, "Expected failure telemetry when registered buffers are inactive.");
+ }
+
        /// <summary>
        /// Triggers an adaptive shrink via sustained small traffic, then verifies at least one
        /// registered-buffer re-registration attempt (success or failure) was recorded for the
        /// resized ring. No-op when adaptive sizing is not usable.
        /// </summary>
        private static async Task RunProvidedBufferKernelReregistrationOnResizeScenarioAsync()
        {
            var trio = await CreateConnectedTcpSocketTrioAsync();
            using Socket _ = trio.Listener;
            using Socket client = trio.Client;
            using Socket server = trio.Server;

            IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
            if (!beforeSnapshot.IsAdaptiveSizingUsable)
            {
                return;
            }

            // Sample re-registration counters before the workload so only this scenario's
            // attempts are counted in the final delta.
            ulong reregSuccessBefore = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersReregistrationSuccess");
            ulong reregFailureBefore = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersReregistrationFailure");

            int initialBufferSize = beforeSnapshot.BufferSize;
            Assert.True(initialBufferSize > 0);

            const int payloadSize = 64;
            byte[] sendBuffer = new byte[payloadSize];
            byte[] receiveBuffer = new byte[payloadSize];
            for (int i = 0; i < 384; i++)
            {
                sendBuffer.AsSpan().Fill(unchecked((byte)(i + 1)));
                Task receivePayloadTask = ReceiveExactlyAsync(server, receiveBuffer);
                await SendExactlyAsync(client, sendBuffer);
                await receivePayloadTask;
                Assert.Equal(sendBuffer, receiveBuffer);
            }

            IoUringProvidedBufferSnapshot afterSnapshot = await WaitForProvidedBufferSnapshotAsync(
                snapshot => snapshot.IsAdaptiveSizingUsable && snapshot.BufferSize < initialBufferSize,
                timeoutMilliseconds: 15000);

            Assert.True(afterSnapshot.BufferSize < initialBufferSize);

            // Either outcome counts: the contract is that a re-registration was attempted.
            ulong reregSuccessAfter = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersReregistrationSuccess");
            ulong reregFailureAfter = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersReregistrationFailure");
            Assert.True(
                (reregSuccessAfter + reregFailureAfter) > (reregSuccessBefore + reregFailureBefore),
                "Expected at least one registered-buffer re-registration attempt after adaptive resize.");
        }
+
        /// <summary>
        /// With registered buffers active, alternates small and buffer-sized payloads and
        /// verifies every byte round-trips intact. No-op when the ring is not usable or
        /// registration is not active on this machine.
        /// </summary>
        private static async Task RunProvidedBufferRegisteredBuffersDataCorrectnessScenarioAsync()
        {
            var trio = await CreateConnectedTcpSocketTrioAsync();
            using Socket _ = trio.Listener;
            using Socket client = trio.Client;
            using Socket server = trio.Server;

            IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
            if (!snapshot.IsUsable || !snapshot.HasRegisteredBuffers)
            {
                return;
            }

            // Reuse the mixed workload profile to validate payload correctness with registered buffers active.
            byte[] smallSend = new byte[64];
            byte[] largeSend = new byte[Math.Max(snapshot.BufferSize, 4096)];
            byte[] smallReceive = new byte[smallSend.Length];
            byte[] largeReceive = new byte[largeSend.Length];

            for (int i = 0; i < 64; i++)
            {
                // Distinct fill values per iteration so stale data from a previous round would be detected.
                smallSend.AsSpan().Fill(unchecked((byte)(i + 5)));
                largeSend.AsSpan().Fill(unchecked((byte)(i + 11)));

                Task smallReceiveTask = ReceiveExactlyAsync(server, smallReceive);
                await SendExactlyAsync(client, smallSend);
                await smallReceiveTask;
                Assert.Equal(smallSend, smallReceive);

                Task largeReceiveTask = ReceiveExactlyAsync(server, largeReceive);
                await SendExactlyAsync(client, largeSend);
                await largeReceiveTask;
                Assert.Equal(largeSend, largeReceive);
            }
        }
+
        /// <summary>
        /// Round-trips a moderately sized payload and verifies registration telemetry is
        /// consistent with the snapshot's registration state: success telemetry when buffers
        /// are registered, failure telemetry when registration fell back. No-op when the
        /// ring is not usable.
        /// </summary>
        private static async Task RunProvidedBufferRegistrationMemoryPressureScenarioAsync()
        {
            var trio = await CreateConnectedTcpSocketTrioAsync();
            using Socket _ = trio.Listener;
            using Socket client = trio.Client;
            using Socket server = trio.Server;

            IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
            if (!snapshot.IsUsable)
            {
                return;
            }

            // Clamp the payload between 1KiB and min(buffer size, 16KiB).
            int payloadSize = Math.Min(snapshot.BufferSize, 16 * 1024);
            payloadSize = Math.Max(payloadSize, 1024);
            byte[] payload = new byte[payloadSize];
            byte[] received = new byte[payloadSize];
            for (int i = 0; i < payload.Length; i++)
            {
                payload[i] = unchecked((byte)(i + 41));
            }

            Task receiveTask = ReceiveExactlyAsync(server, received);
            await SendExactlyAsync(client, payload);
            await receiveTask;
            Assert.Equal(payload, received);

            ulong successCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialSuccess");
            ulong failureCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialFailure");
            if (snapshot.HasRegisteredBuffers)
            {
                Assert.True(successCount > 0, "Expected successful registration telemetry when buffers are registered.");
            }
            else
            {
                Assert.True(failureCount > 0, "Expected failure telemetry when registration falls back under pressure.");
            }
        }
+
+ private static Task RunProvidedBufferTeardownOrderingContractScenarioAsync()
+ {
+ Type engineType = typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!;
+ MethodInfo teardownMethod = engineType.GetMethod("LinuxFreeIoUringResources", BindingFlags.NonPublic | BindingFlags.Instance)!;
+ MethodInfo freeProvidedBufferRingMethod = engineType.GetMethod("FreeIoUringProvidedBufferRing", BindingFlags.NonPublic | BindingFlags.Instance)!;
+ MethodInfo cleanupManagedRingsMethod = engineType.GetMethod("CleanupManagedRings", BindingFlags.NonPublic | BindingFlags.Instance)!;
+
+ byte[] ilBytes = teardownMethod.GetMethodBody()?.GetILAsByteArray() ?? Array.Empty();
+ Assert.NotEmpty(ilBytes);
+ ReadOnlySpan il = ilBytes;
+
+ int freeProvidedBufferRingOffset = FindCallInstructionOffset(il, freeProvidedBufferRingMethod.MetadataToken);
+ int cleanupManagedRingsOffset = FindCallInstructionOffset(il, cleanupManagedRingsMethod.MetadataToken);
+
+ Assert.True(freeProvidedBufferRingOffset >= 0, "Expected teardown method to call FreeIoUringProvidedBufferRing.");
+ Assert.True(cleanupManagedRingsOffset >= 0, "Expected teardown method to call CleanupManagedRings.");
+ Assert.True(
+ freeProvidedBufferRingOffset < cleanupManagedRingsOffset,
+ "Expected teardown to unregister/dispose provided buffers before ring unmap/close.");
+
+ return Task.CompletedTask;
+ }
+
+ private static async Task RunZeroCopySendStateScenarioAsync(bool expectedEnabledWhenSupported)
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] sendBuffer = new byte[64];
+ byte[] receiveBuffer = new byte[sendBuffer.Length];
+ Assert.Equal(sendBuffer.Length, await client.SendAsync(sendBuffer, SocketFlags.None));
+ await ReceiveExactlyAsync(server, receiveBuffer);
+
+ IoUringZeroCopySendSnapshot snapshot = GetIoUringZeroCopySendSnapshot();
+ if (!snapshot.HasIoUringPort)
+ {
+ return;
+ }
+
+ if (!snapshot.SupportsSendZc)
+ {
+ Assert.False(snapshot.ZeroCopySendEnabled);
+ return;
+ }
+
+ Assert.Equal(expectedEnabledWhenSupported, snapshot.ZeroCopySendEnabled);
+ }
+
+ private static async Task RunFixedRecvStateScenarioAsync(bool expectedEnabled)
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] sendBuffer = new byte[64];
+ byte[] receiveBuffer = new byte[sendBuffer.Length];
+ Assert.Equal(sendBuffer.Length, await client.SendAsync(sendBuffer, SocketFlags.None));
+ await ReceiveExactlyAsync(server, receiveBuffer);
+
+ IoUringFixedRecvSnapshot snapshot = GetIoUringFixedRecvSnapshot();
+ if (!snapshot.HasIoUringPort)
+ {
+ return;
+ }
+
+ Assert.Equal(expectedEnabled, snapshot.FixedRecvEnabled);
+ }
+
+ private static async Task RunFixedRecvActivationFollowsRuntimeCapabilitiesScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] sendBuffer = new byte[64];
+ byte[] receiveBuffer = new byte[sendBuffer.Length];
+ Assert.Equal(sendBuffer.Length, await client.SendAsync(sendBuffer, SocketFlags.None));
+ await ReceiveExactlyAsync(server, receiveBuffer);
+
+ IoUringFixedRecvSnapshot snapshot = GetIoUringFixedRecvSnapshot();
+ if (!snapshot.HasIoUringPort)
+ {
+ return;
+ }
+
+ Assert.Equal(snapshot.SupportsReadFixed && snapshot.HasRegisteredBuffers, snapshot.FixedRecvEnabled);
+ }
+
+ private static async Task RunFixedRecvDataCorrectnessScenarioAsync()
+ {
+ IoUringFixedRecvSnapshot snapshot = GetIoUringFixedRecvSnapshot();
+ if (!snapshot.HasIoUringPort || !snapshot.FixedRecvEnabled || !snapshot.SupportsReadFixed || !snapshot.HasRegisteredBuffers)
+ {
+ return;
+ }
+
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket listener = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+ _ = listener;
+
+ byte[] payload = new byte[32 * 1024];
+ for (int i = 0; i < payload.Length; i++)
+ {
+ payload[i] = unchecked((byte)(i * 13));
+ }
+
+ byte[] received = new byte[payload.Length];
+ Task receiveTask = ReceiveExactlyAsync(server, received);
+ Assert.Equal(payload.Length, await client.SendAsync(payload, SocketFlags.None));
+ await receiveTask;
+ Assert.Equal(payload, received);
+ }
+
+ private static async Task RunSqPollBasicSendReceiveScenarioAsync()
+ {
+ EnableSqPollAppContextOptIn();
+ await RunTcpRoundTripAsync(8);
+
+ IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
+ if (!snapshot.IsActive)
+ {
+ return;
+ }
+
+ await RunTcpRoundTripAsync(16);
+ }
+
+ private static async Task RunSqPollRequestedScenarioAsync()
+ {
+ EnableSqPollAppContextOptIn();
+ await RunTcpRoundTripAsync(8);
+
+ IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
+ // Some Helix legs can run without an active io_uring port (kernel/config/runtime gating).
+ // In that case this SQPOLL-request scenario is not applicable.
+ if (!snapshot.HasIoUringPort)
+ {
+ return;
+ }
+
+ if (!snapshot.SqPollEnabled)
+ {
+ // SQPOLL wasn't active on this leg, but socket operations must continue to succeed.
+ await RunTcpRoundTripAsync(16);
+ }
+ }
+
+ private static async Task RunSqPollWakeupAfterIdleScenarioAsync()
+ {
+ EnableSqPollAppContextOptIn();
+ await RunTcpRoundTripAsync(4);
+
+ IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
+ if (!snapshot.IsActive)
+ {
+ return;
+ }
+
+ ulong wakeupsBefore = GetIoUringTelemetryCounterValue("_ioUringSqPollWakeups");
+
+ // Let the kernel SQPOLL thread go idle and set SQ_NEED_WAKEUP.
+ bool observedNeedWakeup = false;
+ for (int i = 0; i < 25; i++)
+ {
+ await Task.Delay(100);
+ if (IsAnyIoUringSqPollEngineNeedingWakeup())
+ {
+ observedNeedWakeup = true;
+ break;
+ }
+ }
+
+ if (!observedNeedWakeup)
+ {
+ return;
+ }
+
+ await RunTcpRoundTripAsync(2);
+
+ ulong wakeupsAfter = GetIoUringTelemetryCounterValue("_ioUringSqPollWakeups");
+ Assert.True(
+ wakeupsAfter > wakeupsBefore,
+ $"Expected SQPOLL wakeups to increase after idle wake path. before={wakeupsBefore}, after={wakeupsAfter}");
+ }
+
+ private static async Task RunSqPollMultishotRecvScenarioAsync()
+ {
+ EnableSqPollAppContextOptIn();
+ await RunTcpRoundTripAsync(4);
+
+ IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
+ if (!snapshot.IsActive)
+ {
+ return;
+ }
+
+ await RunMultishotRecvBasicScenarioAsync(iterations: 32);
+ }
+
+ private static async Task RunSqPollZeroCopySendScenarioAsync()
+ {
+ EnableSqPollAppContextOptIn();
+ await RunTcpRoundTripAsync(4);
+
+ IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
+ if (!snapshot.IsActive)
+ {
+ return;
+ }
+
+ await RunZeroCopySendLargeBufferRoundTripScenarioAsync();
+ }
+
+ private static async Task RunSqPollTelemetryCountersScenarioAsync()
+ {
+ EnableSqPollAppContextOptIn();
+ await RunTcpRoundTripAsync(4);
+
+ IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
+ if (!snapshot.IsActive)
+ {
+ return;
+ }
+
+ ulong skippedBefore = GetIoUringTelemetryCounterValue("_ioUringSqPollSubmissionsSkipped");
+ ulong wakeupsBefore = GetIoUringTelemetryCounterValue("_ioUringSqPollWakeups");
+
+ await RunTcpRoundTripAsync(32);
+ ulong skippedAfterBurst = GetIoUringTelemetryCounterValue("_ioUringSqPollSubmissionsSkipped");
+ Assert.True(
+ skippedAfterBurst > skippedBefore,
+ $"Expected SQPOLL submission-skipped counter to increase. before={skippedBefore}, after={skippedAfterBurst}");
+
+ await Task.Delay(1500);
+ await RunTcpRoundTripAsync(2);
+
+ ulong wakeupsAfter = GetIoUringTelemetryCounterValue("_ioUringSqPollWakeups");
+ Assert.True(
+ wakeupsAfter >= wakeupsBefore,
+ $"Expected SQPOLL wakeup counter to be readable/nondecreasing. before={wakeupsBefore}, after={wakeupsAfter}");
+ }
+
+ private static async Task RunSqPollNeedWakeupContractScenarioAsync()
+ {
+ EnableSqPollAppContextOptIn();
+ await RunTcpRoundTripAsync(4);
+
+ IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
+ if (!snapshot.IsActive)
+ {
+ return;
+ }
+
+ Assert.True(
+ ValidateSqNeedWakeupMatchesRawSqFlagBit(),
+ "Expected at least one active SQPOLL io_uring engine for SqNeedWakeup contract validation.");
+ }
+
+ private static bool IsZeroCopySendEnabledAndSupported(out IoUringZeroCopySendSnapshot snapshot)
+ {
+ snapshot = GetIoUringZeroCopySendSnapshot();
+ return snapshot.HasIoUringPort && snapshot.SupportsSendZc && snapshot.ZeroCopySendEnabled;
+ }
+
+ private static bool IsZeroCopySendMessageEnabledAndSupported(out IoUringZeroCopySendSnapshot snapshot)
+ {
+ snapshot = GetIoUringZeroCopySendSnapshot();
+ return snapshot.HasIoUringPort && snapshot.SupportsSendMsgZc && snapshot.ZeroCopySendEnabled;
+ }
+
+ private static async Task RunZeroCopySendLargeBufferRoundTripScenarioAsync()
+ {
+ if (!IsZeroCopySendEnabledAndSupported(out _))
+ {
+ return;
+ }
+
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket listener = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+ _ = listener;
+
+ byte[] payload = new byte[64 * 1024];
+ byte[] received = new byte[payload.Length];
+ for (int i = 0; i < payload.Length; i++)
+ {
+ payload[i] = unchecked((byte)i);
+ }
+
+ Task receiveTask = ReceiveExactlyAsync(server, received);
+ int sent = await client.SendAsync(payload, SocketFlags.None);
+ Assert.Equal(payload.Length, sent);
+ await receiveTask;
+ Assert.Equal(payload, received);
+ }
+
+ private static async Task RunZeroCopySendSmallBufferUsesRegularSendScenarioAsync()
+ {
+ if (!IsZeroCopySendEnabledAndSupported(out _))
+ {
+ return;
+ }
+
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket listener = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+ _ = listener;
+
+ byte[] smallPayload = new byte[1024];
+ Exception? sendException = await Record.ExceptionAsync(async () => await client.SendAsync(smallPayload, SocketFlags.None));
+ AssertCanceledOrInterrupted(sendException);
+
+ byte[] verificationPayload = new byte[] { 0x5A };
+ byte[] verificationReceive = new byte[1];
+ Task verificationReceiveTask = ToTask(server.ReceiveAsync(verificationReceive, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(verificationPayload, SocketFlags.None));
+ Assert.Equal(1, await verificationReceiveTask);
+ Assert.Equal(verificationPayload[0], verificationReceive[0]);
+ }
+
+ private static async Task RunZeroCopySendNotifCqeReleasesPinHoldsScenarioAsync()
+ {
+ if (!IsZeroCopySendEnabledAndSupported(out _))
+ {
+ return;
+ }
+
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket listener = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+ _ = listener;
+
+ byte[] payload = new byte[128 * 1024];
+ byte[] received = new byte[payload.Length];
+ for (int i = 0; i < payload.Length; i++)
+ {
+ payload[i] = unchecked((byte)(i + 1));
+ }
+
+ const int iterations = 8;
+ for (int i = 0; i < iterations; i++)
+ {
+ Task receiveTask = ReceiveExactlyAsync(server, received);
+ int sent = await client.SendAsync(payload, SocketFlags.None);
+ Assert.Equal(payload.Length, sent);
+ await receiveTask;
+ Assert.Equal(payload, received);
+ }
+
+ IoUringZeroCopyPinHoldSnapshot releasedSnapshot = await WaitForZeroCopyPinHoldSnapshotAsync(
+ static snapshot => !snapshot.HasIoUringPort || (snapshot.ActivePinHolds == 0 && snapshot.PendingNotificationCount == 0));
+ if (!releasedSnapshot.HasIoUringPort)
+ {
+ return;
+ }
+
+ Assert.Equal(0, releasedSnapshot.ActivePinHolds);
+ Assert.Equal(0, releasedSnapshot.PendingNotificationCount);
+ }
+
+ private static async Task RunZeroCopySendPartialSendResubmissionScenarioAsync()
+ {
+ if (!IsZeroCopySendEnabledAndSupported(out _))
+ {
+ return;
+ }
+
+ await RunLargeSendWithBackpressureAsync(useBufferListSend: false);
+ }
+
+ private static async Task RunZeroCopySendCompletionPinLifetimeScenarioAsync()
+ {
+ if (!IsZeroCopySendEnabledAndSupported(out _))
+ {
+ return;
+ }
+
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket listener = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+ _ = listener;
+
+ byte[] payload = new byte[96 * 1024];
+ for (int i = 0; i < payload.Length; i++)
+ {
+ payload[i] = unchecked((byte)(i + 3));
+ }
+
+ using var trackingMemory = new TrackingPinnableMemoryManager(payload);
+ byte[] received = new byte[payload.Length];
+ Task receiveTask = ReceiveExactlyAsync(server, received);
+ int sent = await client.SendAsync(trackingMemory.Memory, SocketFlags.None);
+ Assert.Equal(payload.Length, sent);
+ await receiveTask;
+ await AssertPinsReleasedAsync(trackingMemory);
+ Assert.Equal(payload, received);
+ }
+
+ private static async Task RunZeroCopySendUnsupportedOpcodeFallbackScenarioAsync()
+ {
+ Assembly socketsAssembly = typeof(Socket).Assembly;
+ Type engineType = socketsAssembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true)!;
+ FieldInfo enginesField = engineType.GetField("s_engines", BindingFlags.NonPublic | BindingFlags.Static)!;
+ PropertyInfo isIoUringEnabledProperty = engineType.GetProperty("IsIoUringCompletionModeEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!;
+ FieldInfo supportsSendZcField = engineType.GetField("_supportsOpSendZc", BindingFlags.NonPublic | BindingFlags.Instance)!;
+ FieldInfo zeroCopySendEnabledField = engineType.GetField("_zeroCopySendEnabled", BindingFlags.NonPublic | BindingFlags.Instance)!;
+
+ var overrides = new List<(object Engine, bool SupportsSendZc, bool ZeroCopyEnabled)>();
+ bool hasIoUringPort = false;
+ foreach (object? engine in (Array)enginesField.GetValue(null)!)
+ {
+ if (engine is null || !(bool)isIoUringEnabledProperty.GetValue(engine)!)
+ {
+ continue;
+ }
+
+ hasIoUringPort = true;
+ bool supports = (bool)supportsSendZcField.GetValue(engine)!;
+ bool enabled = (bool)zeroCopySendEnabledField.GetValue(engine)!;
+ overrides.Add((engine, supports, enabled));
+ supportsSendZcField.SetValue(engine, false);
+ zeroCopySendEnabledField.SetValue(engine, false);
+ }
+
+ if (!hasIoUringPort)
+ {
+ return;
+ }
+
+ try
+ {
+ IoUringZeroCopySendSnapshot snapshot = GetIoUringZeroCopySendSnapshot();
+ Assert.False(snapshot.SupportsSendZc);
+ Assert.False(snapshot.ZeroCopySendEnabled);
+
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] payload = new byte[64 * 1024];
+ byte[] received = new byte[payload.Length];
+ Task receiveTask = ReceiveExactlyAsync(server, received);
+ int sent = await client.SendAsync(payload, SocketFlags.None);
+ Assert.Equal(payload.Length, sent);
+ await receiveTask;
+ Assert.Equal(payload, received);
+ }
+ finally
+ {
+ foreach ((object engine, bool supports, bool enabled) in overrides)
+ {
+ supportsSendZcField.SetValue(engine, supports);
+ zeroCopySendEnabledField.SetValue(engine, enabled);
+ }
+ }
+ }
+
+ private static async Task RunZeroCopySendBufferListSegmentThresholdScenarioAsync()
+ {
+ if (!IsZeroCopySendMessageEnabledAndSupported(out _))
+ {
+ return;
+ }
+
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket listener = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+ _ = listener;
+
+ const int segmentCount = 8;
+ const int segmentSize = 4 * 1024;
+ int payloadLength = segmentCount * segmentSize;
+ byte[] payload = new byte[payloadLength];
+ for (int i = 0; i < payload.Length; i++)
+ {
+ payload[i] = unchecked((byte)(i + 17));
+ }
+
+ var sendBuffers = new List>(segmentCount);
+ for (int i = 0; i < segmentCount; i++)
+ {
+ sendBuffers.Add(new ArraySegment(payload, i * segmentSize, segmentSize));
+ }
+
+ byte[] received = new byte[payload.Length];
+ Task receiveTask = ReceiveExactlyAsync(server, received);
+ int sent = await client.SendAsync(sendBuffers, SocketFlags.None);
+ Assert.Equal(payload.Length, sent);
+ await receiveTask;
+ Assert.Equal(payload, received);
+ }
+
+ private static async Task RunZeroCopySendToAboveThresholdScenarioAsync()
+ {
+ if (!IsZeroCopySendMessageEnabledAndSupported(out _))
+ {
+ return;
+ }
+
+ using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+ using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ sender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+ byte[] payload = new byte[20 * 1024];
+ for (int i = 0; i < payload.Length; i++)
+ {
+ payload[i] = unchecked((byte)(i + 23));
+ }
+
+ byte[] receiveBuffer = new byte[payload.Length];
+ Task receiveTask =
+ ToTask(receiver.ReceiveFromAsync(receiveBuffer, SocketFlags.None, new IPEndPoint(IPAddress.Any, 0)));
+ await Task.Yield();
+
+ int sent = await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!);
+ Assert.Equal(payload.Length, sent);
+
+ SocketReceiveFromResult receiveResult = await receiveTask;
+ Assert.Equal(payload.Length, receiveResult.ReceivedBytes);
+ Assert.Equal(payload, receiveBuffer);
+ Assert.Equal(sender.LocalEndPoint, receiveResult.RemoteEndPoint);
+ }
+
+ private static async Task RunMultishotRecvBasicScenarioAsync(int iterations)
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ if (!IsIoUringMultishotRecvSupported())
+ {
+ return;
+ }
+
+ ulong reuseBefore = GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvReuse");
+ ulong asyncCancelBefore = GetIoUringTelemetryCounterValue("_ioUringAsyncCancelRequestCqes");
+ byte[] receiveBuffer = new byte[1];
+ byte[] payload = new byte[1];
+ for (int i = 0; i < iterations; i++)
+ {
+ Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+
+ payload[0] = unchecked((byte)(i + 1));
+ Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
+ Assert.Equal(1, await receiveTask);
+ Assert.Equal(payload[0], receiveBuffer[0]);
+ }
+
+ Assert.True(
+ await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: true),
+ "Expected persistent multishot recv to remain armed after repeated ReceiveAsync calls.");
+ Assert.True(
+ GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvReuse") > reuseBefore,
+ "Expected ReceiveAsync calls to reuse an armed multishot recv (TryReplace path).");
+ Assert.Equal(
+ asyncCancelBefore,
+ GetIoUringTelemetryCounterValue("_ioUringAsyncCancelRequestCqes"));
+ }
+
+ private static async Task RunMultishotRecvCancellationScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket listener = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+ _ = listener;
+ _ = client;
+
+ if (!IsIoUringMultishotRecvSupported())
+ {
+ return;
+ }
+
+ ulong terminationBefore = GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination");
+ byte[] receiveBuffer = new byte[16];
+ using var cts = new CancellationTokenSource();
+ Task pendingReceive = ToTask(server.ReceiveAsync(receiveBuffer.AsMemory(), SocketFlags.None, cts.Token));
+ await Task.Yield();
+ Assert.True(
+ await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: true),
+ "Expected persistent multishot recv to arm before cancellation.");
+
+ cts.Cancel();
+ Task completed = await Task.WhenAny(pendingReceive, Task.Delay(TimeSpan.FromSeconds(15)));
+ Assert.Same(pendingReceive, completed);
+ Exception? ex = await Record.ExceptionAsync(async () => await pendingReceive);
+ AssertCanceledOrInterrupted(ex);
+ Assert.True(
+ await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: false),
+ "Expected persistent multishot recv to disarm after cancellation.");
+ Assert.True(
+ GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination") > terminationBefore,
+ "Expected cancellation to produce a terminal persistent multishot recv completion.");
+ }
+
+ private static async Task RunMultishotRecvPeerCloseScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ if (!IsIoUringMultishotRecvSupported())
+ {
+ return;
+ }
+
+ ulong terminationBefore = GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination");
+ byte[] receiveBuffer = new byte[8];
+ Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+
+ client.Shutdown(SocketShutdown.Both);
+ client.Dispose();
+
+ Task completed = await Task.WhenAny(receiveTask, Task.Delay(TimeSpan.FromSeconds(15)));
+ Assert.Same(receiveTask, completed);
+
+ Exception? ex = await Record.ExceptionAsync(async () => await receiveTask);
+ if (ex is null)
+ {
+ Assert.Equal(0, await receiveTask);
+ }
+ else
+ {
+ SocketException socketException = Assert.IsType(ex);
+ Assert.True(
+ socketException.SocketErrorCode == SocketError.ConnectionReset ||
+ socketException.SocketErrorCode == SocketError.OperationAborted ||
+ socketException.SocketErrorCode == SocketError.Interrupted,
+ $"Unexpected socket error after multishot peer close: {socketException.SocketErrorCode}");
+ }
+
+ Assert.True(
+ await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: false),
+ "Expected persistent multishot recv to disarm after terminal peer-close completion.");
+ Assert.True(
+ GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination") > terminationBefore,
+ "Expected terminal completion to increment persistent multishot recv termination telemetry.");
+ }
+
+ private static async Task RunPersistentMultishotRecvProvidedBufferExhaustionScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ if (!IsIoUringMultishotRecvSupported())
+ {
+ return;
+ }
+
+ byte[] receiveBuffer = new byte[1];
+ Task armReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0xC3 }, SocketFlags.None));
+ Assert.Equal(1, await armReceive);
+ Assert.True(
+ await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: true),
+ "Expected persistent multishot recv to arm before forced provided-buffer exhaustion.");
+
+ ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");
+ ulong terminationBefore = GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination");
+
+ Assert.True(TryForceIoUringProvidedBufferRingExhaustionForTest(out int forcedBufferCount));
+ Assert.True(forcedBufferCount > 0);
+
+ Task exhaustedReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0xC4 }, SocketFlags.None));
+ Task exhaustedCompleted = await Task.WhenAny(exhaustedReceive, Task.Delay(TimeSpan.FromSeconds(15)));
+ Assert.Same(exhaustedReceive, exhaustedCompleted);
+
+ Exception? exhaustedException = await Record.ExceptionAsync(async () => await exhaustedReceive);
+ SocketException exhaustedSocketException = Assert.IsType(exhaustedException);
+ Assert.Equal(SocketError.NoBufferSpaceAvailable, exhaustedSocketException.SocketErrorCode);
+ Assert.True(
+ await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: false),
+ "Expected persistent multishot recv to disarm after ENOBUFS terminal completion.");
+ Assert.True(
+ GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions") > depletionBefore,
+ "Expected provided-buffer depletion counter to increase after forced exhaustion.");
+ Assert.True(
+ GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination") > terminationBefore,
+ "Expected persistent multishot recv termination counter to increase after ENOBUFS.");
+
+ Assert.True(TryRecycleForcedIoUringProvidedBufferRingForTest(out int recycledBufferCount));
+ Assert.True(recycledBufferCount > 0, "Expected forced checked-out provided buffers to be recycled for recovery.");
+
+ Task recoveredReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0xC5 }, SocketFlags.None));
+ Assert.Equal(1, await recoveredReceive);
+ Assert.Equal(0xC5, receiveBuffer[0]);
+ Assert.True(
+ await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: true),
+ "Expected persistent multishot recv to re-arm after provided buffers were recycled.");
+ }
+
+ private static async Task RunPersistentMultishotRecvShapeChangeScenarioAsync()
+ {
+ if (!IsIoUringMultishotRecvSupported())
+ {
+ return;
+ }
+
+ using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ sender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ receiver.Connect(sender.LocalEndPoint!);
+ sender.Connect(receiver.LocalEndPoint!);
+
+ byte[] receiveBuffer = new byte[1];
+ Task armReceive = ToTask(receiver.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await sender.SendAsync(new byte[] { 0xD1 }, SocketFlags.None));
+ Assert.Equal(1, await armReceive);
+ Assert.True(
+ await WaitForPersistentMultishotRecvArmedStateAsync(receiver, expectedArmed: true),
+ "Expected persistent multishot recv to arm before shape-change scenario.");
+
+ ulong terminationBefore = GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination");
+
+ byte[] receiveFromBuffer = new byte[1];
+ Task receiveFromTask = ToTask(
+ receiver.ReceiveFromAsync(receiveFromBuffer, SocketFlags.None, new IPEndPoint(IPAddress.Any, 0)));
+ await Task.Yield();
+ Assert.Equal(1, await sender.SendAsync(new byte[] { 0xD2 }, SocketFlags.None));
+ SocketReceiveFromResult receiveFromResult = await receiveFromTask;
+ Assert.Equal(1, receiveFromResult.ReceivedBytes);
+ Assert.Equal(0xD2, receiveFromBuffer[0]);
+
+ Assert.True(
+ await WaitForPersistentMultishotRecvArmedStateAsync(receiver, expectedArmed: false),
+ "Expected persistent multishot recv to disarm when receive shape switches to ReceiveFromAsync.");
+ Assert.True(
+ GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination") > terminationBefore,
+ "Expected shape-change cancellation to increment persistent multishot recv terminations.");
+
+ Task rearmReceive = ToTask(receiver.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await sender.SendAsync(new byte[] { 0xD3 }, SocketFlags.None));
+ Assert.Equal(1, await rearmReceive);
+ Assert.Equal(0xD3, receiveBuffer[0]);
+ Assert.True(
+ await WaitForPersistentMultishotRecvArmedStateAsync(receiver, expectedArmed: true),
+ "Expected persistent multishot recv to re-arm after shape-change operation completed.");
+ }
+
+ private static async Task RunPersistentMultishotRecvConcurrentCloseRaceScenarioAsync(int iterations)
+ {
+ using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ listener.Listen(Math.Max(4, iterations));
+ IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+ if (!IsIoUringMultishotRecvSupported())
+ {
+ return;
+ }
+
+ for (int i = 0; i < iterations; i++)
+ {
+ var pair = await AcceptConnectedTcpPairAsync(listener, endpoint);
+ using Socket client = pair.Client;
+ using Socket server = pair.Server;
+
+ byte[] armBuffer = new byte[1];
+ Task armReceive = ToTask(server.ReceiveAsync(armBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0xE1 }, SocketFlags.None));
+ Assert.Equal(1, await armReceive);
+
+ Assert.True(
+ await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: true),
+ "Expected persistent multishot recv to arm before concurrent close race.");
+
+ Task pendingReceive = ToTask(server.ReceiveAsync(new byte[1], SocketFlags.None));
+ await Task.Yield();
+
+ _ = Task.Run(() =>
+ {
+ server.Dispose();
+ client.Dispose();
+ });
+
+ Task completed = await Task.WhenAny(pendingReceive, Task.Delay(TimeSpan.FromSeconds(15)));
+ Assert.Same(pendingReceive, completed);
+
+ Exception? ex = await Record.ExceptionAsync(async () => await pendingReceive);
+ if (ex is SocketException socketException)
+ {
+ Assert.True(
+ socketException.SocketErrorCode == SocketError.ConnectionReset ||
+ socketException.SocketErrorCode == SocketError.OperationAborted ||
+ socketException.SocketErrorCode == SocketError.Interrupted,
+ $"Unexpected socket error from persistent multishot recv close race: {socketException.SocketErrorCode}");
+ }
+ else if (ex is not ObjectDisposedException and not null)
+ {
+ throw ex;
+ }
+ }
+ }
+
+ private static async Task RunNetworkStreamReadAsyncCancellationTokenScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+ using var networkStream = new NetworkStream(server, ownsSocket: false);
+
+ byte[] readBuffer = new byte[1];
+ using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(15));
+ ValueTask readTask = networkStream.ReadAsync(readBuffer, cts.Token);
+ await Task.Yield();
+
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0xF1 }, SocketFlags.None));
+ Assert.Equal(1, await readTask);
+ Assert.Equal(0xF1, readBuffer[0]);
+ }
+
+ private static async Task RunReceiveAsyncSocketAsyncEventArgsBufferListScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] receiveBuffer = new byte[1];
+ using var receiveEventArgs = new SocketAsyncEventArgs
+ {
+ BufferList = new List>
+ {
+ new ArraySegment(receiveBuffer)
+ }
+ };
+
+ Task receiveTask = StartSocketAsyncEventArgsOperation(
+ server,
+ receiveEventArgs,
+ static (s, args) => s.ReceiveAsync(args));
+ await Task.Yield();
+
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0xF2 }, SocketFlags.None));
+ SocketAsyncEventArgs completedReceive = await receiveTask;
+ Assert.Equal(SocketError.Success, completedReceive.SocketError);
+ Assert.Equal(1, completedReceive.BytesTransferred);
+ Assert.Equal(0xF2, receiveBuffer[0]);
+ Assert.False(
+ IsPersistentMultishotRecvArmed(server),
+ "SAEA BufferList receive path should not arm persistent multishot recv state.");
+ }
+
+ private static async Task RunMultishotAcceptBasicScenarioAsync(int connectionCount)
+ {
+ using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ listener.Listen(connectionCount);
+ IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+ if (!IsIoUringMultishotAcceptSupported())
+ {
+ return;
+ }
+
+ Task firstAcceptTask = listener.AcceptAsync();
+ Assert.True(
+ await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: true),
+ "Multishot accept was not armed while first accept was pending.");
+
+ using (Socket firstClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp))
+ {
+ await firstClient.ConnectAsync(endpoint);
+ using Socket firstServer = await AwaitWithTimeoutAsync(firstAcceptTask, "first multishot accept");
+ await AssertConnectedPairRoundTripAsync(firstClient, firstServer, 0x41);
+ }
+
+ for (int i = 1; i < connectionCount; i++)
+ {
+ (Socket clientSocket, Socket serverSocket) = await AcceptConnectedTcpPairAsync(listener, endpoint);
+ using Socket client = clientSocket;
+ using Socket server = serverSocket;
+ await AssertConnectedPairRoundTripAsync(client, server, unchecked((byte)(0x41 + i)));
+ }
+ }
+
+ /// <summary>
+ /// Arms multishot accept once, then connects a burst of clients before issuing more
+ /// AcceptAsync calls, so completed accepts must be pre-queued by the event loop.
+ /// </summary>
+ private static async Task RunMultishotAcceptPrequeueScenarioAsync(int prequeuedConnectionCount)
+ {
+     using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+     listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+     listener.Listen(prequeuedConnectionCount + 2);
+     IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+     if (!IsIoUringMultishotAcceptSupported())
+     {
+         return;
+     }
+
+     // Arm multishot accept once, then connect a burst of clients before issuing
+     // subsequent AcceptAsync calls to create a pre-queue opportunity.
+     Task<Socket> armingAcceptTask = listener.AcceptAsync();
+     Assert.True(
+         await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: true),
+         "Multishot accept was not armed while arming accept was pending.");
+
+     var connectedClients = new List<Socket>(prequeuedConnectionCount + 1);
+     try
+     {
+         var connectTasks = new List<Task>(prequeuedConnectionCount + 1);
+         for (int i = 0; i < prequeuedConnectionCount + 1; i++)
+         {
+             var client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+             connectedClients.Add(client);
+             connectTasks.Add(client.ConnectAsync(endpoint));
+         }
+
+         await Task.WhenAll(connectTasks);
+         using Socket armingServer = await AwaitWithTimeoutAsync(armingAcceptTask, "arming multishot accept");
+
+         // Poll until at least one pre-accepted connection shows up in the queue.
+         DateTime deadline = DateTime.UtcNow + TimeSpan.FromSeconds(5);
+         int queueCount = 0;
+         while (DateTime.UtcNow < deadline)
+         {
+             queueCount = GetListenerMultishotAcceptQueueCount(listener);
+             if (queueCount > 0)
+             {
+                 break;
+             }
+
+             await Task.Delay(25);
+         }
+
+         Assert.True(queueCount > 0, "Expected at least one pre-accepted connection to be queued.");
+
+         // Each subsequent accept should be satisfied from the pre-queued completions.
+         for (int i = 0; i < prequeuedConnectionCount; i++)
+         {
+             using Socket _ = await AwaitWithTimeoutAsync(listener.AcceptAsync(), "prequeued accept completion");
+         }
+     }
+     finally
+     {
+         foreach (Socket client in connectedClients)
+         {
+             client.Dispose();
+         }
+     }
+ }
+
+ /// <summary>
+ /// Verifies that disposing a listener with an armed multishot accept and a pending
+ /// AcceptAsync fails the pending accept and fully tears down the multishot state.
+ /// </summary>
+ private static async Task RunMultishotAcceptListenerCloseScenarioAsync()
+ {
+     using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+     listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+     listener.Listen(4);
+     IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+     if (!IsIoUringMultishotAcceptSupported())
+     {
+         return;
+     }
+
+     Task<Socket> firstAcceptTask = listener.AcceptAsync();
+     Assert.True(
+         await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: true),
+         "Multishot accept was not armed while first accept was pending.");
+
+     using (Socket firstClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp))
+     {
+         await firstClient.ConnectAsync(endpoint);
+         using Socket firstServer = await AwaitWithTimeoutAsync(firstAcceptTask, "first accept before listener close");
+         await AssertConnectedPairRoundTripAsync(firstClient, firstServer, 0x71);
+     }
+
+     // Dispose the listener while a second accept is in flight.
+     Task<Socket> pendingAccept = listener.AcceptAsync();
+     await Task.Yield();
+     listener.Dispose();
+
+     Task completed = await Task.WhenAny(pendingAccept, Task.Delay(TimeSpan.FromSeconds(15)));
+     Assert.Same(pendingAccept, completed);
+
+     Exception? acceptException = await Record.ExceptionAsync(async () => await pendingAccept);
+     Assert.NotNull(acceptException);
+     Assert.True(
+         acceptException is ObjectDisposedException ||
+         acceptException is SocketException,
+         $"Unexpected pending-accept exception after listener close: {acceptException}");
+
+     // After close, no pre-accepted sockets may linger and multishot must be disarmed.
+     Assert.Equal(0, GetListenerMultishotAcceptQueueCount(listener));
+     Assert.False(IsListenerMultishotAcceptArmed(listener));
+ }
+
+ /// <summary>
+ /// Races listener disposal against the accept that arms multishot, repeatedly,
+ /// to flush out teardown races during arming. No-op when multishot is unavailable.
+ /// </summary>
+ private static async Task RunMultishotAcceptDisposeDuringArmingRaceScenarioAsync(int iterations)
+ {
+     if (!IsIoUringMultishotAcceptSupported())
+     {
+         return;
+     }
+
+     for (int i = 0; i < iterations; i++)
+     {
+         using Socket racingListener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+         racingListener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+         racingListener.Listen(1);
+
+         // Kick off the arming accept and dispose concurrently from the thread pool.
+         Task accept = racingListener.AcceptAsync();
+         Task concurrentDispose = Task.Run(racingListener.Dispose);
+
+         Task winner = await Task.WhenAny(accept, Task.Delay(TimeSpan.FromSeconds(15)));
+         Assert.Same(accept, winner);
+         await concurrentDispose;
+
+         // No client ever connects, so the accept must observe the disposal.
+         Exception? acceptException = await Record.ExceptionAsync(async () => await accept);
+         Assert.NotNull(acceptException);
+         Assert.True(
+             acceptException is ObjectDisposedException || acceptException is SocketException,
+             $"Unexpected accept exception during dispose/arm race at iteration {i}: {acceptException}");
+     }
+ }
+
+ /// <summary>
+ /// On kernels without multishot accept, verifies the listener stays in one-shot
+ /// accept mode and a plain accept plus round-trip still works.
+ /// </summary>
+ private static async Task RunMultishotAcceptUnavailableOneShotScenarioAsync()
+ {
+     // This scenario only applies when multishot accept is NOT supported.
+     if (IsIoUringMultishotAcceptSupported())
+     {
+         return;
+     }
+
+     using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+     listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+     listener.Listen(2);
+     IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+     Task<Socket> acceptTask = listener.AcceptAsync();
+     await Task.Yield();
+     Assert.False(IsListenerMultishotAcceptArmed(listener), "Listener should remain in one-shot accept mode.");
+
+     using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+     await client.ConnectAsync(endpoint);
+     using Socket server = await AwaitWithTimeoutAsync(acceptTask, "one-shot accept fallback");
+     await AssertConnectedPairRoundTripAsync(client, server, 0x7A);
+ }
+
+ /// <summary>
+ /// Forces a terminal multishot-accept CQE (the first accept fails), verifies the
+ /// multishot state disarms, then confirms a later accept re-arms and succeeds.
+ /// </summary>
+ private static async Task RunMultishotAcceptRearmAfterTerminalCqeScenarioAsync()
+ {
+     using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+     listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+     listener.Listen(4);
+     IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+     if (!IsIoUringMultishotAcceptSupported())
+     {
+         return;
+     }
+
+     Task<Socket> firstAcceptTask = listener.AcceptAsync();
+     Assert.True(
+         await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: true),
+         "Multishot accept was not armed before forced terminal CQE.");
+
+     using (Socket firstClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp))
+     {
+         await firstClient.ConnectAsync(endpoint);
+         // The harness forces this accept to fail, producing a terminal CQE.
+         Exception? firstAcceptException = await Record.ExceptionAsync(async () => await firstAcceptTask);
+         Assert.NotNull(firstAcceptException);
+         Assert.True(
+             firstAcceptException is SocketException ||
+             firstAcceptException is ObjectDisposedException,
+             $"Unexpected forced-accept exception type: {firstAcceptException}");
+     }
+
+     Assert.True(
+         await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: false),
+         "Expected multishot accept to disarm after terminal CQE.");
+
+     // A new accept must transparently re-arm multishot and complete normally.
+     Task<Socket> secondAcceptTask = listener.AcceptAsync();
+     Assert.True(
+         await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: true),
+         "Expected multishot accept to re-arm on subsequent accept.");
+
+     using Socket secondClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+     await secondClient.ConnectAsync(endpoint);
+     using Socket secondServer = await AwaitWithTimeoutAsync(secondAcceptTask, "re-armed multishot accept");
+     await AssertConnectedPairRoundTripAsync(secondClient, secondServer, 0x33);
+ }
+
+ /// <summary>
+ /// Issues many accepts up front, then connects a burst of clients and verifies every
+ /// accepted pair round-trips its marker byte under high connection rate.
+ /// </summary>
+ private static async Task RunMultishotAcceptHighConnectionRateScenarioAsync(int connectionCount)
+ {
+     using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+     listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+     listener.Listen(connectionCount);
+     IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+     if (!IsIoUringMultishotAcceptSupported())
+     {
+         return;
+     }
+
+     var acceptTasks = new Task<Socket>[connectionCount];
+     var clients = new Socket?[connectionCount];
+     var connectTasks = new Task[connectionCount];
+
+     // Queue all accepts before any client connects.
+     for (int i = 0; i < connectionCount; i++)
+     {
+         acceptTasks[i] = listener.AcceptAsync();
+     }
+
+     try
+     {
+         for (int i = 0; i < connectionCount; i++)
+         {
+             clients[i] = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+             connectTasks[i] = clients[i]!.ConnectAsync(endpoint);
+         }
+
+         await Task.WhenAll(connectTasks);
+         Socket[] servers = await Task.WhenAll(acceptTasks);
+
+         try
+         {
+             // Verify all pairs concurrently; each carries a per-index marker byte.
+             var verificationTasks = new List<Task>(connectionCount);
+             for (int i = 0; i < connectionCount; i++)
+             {
+                 Socket client = Assert.IsType<Socket>(clients[i]);
+                 Socket server = servers[i];
+                 byte marker = unchecked((byte)i);
+                 verificationTasks.Add(AssertConnectedPairRoundTripAsync(client, server, marker));
+             }
+
+             await Task.WhenAll(verificationTasks);
+         }
+         finally
+         {
+             foreach (Socket server in servers)
+             {
+                 server.Dispose();
+             }
+         }
+     }
+     finally
+     {
+         foreach (Socket? client in clients)
+         {
+             client?.Dispose();
+         }
+     }
+ }
+
+ /// <summary>
+ /// Sends a 2 MiB payload through tiny (1 KiB) socket buffers so the send experiences
+ /// sustained backpressure, then verifies the receiver sees the exact byte sequence.
+ /// </summary>
+ /// <param name="useBufferListSend">True to send via a buffer list (scatter/gather); false for a single buffer.</param>
+ private static async Task RunLargeSendWithBackpressureAsync(bool useBufferListSend)
+ {
+     var trio = await CreateConnectedTcpSocketTrioAsync();
+     using Socket _ = trio.Listener;
+     using Socket client = trio.Client;
+     using Socket server = trio.Server;
+
+     // Small kernel buffers guarantee the 2 MiB send cannot complete eagerly.
+     client.SendBufferSize = 1024;
+     server.ReceiveBufferSize = 1024;
+
+     const int PayloadLength = 2 * 1024 * 1024;
+     byte[] payload = new byte[PayloadLength];
+     for (int i = 0; i < payload.Length; i++)
+     {
+         payload[i] = (byte)i;
+     }
+
+     Task<int> sendTask;
+     if (useBufferListSend)
+     {
+         const int SegmentSize = 1024;
+         var sendBuffers = new List<ArraySegment<byte>>();
+         for (int offset = 0; offset < payload.Length; offset += SegmentSize)
+         {
+             int count = Math.Min(SegmentSize, payload.Length - offset);
+             sendBuffers.Add(new ArraySegment<byte>(payload, offset, count));
+         }
+
+         sendTask = ToTask(client.SendAsync(sendBuffers, SocketFlags.None));
+     }
+     else
+     {
+         sendTask = ToTask(client.SendAsync(payload, SocketFlags.None));
+     }
+
+     // Give the send a moment to fill the kernel buffers and stall.
+     await Task.Delay(20);
+
+     byte[] received = new byte[payload.Length];
+     int totalReceived = 0;
+     while (totalReceived < payload.Length)
+     {
+         int receivedNow = await ToTask(server.ReceiveAsync(received.AsMemory(totalReceived), SocketFlags.None));
+         Assert.True(receivedNow > 0);
+         totalReceived += receivedNow;
+         // Periodically yield so the sender gets scheduled under backpressure.
+         if ((totalReceived & 0x3FFF) == 0)
+         {
+             await Task.Delay(1);
+         }
+     }
+
+     Assert.Equal(payload.Length, await sendTask);
+     Assert.Equal(payload.Length, totalReceived);
+     Assert.Equal(payload, received);
+ }
+
+ /// <summary>
+ /// Repeatedly cancels a receive on one connection while completing a receive on a
+ /// second connection, asserting cancellation never disturbs the unrelated operation
+ /// and that native async-cancel CQE accounting stays self-consistent.
+ /// </summary>
+ private static async Task RunAsyncCancelRequestIsolationScenarioAsync(int iterations)
+ {
+     await WithIoUringNativeDiagnosticsSnapshotDeltaAsync(
+         async () =>
+         {
+             using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+             listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+             listener.Listen(2);
+             IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+             var cancelPair = await AcceptConnectedTcpPairAsync(listener, endpoint);
+             using Socket cancelClient = cancelPair.Client;
+             using Socket cancelServer = cancelPair.Server;
+
+             var activePair = await AcceptConnectedTcpPairAsync(listener, endpoint);
+             using Socket activeClient = activePair.Client;
+             using Socket activeServer = activePair.Server;
+
+             byte[] cancelBuffer = new byte[1];
+             byte[] activeBuffer = new byte[1];
+             for (int i = 0; i < iterations; i++)
+             {
+                 using var cts = new CancellationTokenSource();
+                 Task<int> canceledReceive = ToTask(cancelServer.ReceiveAsync(cancelBuffer, SocketFlags.None, cts.Token));
+                 Task<int> activeReceive = ToTask(activeServer.ReceiveAsync(activeBuffer, SocketFlags.None));
+                 await Task.Yield();
+
+                 cts.Cancel();
+                 byte expected = unchecked((byte)(i + 1));
+                 Assert.Equal(1, await activeClient.SendAsync(new byte[] { expected }, SocketFlags.None));
+
+                 // The unrelated receive must complete normally despite the cancellation.
+                 Assert.Equal(1, await activeReceive);
+                 Assert.Equal(expected, activeBuffer[0]);
+
+                 Exception? cancelException = await Record.ExceptionAsync(async () => await canceledReceive);
+                 AssertCanceledOrInterrupted(cancelException);
+             }
+         },
+         (diagnosticsBefore, diagnosticsAfter) =>
+         {
+             ulong asyncCancelRequestCqeDelta = CounterDelta(
+                 diagnosticsBefore.AsyncCancelRequestCqeCount,
+                 diagnosticsAfter.AsyncCancelRequestCqeCount);
+             if (asyncCancelRequestCqeDelta == 0)
+             {
+                 return;
+             }
+
+             ulong asyncCancelRequestCqeEnoentDelta = CounterDelta(
+                 diagnosticsBefore.AsyncCancelRequestCqeEnoentCount,
+                 diagnosticsAfter.AsyncCancelRequestCqeEnoentCount);
+             ulong asyncCancelRequestCqeEalreadyDelta = CounterDelta(
+                 diagnosticsBefore.AsyncCancelRequestCqeEalreadyCount,
+                 diagnosticsAfter.AsyncCancelRequestCqeEalreadyCount);
+             ulong asyncCancelRequestCqeOtherDelta = CounterDelta(
+                 diagnosticsBefore.AsyncCancelRequestCqeOtherCount,
+                 diagnosticsAfter.AsyncCancelRequestCqeOtherCount);
+
+             // Per-result buckets can never exceed the total cancel-request CQE count.
+             Assert.True(
+                 asyncCancelRequestCqeEnoentDelta + asyncCancelRequestCqeEalreadyDelta + asyncCancelRequestCqeOtherDelta <= asyncCancelRequestCqeDelta,
+                 $"Unexpected async-cancel accounting for isolation scenario: enoent_delta={asyncCancelRequestCqeEnoentDelta}, ealready_delta={asyncCancelRequestCqeEalreadyDelta}, other_delta={asyncCancelRequestCqeOtherDelta}, total_delta={asyncCancelRequestCqeDelta}");
+         },
+         settleDelayMilliseconds: 200);
+ }
+
+ /// <summary>
+ /// Covers the two abrupt termination paths for a pending ReceiveMessageFrom on UDP:
+ /// token cancellation (Task-based API) and socket disposal (SocketAsyncEventArgs,
+ /// with a two-segment buffer list).
+ /// </summary>
+ private static async Task RunReceiveMessageFromCancellationAndDisposeScenariosAsync()
+ {
+     using Socket cancelReceiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+     cancelReceiver.SetSocketOption(SocketOptionLevel.IP, SocketOptionName.PacketInformation, true);
+     cancelReceiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+     EndPoint cancelRemoteEndPoint = new IPEndPoint(IPAddress.Any, 0);
+     using var cts = new CancellationTokenSource();
+     // Nothing is ever sent, so this receive can only end via the token.
+     Task<SocketReceiveMessageFromResult> canceledReceive = ToTask(
+         cancelReceiver.ReceiveMessageFromAsync(new byte[64], SocketFlags.None, cancelRemoteEndPoint, cts.Token));
+     await Task.Yield();
+     cts.Cancel();
+
+     Task cancelCompleted = await Task.WhenAny(canceledReceive, Task.Delay(TimeSpan.FromSeconds(15)));
+     Assert.Same(canceledReceive, cancelCompleted);
+     Exception? cancelException = await Record.ExceptionAsync(async () => await canceledReceive);
+     AssertCanceledOrInterrupted(cancelException);
+
+     using Socket disposeReceiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+     disposeReceiver.SetSocketOption(SocketOptionLevel.IP, SocketOptionName.PacketInformation, true);
+     disposeReceiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+     // Use a split buffer list to exercise the scatter/gather receive path.
+     byte[] receiveBuffer = new byte[32];
+     using var receiveEventArgs = new SocketAsyncEventArgs
+     {
+         BufferList = new List<ArraySegment<byte>>
+         {
+             new ArraySegment<byte>(receiveBuffer, 0, 16),
+             new ArraySegment<byte>(receiveBuffer, 16, 16)
+         },
+         RemoteEndPoint = new IPEndPoint(IPAddress.Any, 0)
+     };
+
+     Task<SocketAsyncEventArgs> pendingReceive = StartReceiveMessageFromAsync(disposeReceiver, receiveEventArgs);
+     await Task.Yield();
+     disposeReceiver.Dispose();
+
+     Task disposeCompleted = await Task.WhenAny(pendingReceive, Task.Delay(TimeSpan.FromSeconds(15)));
+     Assert.Same(pendingReceive, disposeCompleted);
+     SocketAsyncEventArgs completedArgs = await pendingReceive;
+     Assert.True(
+         completedArgs.SocketError == SocketError.OperationAborted ||
+         completedArgs.SocketError == SocketError.Interrupted);
+ }
+
+ /// <summary>
+ /// Cancels a pending ReceiveMessageFromAsync on a UDP socket, then verifies the same
+ /// socket still completes a subsequent datagram receive with the expected payload.
+ /// </summary>
+ private static async Task RunReceiveMessageFromCancelThenReceiveScenarioAsync()
+ {
+     using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+     receiver.SetSocketOption(SocketOptionLevel.IP, SocketOptionName.PacketInformation, true);
+     receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+     using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+     sender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+     // Start a receive that cannot complete (nothing was sent yet), then cancel it.
+     EndPoint anyEndPoint = new IPEndPoint(IPAddress.Any, 0);
+     using var cancellation = new CancellationTokenSource();
+     Task pendingReceive = ToTask(
+         receiver.ReceiveMessageFromAsync(new byte[64], SocketFlags.None, anyEndPoint, cancellation.Token));
+     await Task.Yield();
+     cancellation.Cancel();
+
+     Task winner = await Task.WhenAny(pendingReceive, Task.Delay(TimeSpan.FromSeconds(15)));
+     Assert.Same(pendingReceive, winner);
+     Exception? observed = await Record.ExceptionAsync(async () => await pendingReceive);
+     AssertCanceledOrInterrupted(observed);
+
+     // The socket must remain usable: send one datagram and receive it normally.
+     byte[] datagram = new byte[] { 0x10, 0x20, 0x30, 0x40 };
+     Assert.Equal(
+         datagram.Length,
+         await sender.SendToAsync(datagram, SocketFlags.None, receiver.LocalEndPoint!));
+
+     byte[] destination = new byte[64];
+     EndPoint senderEndPoint = new IPEndPoint(IPAddress.Any, 0);
+     SocketReceiveMessageFromResult result = await ToTask(
+         receiver.ReceiveMessageFromAsync(destination, SocketFlags.None, senderEndPoint, CancellationToken.None));
+
+     Assert.Equal(datagram.Length, result.ReceivedBytes);
+     Assert.True(datagram.AsSpan().SequenceEqual(destination.AsSpan(0, datagram.Length)));
+ }
+
+ /// <summary>
+ /// Repeats the ReceiveMessageFrom cancellation/dispose scenario under periodic full
+ /// GC pressure to surface lifetime bugs in tracked/pinned operation state.
+ /// </summary>
+ private static async Task RunReceiveMessageFromCancellationAndDisposeScenariosWithGcPressureAsync(int iterations)
+ {
+     for (int iteration = 0; iteration < iterations; iteration++)
+     {
+         await RunReceiveMessageFromCancellationAndDisposeScenariosAsync();
+
+         // Every fourth iteration: two full collections with finalization in between.
+         if (iteration % 4 == 0)
+         {
+             GC.Collect();
+             GC.WaitForPendingFinalizers();
+             GC.Collect();
+         }
+     }
+ }
+
+ /// <summary>
+ /// Repeatedly disposes both ends of a connection while a receive is pending, asserting
+ /// the tracked operation drains with a cancellation/disposal/interrupt outcome.
+ /// </summary>
+ private static async Task RunTeardownDrainTrackedOperationsScenarioAsync(int iterations)
+ {
+     using Socket acceptListener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+     acceptListener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+     acceptListener.Listen(8);
+     IPEndPoint localEndpoint = (IPEndPoint)acceptListener.LocalEndPoint!;
+
+     for (int iteration = 0; iteration < iterations; iteration++)
+     {
+         var connectionPair = await AcceptConnectedTcpPairAsync(acceptListener, localEndpoint);
+         using Socket clientSide = connectionPair.Client;
+         using Socket serverSide = connectionPair.Server;
+
+         // Park a receive, then tear down both sockets underneath it.
+         Task parkedReceive = ToTask(serverSide.ReceiveAsync(new byte[1], SocketFlags.None));
+         await Task.Yield();
+
+         clientSide.Dispose();
+         serverSide.Dispose();
+
+         Task winner = await Task.WhenAny(parkedReceive, Task.Delay(TimeSpan.FromSeconds(15)));
+         Assert.Same(parkedReceive, winner);
+         Exception? drainOutcome = await Record.ExceptionAsync(async () => await parkedReceive);
+         AssertCanceledDisposedOrInterrupted(drainOutcome);
+     }
+ }
+
+ /// <summary>
+ /// Cancels a pending receive and immediately disposes both sockets, once per
+ /// iteration, then checks native diagnostics to ensure the overlapping
+ /// cancel-then-teardown path does not issue duplicate async-cancel requests.
+ /// </summary>
+ private static async Task RunTeardownCancellationDuplicateGuardScenarioAsync(int iterations)
+ {
+ await WithIoUringNativeDiagnosticsSnapshotDeltaAsync(
+ async () =>
+ {
+ using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ listener.Listen(8);
+ IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+ for (int i = 0; i < iterations; i++)
+ {
+ var pair = await AcceptConnectedTcpPairAsync(listener, endpoint);
+ using Socket client = pair.Client;
+ using Socket server = pair.Server;
+
+ // Cancel the receive and then dispose both ends so cancellation and
+ // teardown race over the same tracked operation.
+ using var cts = new CancellationTokenSource();
+ Task pendingReceive = ToTask(server.ReceiveAsync(new byte[1], SocketFlags.None, cts.Token));
+ await Task.Yield();
+ cts.Cancel();
+
+ server.Dispose();
+ client.Dispose();
+
+ Task completed = await Task.WhenAny(pendingReceive, Task.Delay(TimeSpan.FromSeconds(15)));
+ Assert.Same(pendingReceive, completed);
+ Exception? receiveException = await Record.ExceptionAsync(async () => await pendingReceive);
+ AssertCanceledDisposedOrInterrupted(receiveException);
+ }
+ },
+ (diagnosticsBefore, diagnosticsAfter) =>
+ {
+ ulong asyncCancelRequestCqeDelta = CounterDelta(
+ diagnosticsBefore.AsyncCancelRequestCqeCount,
+ diagnosticsAfter.AsyncCancelRequestCqeCount);
+ // No cancel-request CQEs at all (e.g. kernel fallback path) is acceptable.
+ if (asyncCancelRequestCqeDelta == 0)
+ {
+ return;
+ }
+
+ ulong asyncCancelRequestCqeEnoentDelta = CounterDelta(
+ diagnosticsBefore.AsyncCancelRequestCqeEnoentCount,
+ diagnosticsAfter.AsyncCancelRequestCqeEnoentCount);
+ ulong asyncCancelRequestCqeEalreadyDelta = CounterDelta(
+ diagnosticsBefore.AsyncCancelRequestCqeEalreadyCount,
+ diagnosticsAfter.AsyncCancelRequestCqeEalreadyCount);
+ ulong asyncCancelRequestCqeOtherDelta = CounterDelta(
+ diagnosticsBefore.AsyncCancelRequestCqeOtherCount,
+ diagnosticsAfter.AsyncCancelRequestCqeOtherCount);
+
+ // Guardrail: one operation per iteration should not devolve into persistent multi-request cancellation churn.
+ ulong maxExpectedCancelRequestCqes = (ulong)(iterations + (iterations / 2) + 8);
+ Assert.True(
+ asyncCancelRequestCqeDelta <= maxExpectedCancelRequestCqes,
+ $"Unexpected async-cancel CQE inflation: delta={asyncCancelRequestCqeDelta}, max={maxExpectedCancelRequestCqes}, iterations={iterations}");
+ // Per-result buckets must never exceed the total cancel-request CQE count.
+ Assert.True(
+ asyncCancelRequestCqeEnoentDelta + asyncCancelRequestCqeEalreadyDelta + asyncCancelRequestCqeOtherDelta <= asyncCancelRequestCqeDelta,
+ $"Unexpected async-cancel accounting: enoent_delta={asyncCancelRequestCqeEnoentDelta}, ealready_delta={asyncCancelRequestCqeEalreadyDelta}, other_delta={asyncCancelRequestCqeOtherDelta}, total_delta={asyncCancelRequestCqeDelta}");
+ },
+ settleDelayMilliseconds: 200);
+ }
+
+ /// <summary>
+ /// Drives concurrent cancellation churn across many connections, then verifies normal
+ /// send/receive progress afterwards and bounds the async-cancel CQE counts observed
+ /// in native diagnostics.
+ /// </summary>
+ private static async Task RunCancellationSubmitContentionScenarioAsync(int connectionCount, int cancellationsPerConnection)
+ {
+     await WithIoUringNativeDiagnosticsSnapshotDeltaAsync(
+         async () =>
+         {
+             using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+             listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+             listener.Listen(connectionCount);
+             IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+             var clients = new List<Socket>(connectionCount);
+             var servers = new List<Socket>(connectionCount);
+             try
+             {
+                 for (int i = 0; i < connectionCount; i++)
+                 {
+                     var pair = await AcceptConnectedTcpPairAsync(listener, endpoint);
+                     clients.Add(pair.Client);
+                     servers.Add(pair.Server);
+                 }
+
+                 // One churn task per connection: issue a receive and cancel it immediately.
+                 Task[] churnTasks = new Task[connectionCount];
+                 for (int index = 0; index < connectionCount; index++)
+                 {
+                     Socket server = servers[index];
+                     churnTasks[index] = Task.Run(async () =>
+                     {
+                         byte[] receiveBuffer = new byte[1];
+                         for (int i = 0; i < cancellationsPerConnection; i++)
+                         {
+                             using var cts = new CancellationTokenSource();
+                             Task<int> pendingReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None, cts.Token));
+                             cts.Cancel();
+
+                             Exception? receiveException = await Record.ExceptionAsync(async () => await pendingReceive);
+                             AssertCanceledOrInterrupted(receiveException);
+                         }
+                     });
+                 }
+
+                 await Task.WhenAll(churnTasks);
+
+                 // Ensure the cancellation churn does not stall normal completion progress afterward.
+                 for (int i = 0; i < connectionCount; i++)
+                 {
+                     byte expected = unchecked((byte)(i + 1));
+                     byte[] receiveBuffer = new byte[1];
+                     Task<int> receiveTask = ToTask(servers[i].ReceiveAsync(receiveBuffer, SocketFlags.None));
+                     await Task.Yield();
+
+                     Assert.Equal(1, await clients[i].SendAsync(new byte[] { expected }, SocketFlags.None));
+                     Assert.Equal(1, await receiveTask);
+                     Assert.Equal(expected, receiveBuffer[0]);
+                 }
+             }
+             finally
+             {
+                 foreach (Socket server in servers)
+                 {
+                     server.Dispose();
+                 }
+
+                 foreach (Socket client in clients)
+                 {
+                     client.Dispose();
+                 }
+             }
+         },
+         (diagnosticsBefore, diagnosticsAfter) =>
+         {
+             ulong asyncCancelRequestCqeDelta = CounterDelta(
+                 diagnosticsBefore.AsyncCancelRequestCqeCount,
+                 diagnosticsAfter.AsyncCancelRequestCqeCount);
+             if (asyncCancelRequestCqeDelta == 0)
+             {
+                 // On kernels without async-cancel opcode support this path may fallback without cancel-request CQEs.
+                 return;
+             }
+
+             ulong asyncCancelRequestCqeEnoentDelta = CounterDelta(
+                 diagnosticsBefore.AsyncCancelRequestCqeEnoentCount,
+                 diagnosticsAfter.AsyncCancelRequestCqeEnoentCount);
+             ulong asyncCancelRequestCqeEalreadyDelta = CounterDelta(
+                 diagnosticsBefore.AsyncCancelRequestCqeEalreadyCount,
+                 diagnosticsAfter.AsyncCancelRequestCqeEalreadyCount);
+             ulong asyncCancelRequestCqeOtherDelta = CounterDelta(
+                 diagnosticsBefore.AsyncCancelRequestCqeOtherCount,
+                 diagnosticsAfter.AsyncCancelRequestCqeOtherCount);
+
+             // Allow up to two cancel-request CQEs per cancellation plus slack for races.
+             ulong maxExpectedCancelRequestCqes = (ulong)(connectionCount * cancellationsPerConnection * 2) + 64;
+             Assert.True(
+                 asyncCancelRequestCqeDelta <= maxExpectedCancelRequestCqes,
+                 $"Unexpected async-cancel request CQE inflation under contention: delta={asyncCancelRequestCqeDelta}, max={maxExpectedCancelRequestCqes}, connections={connectionCount}, cancels_per_connection={cancellationsPerConnection}");
+             Assert.True(
+                 asyncCancelRequestCqeEnoentDelta + asyncCancelRequestCqeEalreadyDelta + asyncCancelRequestCqeOtherDelta <= asyncCancelRequestCqeDelta,
+                 $"Unexpected async-cancel accounting under contention: enoent_delta={asyncCancelRequestCqeEnoentDelta}, ealready_delta={asyncCancelRequestCqeEalreadyDelta}, other_delta={asyncCancelRequestCqeOtherDelta}, total_delta={asyncCancelRequestCqeDelta}");
+         },
+         settleDelayMilliseconds: 200);
+ }
+
+ /// <summary>
+ /// Interleaves a data receive (completion mode) with a zero-byte receive (readiness
+ /// probe) on the same socket every iteration, verifying both complete correctly.
+ /// </summary>
+ private static async Task RunMixedModeReadinessCompletionStressScenarioAsync(int iterations)
+ {
+     var trio = await CreateConnectedTcpSocketTrioAsync();
+     using Socket _ = trio.Listener;
+     using Socket client = trio.Client;
+     using Socket server = trio.Server;
+
+     byte[] completionBuffer = new byte[1];
+     byte[] payload = new byte[1];
+
+     for (int i = 0; i < iterations; i++)
+     {
+         Task<int> completionReceive = ToTask(server.ReceiveAsync(completionBuffer, SocketFlags.None));
+         // A zero-byte receive acts as a readiness probe and completes with 0 bytes.
+         Task<int> readinessProbe = ToTask(server.ReceiveAsync(Memory<byte>.Empty, SocketFlags.None));
+         await Task.Yield();
+
+         payload[0] = unchecked((byte)(i + 1));
+         Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
+         Assert.Equal(1, await completionReceive);
+         Assert.Equal(payload[0], completionBuffer[0]);
+
+         Task completed = await Task.WhenAny(readinessProbe, Task.Delay(TimeSpan.FromSeconds(15)));
+         Assert.Same(readinessProbe, completed);
+         Assert.Equal(0, await readinessProbe);
+     }
+ }
+
+ /// <summary>
+ /// Queues a batch of completion-mode receives plus one readiness probe on the same
+ /// socket, sends the whole batch at once, and verifies ordered per-byte delivery.
+ /// </summary>
+ private static async Task RunSameSocketReadinessCompletionBacklogScenarioAsync(int iterations, int completionBatchSize)
+ {
+     var trio = await CreateConnectedTcpSocketTrioAsync();
+     using Socket _ = trio.Listener;
+     using Socket client = trio.Client;
+     using Socket server = trio.Server;
+
+     byte[] sendPayload = new byte[completionBatchSize];
+     for (int iteration = 0; iteration < iterations; iteration++)
+     {
+         // Queue a backlog of single-byte completion receives before sending anything.
+         var receiveBuffers = new byte[completionBatchSize][];
+         var completionReceives = new Task<int>[completionBatchSize];
+         for (int i = 0; i < completionBatchSize; i++)
+         {
+             byte expected = unchecked((byte)((iteration + i + 1) & 0xFF));
+             sendPayload[i] = expected;
+             byte[] receiveBuffer = new byte[1];
+             receiveBuffers[i] = receiveBuffer;
+             completionReceives[i] = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+         }
+
+         // Zero-byte receive: readiness probe queued behind the completion backlog.
+         Task<int> readinessProbe = ToTask(server.ReceiveAsync(Memory<byte>.Empty, SocketFlags.None));
+         await Task.Yield();
+
+         int sent = 0;
+         while (sent < sendPayload.Length)
+         {
+             sent += await client.SendAsync(sendPayload.AsMemory(sent), SocketFlags.None);
+         }
+
+         Assert.Equal(sendPayload.Length, sent);
+
+         Task readinessCompleted = await Task.WhenAny(readinessProbe, Task.Delay(TimeSpan.FromSeconds(15)));
+         Assert.Same(readinessProbe, readinessCompleted);
+         Assert.Equal(0, await readinessProbe);
+
+         // Each queued receive must observe exactly one byte, in send order.
+         int[] receivedCounts = await Task.WhenAll(completionReceives);
+         for (int i = 0; i < completionBatchSize; i++)
+         {
+             Assert.Equal(1, receivedCounts[i]);
+             Assert.Equal(sendPayload[i], receiveBuffers[i][0]);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Smoke-tests the pure completion-mode paths end to end: TCP receive, zero-byte
+ /// readiness probe followed by a tail receive, connect/accept, and UDP ReceiveFrom.
+ /// </summary>
+ private static async Task RunPureCompletionScenarioAsync()
+ {
+     var trio = await CreateConnectedTcpSocketTrioAsync();
+     using Socket _ = trio.Listener;
+     using Socket client = trio.Client;
+     using Socket server = trio.Server;
+
+     byte[] tcpSendPayload = new byte[] { 0x11 };
+     byte[] tcpReceiveBuffer = new byte[1];
+
+     Task<int> tcpReceive = ToTask(server.ReceiveAsync(tcpReceiveBuffer, SocketFlags.None));
+     await Task.Yield();
+     Assert.Equal(1, await client.SendAsync(tcpSendPayload, SocketFlags.None));
+     Assert.Equal(1, await AwaitWithTimeoutAsync(tcpReceive, nameof(tcpReceive)));
+     Assert.Equal(tcpSendPayload[0], tcpReceiveBuffer[0]);
+
+     // Zero-byte receive completes with 0 once data becomes available, without consuming it.
+     Task<int> tcpZeroByteReceive = ToTask(server.ReceiveAsync(Memory<byte>.Empty, SocketFlags.None));
+     await Task.Yield();
+
+     byte[] tcpPayloadAfterProbe = new byte[] { 0x22 };
+     Assert.Equal(1, await client.SendAsync(tcpPayloadAfterProbe, SocketFlags.None));
+     Task completed = await Task.WhenAny(tcpZeroByteReceive, Task.Delay(TimeSpan.FromSeconds(15)));
+     Assert.Same(tcpZeroByteReceive, completed);
+     Assert.Equal(0, await tcpZeroByteReceive);
+
+     // The probed byte must still be readable after the zero-byte completion.
+     byte[] tcpDataAfterZeroByte = new byte[1];
+     Task<int> tcpTailReceive = ToTask(server.ReceiveAsync(tcpDataAfterZeroByte, SocketFlags.None));
+     await Task.Yield();
+     Assert.Equal(1, await AwaitWithTimeoutAsync(tcpTailReceive, nameof(tcpTailReceive)));
+     Assert.Equal(tcpPayloadAfterProbe[0], tcpDataAfterZeroByte[0]);
+
+     using Socket connectListener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+     connectListener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+     connectListener.Listen(1);
+     IPEndPoint connectEndPoint = (IPEndPoint)connectListener.LocalEndPoint!;
+
+     using Socket connectClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+     Task<Socket> acceptTask = connectListener.AcceptAsync();
+     await connectClient.ConnectAsync(connectEndPoint);
+     using Socket connectServer = await AwaitWithTimeoutAsync(acceptTask, nameof(acceptTask));
+
+     byte[] connectPayload = new byte[] { 0x33 };
+     Assert.Equal(1, await connectClient.SendAsync(connectPayload, SocketFlags.None));
+     byte[] connectReceiveBuffer = new byte[1];
+     Assert.Equal(1, await connectServer.ReceiveAsync(connectReceiveBuffer, SocketFlags.None));
+     Assert.Equal(connectPayload[0], connectReceiveBuffer[0]);
+
+     using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+     receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+     using Socket udpSender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+     udpSender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+     byte[] udpPayload = new byte[] { 0x33, 0x44, 0x55 };
+     byte[] udpReceiveBuffer = new byte[udpPayload.Length];
+
+     Task<SocketReceiveFromResult> receiveFromTask =
+         ToTask(receiver.ReceiveFromAsync(udpReceiveBuffer, SocketFlags.None, new IPEndPoint(IPAddress.Any, 0)));
+     await Task.Yield();
+     Assert.Equal(udpPayload.Length, await udpSender.SendToAsync(udpPayload, SocketFlags.None, receiver.LocalEndPoint!));
+
+     SocketReceiveFromResult receiveFromResult = await receiveFromTask;
+     Assert.Equal(udpPayload.Length, receiveFromResult.ReceivedBytes);
+     Assert.Equal(udpPayload, udpReceiveBuffer);
+     Assert.Equal(udpSender.LocalEndPoint, receiveFromResult.RemoteEndPoint);
+ }
+
+ /// <summary>
+ /// Queues one pending receive per connection and completes them all in one burst, so
+ /// the event buffer fills; asserts the native wait-buffer-full counter increased.
+ /// </summary>
+ private static async Task RunBoundedWaitBufferPressureScenarioAsync(int connectionCount)
+ {
+     await WithIoUringNativeDiagnosticsSnapshotDeltaAsync(
+         async () =>
+         {
+             using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+             listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+             listener.Listen(connectionCount);
+             IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+             var clients = new List<Socket>(connectionCount);
+             var servers = new List<Socket>(connectionCount);
+             var receiveBuffers = new List<byte[]>(connectionCount);
+             var receiveTasks = new List<Task<int>>(connectionCount);
+             var sendTasks = new List<Task<int>>(connectionCount);
+
+             try
+             {
+                 // Park one receive per connection before any sends happen.
+                 for (int i = 0; i < connectionCount; i++)
+                 {
+                     var pair = await AcceptConnectedTcpPairAsync(listener, endpoint);
+                     clients.Add(pair.Client);
+                     servers.Add(pair.Server);
+
+                     byte[] receiveBuffer = new byte[1];
+                     receiveBuffers.Add(receiveBuffer);
+                     receiveTasks.Add(ToTask(pair.Server.ReceiveAsync(receiveBuffer, SocketFlags.None)));
+                 }
+
+                 await Task.Yield();
+
+                 // Fire all sends at once so many completions land in a single wait.
+                 for (int i = 0; i < clients.Count; i++)
+                 {
+                     byte payload = unchecked((byte)(i + 1));
+                     sendTasks.Add(ToTask(clients[i].SendAsync(new byte[] { payload }, SocketFlags.None)));
+                 }
+
+                 int[] sentBytes = await Task.WhenAll(sendTasks);
+                 int[] receivedBytes = await Task.WhenAll(receiveTasks);
+
+                 for (int i = 0; i < connectionCount; i++)
+                 {
+                     Assert.Equal(1, sentBytes[i]);
+                     Assert.Equal(1, receivedBytes[i]);
+                     Assert.Equal(unchecked((byte)(i + 1)), receiveBuffers[i][0]);
+                 }
+             }
+             finally
+             {
+                 foreach (Socket server in servers)
+                 {
+                     server.Dispose();
+                 }
+
+                 foreach (Socket client in clients)
+                 {
+                     client.Dispose();
+                 }
+             }
+         },
+         (diagnosticsBefore, diagnosticsAfter) =>
+         {
+             ulong socketEventBufferFullDelta = CounterDelta(
+                 diagnosticsBefore.SocketEventBufferFullCount,
+                 diagnosticsAfter.SocketEventBufferFullCount);
+
+             Assert.True(
+                 socketEventBufferFullDelta != 0,
+                 $"Expected io_uring wait-buffer pressure counter to increase. socket_delta={socketEventBufferFullDelta}");
+         },
+         skipScenarioWhenIoUringUnavailable: true);
+ }
+
+ /// <summary>
+ /// Tries (up to four rounds) to overflow the io_uring prepare queue with bursts of
+ /// receives; when an overflow is observed, asserts the fallback counter advanced.
+ /// Overflow is scheduler-dependent, so absence of overflow only checks monotonicity.
+ /// </summary>
+ private static async Task RunPrepareQueueOverflowFallbackScenarioAsync(int connectionCount)
+ {
+     ulong overflowBefore = GetIoUringTelemetryCounterValue("_ioUringPrepareQueueOverflows");
+     ulong fallbackBefore = GetIoUringTelemetryCounterValue("_ioUringPrepareQueueOverflowFallbacks");
+     bool observedOverflow = false;
+
+     for (int round = 0; round < 4; round++)
+     {
+         using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+         listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+         listener.Listen(connectionCount);
+         IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+         var clients = new List<Socket>(connectionCount);
+         var servers = new List<Socket>(connectionCount);
+         var receiveTasks = new List<Task<int>>(connectionCount);
+         try
+         {
+             // Queue a burst of receives to pressure the prepare queue.
+             for (int i = 0; i < connectionCount; i++)
+             {
+                 var pair = await AcceptConnectedTcpPairAsync(listener, endpoint);
+                 clients.Add(pair.Client);
+                 servers.Add(pair.Server);
+
+                 receiveTasks.Add(ToTask(pair.Server.ReceiveAsync(new byte[1], SocketFlags.None)));
+             }
+
+             await Task.Yield();
+
+             for (int i = 0; i < connectionCount; i++)
+             {
+                 Assert.Equal(1, await clients[i].SendAsync(new byte[] { 0x5A }, SocketFlags.None));
+             }
+
+             // Regardless of overflow, every receive must still make progress.
+             for (int i = 0; i < receiveTasks.Count; i++)
+             {
+                 Assert.Equal(1, await AwaitWithTimeoutAsync(receiveTasks[i], $"overflow_receive_{round}_{i}"));
+             }
+         }
+         finally
+         {
+             foreach (Socket server in servers)
+             {
+                 server.Dispose();
+             }
+
+             foreach (Socket client in clients)
+             {
+                 client.Dispose();
+             }
+         }
+
+         ulong overflowAfterRound = GetIoUringTelemetryCounterValue("_ioUringPrepareQueueOverflows");
+         ulong fallbackAfterRound = GetIoUringTelemetryCounterValue("_ioUringPrepareQueueOverflowFallbacks");
+         if (overflowAfterRound > overflowBefore)
+         {
+             observedOverflow = true;
+             Assert.True(
+                 fallbackAfterRound > fallbackBefore,
+                 $"Expected prepare queue overflow fallback counter to increase once overflow is observed. before={fallbackBefore}, after={fallbackAfterRound}");
+             return;
+         }
+     }
+
+     if (!observedOverflow)
+     {
+         // With very fast event-loop draining, queue overflow can be scheduler-dependent even at capacity=1.
+         // The scenario still validates that completion-mode operations make progress without hangs.
+         ulong fallbackAfter = GetIoUringTelemetryCounterValue("_ioUringPrepareQueueOverflowFallbacks");
+         Assert.True(
+             fallbackAfter >= fallbackBefore,
+             $"Prepare queue overflow fallback counter should be nondecreasing. before={fallbackBefore}, after={fallbackAfter}");
+     }
+ }
+
/// <summary>
/// Issues <paramref name="connectionCount"/> simultaneous connects against one listener to
/// pressure the io_uring connect prepare queue, asserting that every connect and accept
/// completes within the timeout (i.e. the overflow fallback path does not drop operations).
/// </summary>
private static async Task RunConnectQueueOverflowFallbackScenarioAsync(int connectionCount)
{
    using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
    listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
    listener.Listen(connectionCount);
    IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;

    var clients = new List<Socket>(connectionCount);
    var connectTasks = new List<Task>(connectionCount);
    var acceptTasks = new List<Task<Socket>>(connectionCount);
    var acceptedSockets = new List<Socket>(connectionCount);

    try
    {
        // Fire all accepts and connects back-to-back so they are in flight at once.
        for (int i = 0; i < connectionCount; i++)
        {
            Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
            clients.Add(client);
            acceptTasks.Add(listener.AcceptAsync());
            connectTasks.Add(client.ConnectAsync(endpoint));
        }

        Task connectAll = Task.WhenAll(connectTasks);
        Task connectCompleted = await Task.WhenAny(connectAll, Task.Delay(TimeSpan.FromSeconds(15)));
        Assert.Same(connectAll, connectCompleted);
        await connectAll; // propagate any individual connect failure

        foreach (Task<Socket> acceptTask in acceptTasks)
        {
            acceptedSockets.Add(await AwaitWithTimeoutAsync(acceptTask, nameof(RunConnectQueueOverflowFallbackScenarioAsync)));
        }
    }
    finally
    {
        foreach (Socket acceptedSocket in acceptedSockets)
        {
            acceptedSocket.Dispose();
        }

        foreach (Socket client in clients)
        {
            client.Dispose();
        }
    }
}
+
/// <summary>
/// Races a pending completion-mode receive against token cancellation for
/// <paramref name="iterations"/> rounds, alternating which side gets a head start.
/// Each round must end in exactly one of two valid outcomes (data delivered, or a
/// canceled/interrupted error), and across the whole run both outcomes must occur.
/// </summary>
private static async Task RunCompletionCancellationRaceAsync(int iterations)
{
    var trio = await CreateConnectedTcpSocketTrioAsync();
    using Socket _ = trio.Listener;
    using Socket client = trio.Client;
    using Socket server = trio.Server;

    byte[] receiveBuffer = new byte[1];
    int completedCount = 0;
    int canceledCount = 0;
    for (int i = 0; i < iterations; i++)
    {
        // Start each round from an empty receive queue so the new receive truly pends.
        await DrainAvailableBytesAsync(server);

        using var cts = new CancellationTokenSource();
        Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None, cts.Token));
        Task<int> sendTask;

        if ((i & 1) == 0)
        {
            // Cancel first, then send: cancellation is likely (not guaranteed) to win.
            cts.Cancel();
            sendTask = ToTask(client.SendAsync(new byte[] { unchecked((byte)(i + 1)) }, SocketFlags.None));
        }
        else
        {
            // Send first, yield, then cancel: data delivery is likely (not guaranteed) to win.
            sendTask = ToTask(client.SendAsync(new byte[] { unchecked((byte)(i + 1)) }, SocketFlags.None));
            await Task.Yield();
            cts.Cancel();
        }

        Exception? receiveException = await Record.ExceptionAsync(async () => await receiveTask);
        if (receiveException is null)
        {
            // Data won the race: exactly one byte must have been delivered.
            completedCount++;
            Assert.Equal(1, receiveTask.Result);
        }
        else
        {
            // Cancellation won: only canceled/interrupted-shaped errors are acceptable.
            canceledCount++;
            AssertCanceledOrInterrupted(receiveException);
        }

        // The send side must succeed either way; any unread byte is drained next round.
        Assert.Equal(1, await sendTask);
    }

    // If either outcome never occurred, the race was not actually exercised.
    Assert.True(completedCount > 0);
    Assert.True(canceledCount > 0);
}
+
/// <summary>
/// Reads and discards whatever bytes are already buffered on <paramref name="socket"/>,
/// in chunks of at most 256 bytes, stopping as soon as nothing remains (or a read
/// returns no data).
/// </summary>
private static async Task DrainAvailableBytesAsync(Socket socket)
{
    for (int pending = socket.Available; pending > 0; pending = socket.Available)
    {
        byte[] scratch = new byte[Math.Min(pending, 256)];
        if (await ToTask(socket.ReceiveAsync(scratch, SocketFlags.None)) <= 0)
        {
            return;
        }
    }
}
+
/// <summary>
/// Scenario for the forced-EAGAIN receive path (per the caller's forceEcanceled flag —
/// the forcing mechanism itself is configured outside this method; TODO confirm).
/// A pending receive must eventually complete despite retries, and the socket must
/// remain usable for a follow-up receive afterwards.
/// </summary>
private static async Task RunForcedEagainReceiveScenarioAsync()
{
    var trio = await CreateConnectedTcpSocketTrioAsync();
    using Socket _ = trio.Listener;
    using Socket client = trio.Client;
    using Socket server = trio.Server;

    byte[] firstReceiveBuffer = new byte[1];
    Task<int> receiveTask = ToTask(server.ReceiveAsync(firstReceiveBuffer, SocketFlags.None));
    await Task.Yield();

    // Nudge the pending receive with up to six spaced-out sends; stop early once it completes.
    byte sendByte = 0x31;
    for (int i = 0; i < 6 && !receiveTask.IsCompleted; i++)
    {
        Assert.Equal(1, await client.SendAsync(new byte[] { sendByte }, SocketFlags.None));
        sendByte++;
        await Task.Delay(10);
    }

    Task completed = await Task.WhenAny(receiveTask, Task.Delay(TimeSpan.FromSeconds(15)));
    Assert.Same(receiveTask, completed);
    Assert.True(await receiveTask > 0);
    await DrainAvailableBytesAsync(server);

    // The socket must still complete a fresh receive after the forced-retry episode.
    byte[] secondReceiveBuffer = new byte[1];
    Task<int> followUpReceiveTask = ToTask(server.ReceiveAsync(secondReceiveBuffer, SocketFlags.None));
    await Task.Yield();
    Assert.Equal(1, await client.SendAsync(new byte[] { 0x40 }, SocketFlags.None));
    Task followUpCompleted = await Task.WhenAny(followUpReceiveTask, Task.Delay(TimeSpan.FromSeconds(15)));
    Assert.Same(followUpReceiveTask, followUpCompleted);
    Assert.True(await followUpReceiveTask > 0);
}
+
/// <summary>
/// Scenario for the forced-ECANCELED receive path (forcing is configured outside this
/// method — TODO confirm). The first receive may either deliver data or surface a
/// canceled/interrupted error; either way the socket must stay usable for a follow-up.
/// </summary>
private static async Task RunForcedEcanceledReceiveScenarioAsync()
{
    var trio = await CreateConnectedTcpSocketTrioAsync();
    using Socket _ = trio.Listener;
    using Socket client = trio.Client;
    using Socket server = trio.Server;

    byte[] receiveBuffer = new byte[1];
    Task<int> forcedReceiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
    await Task.Yield();
    Assert.Equal(1, await client.SendAsync(new byte[] { 0x44 }, SocketFlags.None));

    Task completed = await Task.WhenAny(forcedReceiveTask, Task.Delay(TimeSpan.FromSeconds(15)));
    Assert.Same(forcedReceiveTask, completed);
    Exception? forcedReceiveException = await Record.ExceptionAsync(async () => await forcedReceiveTask);
    if (forcedReceiveException is null)
    {
        // The receive beat the forced cancellation and delivered data.
        Assert.True(forcedReceiveTask.Result > 0);
    }
    else
    {
        // Forced cancellation surfaced; only canceled/interrupted-shaped errors are valid.
        AssertCanceledOrInterrupted(forcedReceiveException);
    }
    await DrainAvailableBytesAsync(server);

    // The socket must still complete a fresh receive after the forced cancellation.
    byte[] followUpReceiveBuffer = new byte[1];
    Task<int> followUpReceiveTask = ToTask(server.ReceiveAsync(followUpReceiveBuffer, SocketFlags.None));
    await Task.Yield();
    Assert.Equal(1, await client.SendAsync(new byte[] { 0x45 }, SocketFlags.None));
    Task followUpCompleted = await Task.WhenAny(followUpReceiveTask, Task.Delay(TimeSpan.FromSeconds(15)));
    Assert.Same(followUpReceiveTask, followUpCompleted);
    Assert.True(await followUpReceiveTask > 0);
}
+
// Dispatches to the ECANCELED- or EAGAIN-forcing receive scenario based on the flag.
private static Task RunForcedReceiveScenarioAsync(bool forceEcanceled)
{
    if (forceEcanceled)
    {
        return RunForcedEcanceledReceiveScenarioAsync();
    }

    return RunForcedEagainReceiveScenarioAsync();
}
+
[ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
[PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
public static async Task IoUringOptIn_DoesNotBreakAsyncSocketWorkflows()
{
    // 64-connection TCP round-trip in a child process with io_uring opted in.
    var invocation = RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(64), CreateSocketEngineOptions());
    await invocation.DisposeAsync();
}
+
[ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
[PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
public static async Task SocketEngine_DefaultOptOut_DoesNotBreakAsyncSocketWorkflows()
{
    // Same TCP round-trip workload, but with the io_uring env knob left unset (default engine).
    var invocation = RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(32), CreateSocketEngineOptions(ioUringValue: null));
    await invocation.DisposeAsync();
}
+
[ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
[PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
public static async Task SocketEngine_KillSwitchZero_DoesNotBreakAsyncSocketWorkflows()
{
    // Same workload with io_uring explicitly disabled via the "0" kill switch value.
    var invocation = RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(32), CreateSocketEngineOptions(ioUringValue: "0"));
    await invocation.DisposeAsync();
}
+
[ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
[PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
public static async Task IoUringConfig_AppContextSwitches_HonoredWhenEnvUnset()
{
    await RemoteExecutor.Invoke(
        static () =>
        {
            // With the env var unset (ioUringValue: null), the Enable AppContext switch
            // is expected to drive IsIoUringEnabled directly (true -> true, false -> false).
            AssertBooleanAppContextSwitch(
                switchName: "System.Net.Sockets.IoUring.Enable",
                methodName: "IsIoUringEnabled",
                expectedWhenSwitchTrue: true,
                expectedWhenSwitchFalse: false);
            // NOTE(review): both assertions below expect false regardless of the switch value,
            // so the EnableSqPoll AppContext switch alone appears insufficient to request SQPOLL
            // when the env opt-in is absent — confirm against the engine's config-reading code.
            AppContext.SetSwitch("System.Net.Sockets.IoUring.EnableSqPoll", true);
            Assert.False(InvokeSocketAsyncEngineBoolMethod("IsSqPollRequested"));
            AppContext.SetSwitch("System.Net.Sockets.IoUring.EnableSqPoll", false);
            Assert.False(InvokeSocketAsyncEngineBoolMethod("IsSqPollRequested"));
        },
        CreateSocketEngineOptions(ioUringValue: null)).DisposeAsync();
}
+
[ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
[PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
public static async Task IoUringConfig_EnvironmentOverridesAppContext()
{
    await RemoteExecutor.Invoke(
        static () =>
        {
            // The child process has the io_uring env knob set to "0" (see options below):
            // the AppContext Enable=true switch must lose to the environment kill switch.
            AppContext.SetSwitch("System.Net.Sockets.IoUring.Enable", true);
            Assert.False(InvokeSocketAsyncEngineBoolMethod("IsIoUringEnabled"));

            // With the sqPollEnabled env opt-in set, the EnableSqPoll AppContext switch
            // does toggle the reported value both ways — contrast with the env-unset test,
            // where it stays false. (Exact precedence lives in the engine config code.)
            AppContext.SetSwitch("System.Net.Sockets.IoUring.EnableSqPoll", true);
            Assert.True(InvokeSocketAsyncEngineBoolMethod("IsSqPollRequested"));
            AppContext.SetSwitch("System.Net.Sockets.IoUring.EnableSqPoll", false);
            Assert.False(InvokeSocketAsyncEngineBoolMethod("IsSqPollRequested"));
        },
        CreateSocketEngineOptions(
            ioUringValue: "0",
            sqPollEnabled: true)).DisposeAsync();
}
+
[ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
[PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
public static async Task IoUringConfig_RemovedProductionKnobs_DefaultEnabled()
{
    await RemoteExecutor.Invoke(
        static () =>
        {
            // With no io_uring env configuration, the formerly-configurable knobs
            // (direct SQE, zero-copy send, buffer registration) must report their
            // shipped defaults: all effectively enabled.
            Assert.False(InvokeSocketAsyncEngineBoolMethod("IsIoUringDirectSqeDisabled"));
            Assert.True(InvokeSocketAsyncEngineBoolMethod("IsZeroCopySendOptedIn"));
            Assert.True(InvokeSocketAsyncEngineBoolMethod("IsIoUringRegisterBuffersEnabled"));
        },
        CreateSocketEngineOptions(ioUringValue: null)).DisposeAsync();
}
+
[ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
[PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
public static async Task IoUringOptIn_UdpSendReceive_Works()
{
    await RemoteExecutor.Invoke(static async () =>
    {
        // Two loopback UDP sockets; the sender is connected so it can use Send/Receive directly.
        using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
        receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
        IPEndPoint receiverEndpoint = (IPEndPoint)receiver.LocalEndPoint!;

        using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
        sender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
        IPEndPoint senderEndpoint = (IPEndPoint)sender.LocalEndPoint!;
        sender.Connect(receiverEndpoint);

        byte[] outgoing = new byte[] { 7 };
        byte[] incoming = new byte[1];

        // 64 ping/pong datagrams, bumping the payload byte each round (wraps via unchecked).
        for (int round = 0; round < 64; round++)
        {
            Assert.Equal(1, await sender.SendAsync(outgoing, SocketFlags.None));

            EndPoint anyEndpoint = new IPEndPoint(IPAddress.Any, 0);
            SocketReceiveFromResult result = await receiver.ReceiveFromAsync(incoming, SocketFlags.None, anyEndpoint);
            Assert.Equal(1, result.ReceivedBytes);
            Assert.Equal(outgoing[0], incoming[0]);
            Assert.Equal(senderEndpoint, result.RemoteEndPoint);

            // Echo the datagram back to whichever endpoint it came from.
            Assert.Equal(1, await receiver.SendToAsync(outgoing, SocketFlags.None, result.RemoteEndPoint));

            Assert.Equal(1, await sender.ReceiveAsync(incoming, SocketFlags.None));
            Assert.Equal(outgoing[0], incoming[0]);

            outgoing[0] = unchecked((byte)(outgoing[0] + 1));
        }
    }, CreateSocketEngineOptions()).DisposeAsync();
}
+
[ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
[PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
public static async Task IoUringOptIn_MultipleConcurrentConnections_Work()
{
    // 32 concurrent TCP connections each performing a bidirectional one-byte round trip,
    // all in flight simultaneously, with the io_uring engine opted in.
    await RemoteExecutor.Invoke(static async () =>
    {
        const int ConnectionCount = 32;

        using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
        listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
        listener.Listen(ConnectionCount);

        // Queue all accepts up front so every connect below can be satisfied concurrently.
        var acceptTasks = new Task<Socket>[ConnectionCount];
        var clients = new Socket[ConnectionCount];

        for (int i = 0; i < ConnectionCount; i++)
        {
            acceptTasks[i] = listener.AcceptAsync();
        }

        var connectTasks = new Task[ConnectionCount];
        IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
        for (int i = 0; i < ConnectionCount; i++)
        {
            clients[i] = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
            connectTasks[i] = clients[i].ConnectAsync(endpoint);
        }

        await Task.WhenAll(connectTasks);
        Socket[] servers = await Task.WhenAll(acceptTasks);

        // One distinct payload byte per connection lets each round trip verify its own data.
        var roundTripTasks = new List<Task>(ConnectionCount);
        for (int i = 0; i < ConnectionCount; i++)
        {
            Socket client = clients[i];
            Socket server = servers[i];
            byte value = (byte)(i + 1);
            roundTripTasks.Add(Task.Run(async () =>
            {
                byte[] tx = new byte[] { value };
                byte[] rx = new byte[1];

                int sent = await client.SendAsync(tx, SocketFlags.None);
                Assert.Equal(1, sent);

                int received = await server.ReceiveAsync(rx, SocketFlags.None);
                Assert.Equal(1, received);
                Assert.Equal(value, rx[0]);

                sent = await server.SendAsync(tx, SocketFlags.None);
                Assert.Equal(1, sent);

                received = await client.ReceiveAsync(rx, SocketFlags.None);
                Assert.Equal(1, received);
                Assert.Equal(value, rx[0]);
            }));
        }

        await Task.WhenAll(roundTripTasks);

        for (int i = 0; i < ConnectionCount; i++)
        {
            servers[i].Dispose();
            clients[i].Dispose();
        }
    }, CreateSocketEngineOptions()).DisposeAsync();
}
+
[ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
[PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
public static async Task IoUringOptIn_DisconnectReconnectAndCancellation_Work()
{
    await RemoteExecutor.Invoke(static async () =>
    {
        using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
        listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
        listener.Listen(2);
        IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;

        // First connection lifecycle — block scope ensures disposal before reconnect.
        {
            var firstPair = await AcceptConnectedTcpPairAsync(listener, endpoint);
            using Socket firstClient = firstPair.Client;
            using Socket firstServer = firstPair.Server;
        }

        // Reconnect and validate cancellation + subsequent data flow.
        var secondPair = await AcceptConnectedTcpPairAsync(listener, endpoint);
        using Socket secondClient = secondPair.Client;
        using Socket secondServer = secondPair.Server;

        byte[] receiveBuffer = new byte[1];
        using (var cts = new CancellationTokenSource())
        {
            // Cancel a pending receive; no data has been sent yet, so the operation
            // is expected to fail rather than complete with bytes.
            var pendingReceive = secondServer.ReceiveAsync(receiveBuffer.AsMemory(), SocketFlags.None, cts.Token);
            cts.Cancel();

            Exception? ex = await Record.ExceptionAsync(async () => await pendingReceive);
            Assert.NotNull(ex);
            // Depending on where the cancellation lands, the engine may surface an
            // OperationCanceledException or a SocketException with OperationAborted /
            // Interrupted; all three shapes are accepted here.
            Assert.True(
                ex is OperationCanceledException ||
                ex is SocketException socketException &&
                (socketException.SocketErrorCode == SocketError.OperationAborted || socketException.SocketErrorCode == SocketError.Interrupted),
                $"Unexpected exception: {ex}");
        }

        // The canceled receive must not poison the socket: a normal round trip still works.
        byte[] sendBuffer = new byte[] { 42 };
        int sent = await secondClient.SendAsync(sendBuffer, SocketFlags.None);
        Assert.Equal(1, sent);

        int received = await secondServer.ReceiveAsync(receiveBuffer, SocketFlags.None);
        Assert.Equal(1, received);
        Assert.Equal(sendBuffer[0], receiveBuffer[0]);
    }, CreateSocketEngineOptions()).DisposeAsync();
}
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_QueuedZeroByteReceive_DoesNotStall()
+ {
+ await RemoteExecutor.Invoke(static async () =>
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] firstReceiveBuffer = new byte[1];
+ Task firstReceive = ToTask(server.ReceiveAsync(firstReceiveBuffer, SocketFlags.None));
+ await Task.Yield();
+
+ Task